Commit 74f5187ac873042f502227701ed1727e7c5fbfa9

Authored by Peter Zijlstra
Committed by Ingo Molnar
1 parent 09a40af524

sched: Cure load average vs NO_HZ woes

Chase reported that due to us decrementing calc_load_task prematurely
(before the next LOAD_FREQ sample), the load average could be skewed
by as much as the number of CPUs in the machine.

This patch, based on Chase's patch, cures the problem by keeping the
delta of the CPU going into NO_HZ idle separately and folding that in
on the next LOAD_FREQ update.

This restores the balance and we get strict LOAD_FREQ period samples.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Chase Douglas <chase.douglas@canonical.com>
LKML-Reference: <1271934490.1776.343.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
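
To make the description above concrete, here is a minimal userspace sketch of the deferred-folding idea, not the patch itself: the names calc_load_tasks, calc_load_tasks_idle, calc_load_fold_active(), calc_load_account_idle() and calc_load_fold_idle() mirror the kernel's, while struct rq_sketch, calc_global_load_tick() and the use of plain longs instead of atomic_long_t are illustrative simplifications.

#include <stdio.h>

static long calc_load_tasks;		/* global sum of nr_running + nr_uninterruptible */
static long calc_load_tasks_idle;	/* deltas parked by CPUs that went NO_HZ idle */

struct rq_sketch {
	long nr_running;
	long nr_uninterruptible;
	long calc_load_active;		/* this CPU's contribution at the last fold */
};

/* Delta of this CPU's contribution since its last fold. */
static long calc_load_fold_active(struct rq_sketch *rq)
{
	long nr_active = rq->nr_running + rq->nr_uninterruptible;
	long delta = nr_active - rq->calc_load_active;

	rq->calc_load_active = nr_active;
	return delta;
}

/* CPU enters NO_HZ idle: park the delta instead of applying it right away. */
static void calc_load_account_idle(struct rq_sketch *rq)
{
	calc_load_tasks_idle += calc_load_fold_active(rq);
}

/* LOAD_FREQ tick: pick up everything parked by idle CPUs, exactly once. */
static long calc_load_fold_idle(void)
{
	long delta = calc_load_tasks_idle;

	calc_load_tasks_idle = 0;
	return delta;
}

/* LOAD_FREQ-period update performed by a busy CPU. */
static void calc_global_load_tick(struct rq_sketch *rq)
{
	calc_load_tasks += calc_load_fold_active(rq) + calc_load_fold_idle();
}

int main(void)
{
	struct rq_sketch cpu0 = { .nr_running = 1 }, cpu1 = { .nr_running = 1 };

	/* First LOAD_FREQ sample: both CPUs fold their contribution. */
	calc_global_load_tick(&cpu0);
	calc_load_tasks += calc_load_fold_active(&cpu1);

	/* cpu1 goes NO_HZ idle between samples: its -1 delta is parked. */
	cpu1.nr_running = 0;
	calc_load_account_idle(&cpu1);

	/* Next sample folds the parked delta in on the LOAD_FREQ boundary. */
	calc_global_load_tick(&cpu0);
	printf("calc_load_tasks = %ld\n", calc_load_tasks);	/* 1, not skewed low */
	return 0;
}

The decrement for the CPU that went idle is thus applied on the LOAD_FREQ boundary rather than at the moment the tick is stopped, which is what restores the strict LOAD_FREQ period samples mentioned above.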

Showing 2 changed files with 68 additions and 15 deletions

/*
 * kernel/sched.c
 *
 * Kernel scheduler and related syscalls
 *
 * Copyright (C) 1991-2002 Linus Torvalds
 *
 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
 *            make semaphores SMP safe
 * 1998-11-19 Implemented schedule_timeout() and related stuff
 *            by Andrea Arcangeli
 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
 *            hybrid priority-list and round-robin design with
 *            an array-switch method of distributing timeslices
 *            and per-CPU runqueues. Cleanups and useful suggestions
 *            by Davide Libenzi, preemptible kernel bits by Robert Love.
 * 2003-09-03 Interactivity tuning by Con Kolivas.
 * 2004-04-02 Scheduler domains code by Nick Piggin
 * 2007-04-15 Work begun on replacing all interactivity tuning with a
 *            fair scheduling design by Con Kolivas.
 * 2007-05-05 Load balancing (smp-nice) and other improvements
 *            by Peter Williams
 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
 *            Thomas Gleixner, Mike Kravetz
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>

#include <asm/tlb.h>
#include <asm/irq_regs.h>

#include "sched_cpupri.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))

/*
 * Helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

#define NICE_0_LOAD SCHED_LOAD_SCALE
#define NICE_0_SHIFT SCHED_LOAD_SHIFT

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
 * Timeslices get refilled after they expire.
 */
#define DEF_TIMESLICE (100 * HZ / 1000)

/*
 * single value that denotes runtime == period, ie unlimited time.
 */
#define RUNTIME_INF ((u64)~0ULL)

static inline int rt_policy(int policy)
{
	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
		return 1;
	return 0;
}

static inline int task_has_rt_policy(struct task_struct *p)
{
	return rt_policy(p->policy);
}

/*
 * This is the priority-queue data structure of the RT scheduling class:
 */
struct rt_prio_array {
	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
	struct list_head queue[MAX_RT_PRIO];
};

struct rt_bandwidth {
	/* nests inside the rq lock: */
	raw_spinlock_t rt_runtime_lock;
	ktime_t rt_period;
	u64 rt_runtime;
	struct hrtimer rt_period_timer;
};

static struct rt_bandwidth def_rt_bandwidth;

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
	struct rt_bandwidth *rt_b =
		container_of(timer, struct rt_bandwidth, rt_period_timer);
	ktime_t now;
	int overrun;
	int idle = 0;

	for (;;) {
		now = hrtimer_cb_get_time(timer);
		overrun = hrtimer_forward(timer, now, rt_b->rt_period);

		if (!overrun)
			break;

		idle = do_sched_rt_period_timer(rt_b, overrun);
	}

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

static
void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
	rt_b->rt_period = ns_to_ktime(period);
	rt_b->rt_runtime = runtime;

	raw_spin_lock_init(&rt_b->rt_runtime_lock);

	hrtimer_init(&rt_b->rt_period_timer,
			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rt_b->rt_period_timer.function = sched_rt_period_timer;
}

static inline int rt_bandwidth_enabled(void)
{
	return sysctl_sched_rt_runtime >= 0;
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	ktime_t now;

	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return;

	if (hrtimer_active(&rt_b->rt_period_timer))
		return;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	for (;;) {
		unsigned long delta;
		ktime_t soft, hard;

		if (hrtimer_active(&rt_b->rt_period_timer))
			break;

		now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
		hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);

		soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
		hard = hrtimer_get_expires(&rt_b->rt_period_timer);
		delta = ktime_to_ns(ktime_sub(hard, soft));
		__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
				HRTIMER_MODE_ABS_PINNED, 0);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	hrtimer_cancel(&rt_b->rt_period_timer);
}
#endif

/*
 * sched_domains_mutex serializes calls to arch_init_sched_domains,
 * detach_destroy_domains and partition_sched_domains.
 */
static DEFINE_MUTEX(sched_domains_mutex);

#ifdef CONFIG_CGROUP_SCHED

#include <linux/cgroup.h>

struct cfs_rq;

static LIST_HEAD(task_groups);

/* task group related information */
struct task_group {
	struct cgroup_subsys_state css;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* schedulable entities of this group on each cpu */
	struct sched_entity **se;
	/* runqueue "owned" by this group on each cpu */
	struct cfs_rq **cfs_rq;
	unsigned long shares;
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	struct sched_rt_entity **rt_se;
	struct rt_rq **rt_rq;

	struct rt_bandwidth rt_bandwidth;
#endif

	struct rcu_head rcu;
	struct list_head list;

	struct task_group *parent;
	struct list_head siblings;
	struct list_head children;
};

#define root_task_group init_task_group

/* task_group_lock serializes add/remove of task groups and also changes to
 * a task group's cpu shares.
 */
static DEFINE_SPINLOCK(task_group_lock);

#ifdef CONFIG_FAIR_GROUP_SCHED

#ifdef CONFIG_SMP
static int root_task_group_empty(void)
{
	return list_empty(&root_task_group.children);
}
#endif

# define INIT_TASK_GROUP_LOAD NICE_0_LOAD

/*
 * A weight of 0 or 1 can cause arithmetics problems.
 * A weight of a cfs_rq is the sum of weights of which entities
 * are queued on this cfs_rq, so a weight of a entity should not be
 * too large, so as the shares value of a task group.
 * (The default weight is 1024 - so there's no practical
 * limitation from this.)
 */
#define MIN_SHARES 2
#define MAX_SHARES (1UL << 18)

static int init_task_group_load = INIT_TASK_GROUP_LOAD;
#endif

/* Default task group.
 * Every task in system belong to this group at bootup.
 */
struct task_group init_task_group;

/* return group to which a task belongs */
static inline struct task_group *task_group(struct task_struct *p)
{
	struct task_group *tg;

#ifdef CONFIG_CGROUP_SCHED
	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
			struct task_group, css);
#else
	tg = &init_task_group;
#endif
	return tg;
}

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
	p->se.parent = task_group(p)->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	p->rt.rt_rq = task_group(p)->rt_rq[cpu];
	p->rt.parent = task_group(p)->rt_se[cpu];
#endif
}

#else

static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline struct task_group *task_group(struct task_struct *p)
{
	return NULL;
}

#endif /* CONFIG_CGROUP_SCHED */

/* CFS-related fields in a runqueue */
struct cfs_rq {
	struct load_weight load;
	unsigned long nr_running;

	u64 exec_clock;
	u64 min_vruntime;

	struct rb_root tasks_timeline;
	struct rb_node *rb_leftmost;

	struct list_head tasks;
	struct list_head *balance_iterator;

	/*
	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
	struct sched_entity *curr, *next, *last;

	unsigned int nr_spread_over;

#ifdef CONFIG_FAIR_GROUP_SCHED
	struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */

	/*
	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
	 * list is used during load balance.
	 */
	struct list_head leaf_cfs_rq_list;
	struct task_group *tg; /* group that "owns" this runqueue */

#ifdef CONFIG_SMP
	/*
	 * the part of load.weight contributed by tasks
	 */
	unsigned long task_weight;

	/*
	 * h_load = weight * f(tg)
	 *
	 * Where f(tg) is the recursive weight fraction assigned to
	 * this group.
	 */
	unsigned long h_load;

	/*
	 * this cpu's part of tg->shares
	 */
	unsigned long shares;

	/*
	 * load.weight at the time we set shares
	 */
	unsigned long rq_weight;
#endif
#endif
};

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
	struct rt_prio_array active;
	unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
	struct {
		int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
		int next; /* next highest */
#endif
	} highest_prio;
#endif
#ifdef CONFIG_SMP
	unsigned long rt_nr_migratory;
	unsigned long rt_nr_total;
	int overloaded;
	struct plist_head pushable_tasks;
#endif
	int rt_throttled;
	u64 rt_time;
	u64 rt_runtime;
	/* Nests inside the rq lock: */
	raw_spinlock_t rt_runtime_lock;

#ifdef CONFIG_RT_GROUP_SCHED
	unsigned long rt_nr_boosted;

	struct rq *rq;
	struct list_head leaf_rt_rq_list;
	struct task_group *tg;
#endif
};

#ifdef CONFIG_SMP

/*
 * We add the notion of a root-domain which will be used to define per-domain
 * variables. Each exclusive cpuset essentially defines an island domain by
 * fully partitioning the member cpus from any other cpuset. Whenever a new
 * exclusive cpuset is created, we also create and attach a new root-domain
 * object.
 *
 */
struct root_domain {
	atomic_t refcount;
	cpumask_var_t span;
	cpumask_var_t online;

	/*
	 * The "RT overload" flag: it gets set if a CPU has more than
	 * one runnable RT task.
	 */
	cpumask_var_t rto_mask;
	atomic_t rto_count;
#ifdef CONFIG_SMP
	struct cpupri cpupri;
#endif
};

/*
 * By default the system creates a single root-domain with all cpus as
 * members (mimicking the global state we have today).
 */
static struct root_domain def_root_domain;

#endif

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct rq {
	/* runqueue lock: */
	raw_spinlock_t lock;

	/*
	 * nr_running and cpu_load should be in the same cacheline because
	 * remote CPUs use both these fields when doing load calculation.
	 */
	unsigned long nr_running;
#define CPU_LOAD_IDX_MAX 5
	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
#ifdef CONFIG_NO_HZ
	u64 nohz_stamp;
	unsigned char in_nohz_recently;
#endif
	unsigned int skip_clock_update;

	/* capture load from *all* tasks on this cpu: */
	struct load_weight load;
	unsigned long nr_load_updates;
	u64 nr_switches;

	struct cfs_rq cfs;
	struct rt_rq rt;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* list of leaf cfs_rq on this cpu: */
	struct list_head leaf_cfs_rq_list;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	struct list_head leaf_rt_rq_list;
#endif

	/*
	 * This is part of a global counter where only the total sum
	 * over all CPUs matters. A task can increase this counter on
	 * one CPU and if it got migrated afterwards it may decrease
	 * it on another CPU. Always updated under the runqueue lock:
	 */
	unsigned long nr_uninterruptible;

	struct task_struct *curr, *idle;
	unsigned long next_balance;
	struct mm_struct *prev_mm;

	u64 clock;

	atomic_t nr_iowait;

#ifdef CONFIG_SMP
	struct root_domain *rd;
	struct sched_domain *sd;

	unsigned char idle_at_tick;
	/* For active balancing */
	int post_schedule;
	int active_balance;
	int push_cpu;
	/* cpu of this runqueue: */
	int cpu;
	int online;

	unsigned long avg_load_per_task;

	struct task_struct *migration_thread;
	struct list_head migration_queue;

	u64 rt_avg;
	u64 age_stamp;
	u64 idle_stamp;
	u64 avg_idle;
#endif

	/* calc_load related fields */
	unsigned long calc_load_update;
	long calc_load_active;

#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
	int hrtick_csd_pending;
	struct call_single_data hrtick_csd;
#endif
	struct hrtimer hrtick_timer;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* latency stats */
	struct sched_info rq_sched_info;
	unsigned long long rq_cpu_time;
	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */

	/* sys_sched_yield() stats */
	unsigned int yld_count;

	/* schedule() stats */
	unsigned int sched_switch;
	unsigned int sched_count;
	unsigned int sched_goidle;

	/* try_to_wake_up() stats */
	unsigned int ttwu_count;
	unsigned int ttwu_local;

	/* BKL stats */
	unsigned int bkl_count;
#endif
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static inline
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	rq->curr->sched_class->check_preempt_curr(rq, p, flags);

	/*
	 * A queue event has occurred, and we're going to schedule. In
	 * this case, we can save a useless back to back clock update.
	 */
	if (test_tsk_need_resched(p))
		rq->skip_clock_update = 1;
}

static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
	return rq->cpu;
#else
	return 0;
#endif
}

#define rcu_dereference_check_sched_domain(p) \
	rcu_dereference_check((p), \
			rcu_read_lock_sched_held() || \
			lockdep_is_held(&sched_domains_mutex))

/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() (&__get_cpu_var(runqueues))
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() (&__raw_get_cpu_var(runqueues))

inline void update_rq_clock(struct rq *rq)
{
	if (!rq->skip_clock_update)
		rq->clock = sched_clock_cpu(cpu_of(rq));
}

/*
 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
 */
#ifdef CONFIG_SCHED_DEBUG
# define const_debug __read_mostly
#else
# define const_debug static const
#endif

/**
 * runqueue_is_locked
 * @cpu: the processor in question.
 *
 * Returns true if the current cpu runqueue is locked.
 * This interface allows printk to be called with the runqueue lock
 * held and know whether or not it is OK to wake up the klogd.
 */
int runqueue_is_locked(int cpu)
{
	return raw_spin_is_locked(&cpu_rq(cpu)->lock);
}

/*
 * Debugging: various feature bits
 */

#define SCHED_FEAT(name, enabled) \
	__SCHED_FEAT_##name ,

enum {
#include "sched_features.h"
};

#undef SCHED_FEAT

#define SCHED_FEAT(name, enabled) \
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "sched_features.h"
	0;

#undef SCHED_FEAT

#ifdef CONFIG_SCHED_DEBUG
#define SCHED_FEAT(name, enabled) \
	#name ,

static __read_mostly char *sched_feat_names[] = {
#include "sched_features.h"
	NULL
};

#undef SCHED_FEAT

static int sched_feat_show(struct seq_file *m, void *v)
{
	int i;

	for (i = 0; sched_feat_names[i]; i++) {
		if (!(sysctl_sched_features & (1UL << i)))
			seq_puts(m, "NO_");
		seq_printf(m, "%s ", sched_feat_names[i]);
	}
	seq_puts(m, "\n");

	return 0;
}

static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
		size_t cnt, loff_t *ppos)
{
	char buf[64];
	char *cmp = buf;
	int neg = 0;
	int i;

	if (cnt > 63)
		cnt = 63;

	if (copy_from_user(&buf, ubuf, cnt))
		return -EFAULT;

	buf[cnt] = 0;

	if (strncmp(buf, "NO_", 3) == 0) {
		neg = 1;
		cmp += 3;
	}

	for (i = 0; sched_feat_names[i]; i++) {
		int len = strlen(sched_feat_names[i]);

		if (strncmp(cmp, sched_feat_names[i], len) == 0) {
			if (neg)
				sysctl_sched_features &= ~(1UL << i);
			else
				sysctl_sched_features |= (1UL << i);
			break;
		}
	}

	if (!sched_feat_names[i])
		return -EINVAL;

	*ppos += cnt;

	return cnt;
}

static int sched_feat_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, sched_feat_show, NULL);
}

static const struct file_operations sched_feat_fops = {
	.open = sched_feat_open,
	.write = sched_feat_write,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static __init int sched_init_debug(void)
{
	debugfs_create_file("sched_features", 0644, NULL, NULL,
			&sched_feat_fops);

	return 0;
}
late_initcall(sched_init_debug);

#endif

#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * ratelimit for updating the group shares.
 * default: 0.25ms
 */
unsigned int sysctl_sched_shares_ratelimit = 250000;
unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;

/*
 * Inject some fuzzyness into changing the per-cpu group shares
 * this avoids remote rq-locks at the expense of fairness.
 * default: 4
 */
unsigned int sysctl_sched_shares_thresh = 4;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

static __read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

static inline u64 global_rt_period(void)
{
	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
}

static inline u64 global_rt_runtime(void)
{
	if (sysctl_sched_rt_runtime < 0)
		return RUNTIME_INF;

	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}

#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev) do { } while (0)
#endif

static inline int task_current(struct rq *rq, struct task_struct *p)
{
	return rq->curr == p;
}

#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
	return task_current(rq, p);
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
	/* this is a valid case when another task releases the spinlock */
	rq->lock.owner = current;
#endif
	/*
	 * If we are tracking spinlock dependencies then we have to
	 * fix up the runqueue lock - which gets 'carried over' from
	 * prev into current:
	 */
	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

	raw_spin_unlock_irq(&rq->lock);
}

#else /* __ARCH_WANT_UNLOCKED_CTXSW */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
	return p->oncpu;
#else
	return task_current(rq, p);
#endif
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
	/*
	 * We can optimise this out completely for !SMP, because the
	 * SMP rebalancing from interrupt is the only thing that cares
	 * here.
	 */
	next->oncpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	raw_spin_unlock_irq(&rq->lock);
#else
	raw_spin_unlock(&rq->lock);
#endif
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * After ->oncpu is cleared, the task can be moved to a different CPU.
	 * We must ensure this doesn't happen until the switch is completely
	 * finished.
	 */
	smp_wmb();
	prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */

/*
 * Check whether the task is waking, we use this to synchronize ->cpus_allowed
 * against ttwu().
 */
static inline int task_is_waking(struct task_struct *p)
{
	return unlikely(p->state == TASK_WAKING);
}

/*
 * __task_rq_lock - lock the runqueue a given task resides on.
 * Must be called interrupts disabled.
 */
static inline struct rq *__task_rq_lock(struct task_struct *p)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			return rq;
		raw_spin_unlock(&rq->lock);
	}
}

/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		local_irq_save(*flags);
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			return rq;
		raw_spin_unlock_irqrestore(&rq->lock, *flags);
	}
}

void task_rq_unlock_wait(struct task_struct *p)
{
	struct rq *rq = task_rq(p);

	smp_mb(); /* spin-unlock-wait is not a full memory barrier */
	raw_spin_unlock_wait(&rq->lock);
}

static void __task_rq_unlock(struct rq *rq)
	__releases(rq->lock)
{
	raw_spin_unlock(&rq->lock);
}

static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
	__releases(rq->lock)
{
	raw_spin_unlock_irqrestore(&rq->lock, *flags);
}

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static struct rq *this_rq_lock(void)
	__acquires(rq->lock)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	raw_spin_lock(&rq->lock);

	return rq;
}

#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 *
 * Its all a bit involved since we cannot program an hrt while holding the
 * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
 * reschedule event.
 *
 * When we get rescheduled we reprogram the hrtick_timer outside of the
 * rq->lock.
 */

/*
 * Use hrtick when:
 * - enabled by features
 * - hrtimer is actually high res
 */
static inline int hrtick_enabled(struct rq *rq)
{
	if (!sched_feat(HRTICK))
		return 0;
	if (!cpu_active(cpu_of(rq)))
		return 0;
	return hrtimer_is_hres_active(&rq->hrtick_timer);
}

static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	raw_spin_unlock(&rq->lock);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP
/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;

	raw_spin_lock(&rq->lock);
	hrtimer_restart(&rq->hrtick_timer);
	rq->hrtick_csd_pending = 0;
	raw_spin_unlock(&rq->lock);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
static void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);

	hrtimer_set_expires(timer, time);

	if (rq == this_rq()) {
		hrtimer_restart(timer);
	} else if (!rq->hrtick_csd_pending) {
		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
		rq->hrtick_csd_pending = 1;
	}
}

static int
hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int cpu = (int)(long)hcpu;

	switch (action) {
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		hrtick_clear(cpu_rq(cpu));
		return NOTIFY_OK;
	}

	return NOTIFY_DONE;
}

static __init void init_hrtick(void)
{
	hotcpu_notifier(hotplug_hrtick, 0);
}
#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
static void hrtick_start(struct rq *rq, u64 delay)
{
	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
			HRTIMER_MODE_REL_PINNED, 0);
}

static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */

static void init_rq_hrtick(struct rq *rq)
{
#ifdef CONFIG_SMP
	rq->hrtick_csd_pending = 0;

	rq->hrtick_csd.flags = 0;
	rq->hrtick_csd.func = __hrtick_start;
	rq->hrtick_csd.info = rq;
#endif

	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1134 rq->hrtick_timer.function = hrtick; 1134 rq->hrtick_timer.function = hrtick;
1135 } 1135 }
1136 #else /* CONFIG_SCHED_HRTICK */ 1136 #else /* CONFIG_SCHED_HRTICK */
1137 static inline void hrtick_clear(struct rq *rq) 1137 static inline void hrtick_clear(struct rq *rq)
1138 { 1138 {
1139 } 1139 }
1140 1140
1141 static inline void init_rq_hrtick(struct rq *rq) 1141 static inline void init_rq_hrtick(struct rq *rq)
1142 { 1142 {
1143 } 1143 }
1144 1144
1145 static inline void init_hrtick(void) 1145 static inline void init_hrtick(void)
1146 { 1146 {
1147 } 1147 }
1148 #endif /* CONFIG_SCHED_HRTICK */ 1148 #endif /* CONFIG_SCHED_HRTICK */
1149 1149
1150 /* 1150 /*
1151 * resched_task - mark a task 'to be rescheduled now'. 1151 * resched_task - mark a task 'to be rescheduled now'.
1152 * 1152 *
1153 * On UP this means the setting of the need_resched flag, on SMP it 1153 * On UP this means the setting of the need_resched flag, on SMP it
1154 * might also involve a cross-CPU call to trigger the scheduler on 1154 * might also involve a cross-CPU call to trigger the scheduler on
1155 * the target CPU. 1155 * the target CPU.
1156 */ 1156 */
1157 #ifdef CONFIG_SMP 1157 #ifdef CONFIG_SMP
1158 1158
1159 #ifndef tsk_is_polling 1159 #ifndef tsk_is_polling
1160 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1160 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1161 #endif 1161 #endif
1162 1162
1163 static void resched_task(struct task_struct *p) 1163 static void resched_task(struct task_struct *p)
1164 { 1164 {
1165 int cpu; 1165 int cpu;
1166 1166
1167 assert_raw_spin_locked(&task_rq(p)->lock); 1167 assert_raw_spin_locked(&task_rq(p)->lock);
1168 1168
1169 if (test_tsk_need_resched(p)) 1169 if (test_tsk_need_resched(p))
1170 return; 1170 return;
1171 1171
1172 set_tsk_need_resched(p); 1172 set_tsk_need_resched(p);
1173 1173
1174 cpu = task_cpu(p); 1174 cpu = task_cpu(p);
1175 if (cpu == smp_processor_id()) 1175 if (cpu == smp_processor_id())
1176 return; 1176 return;
1177 1177
1178 /* NEED_RESCHED must be visible before we test polling */ 1178 /* NEED_RESCHED must be visible before we test polling */
1179 smp_mb(); 1179 smp_mb();
1180 if (!tsk_is_polling(p)) 1180 if (!tsk_is_polling(p))
1181 smp_send_reschedule(cpu); 1181 smp_send_reschedule(cpu);
1182 } 1182 }
1183 1183
1184 static void resched_cpu(int cpu) 1184 static void resched_cpu(int cpu)
1185 { 1185 {
1186 struct rq *rq = cpu_rq(cpu); 1186 struct rq *rq = cpu_rq(cpu);
1187 unsigned long flags; 1187 unsigned long flags;
1188 1188
1189 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 1189 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1190 return; 1190 return;
1191 resched_task(cpu_curr(cpu)); 1191 resched_task(cpu_curr(cpu));
1192 raw_spin_unlock_irqrestore(&rq->lock, flags); 1192 raw_spin_unlock_irqrestore(&rq->lock, flags);
1193 } 1193 }
1194 1194
1195 #ifdef CONFIG_NO_HZ 1195 #ifdef CONFIG_NO_HZ
1196 /* 1196 /*
1197 * When add_timer_on() enqueues a timer into the timer wheel of an 1197 * When add_timer_on() enqueues a timer into the timer wheel of an
1198 * idle CPU then this timer might expire before the next timer event 1198 * idle CPU then this timer might expire before the next timer event
1199 * which is scheduled to wake up that CPU. In case of a completely 1199 * which is scheduled to wake up that CPU. In case of a completely
1200 * idle system the next event might even be an infinite time into the 1200 * idle system the next event might even be an infinite time into the
1201 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 1201 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1202 * leaves the inner idle loop so the newly added timer is taken into 1202 * leaves the inner idle loop so the newly added timer is taken into
1203 * account when the CPU goes back to idle and evaluates the timer 1203 * account when the CPU goes back to idle and evaluates the timer
1204 * wheel for the next timer event. 1204 * wheel for the next timer event.
1205 */ 1205 */
1206 void wake_up_idle_cpu(int cpu) 1206 void wake_up_idle_cpu(int cpu)
1207 { 1207 {
1208 struct rq *rq = cpu_rq(cpu); 1208 struct rq *rq = cpu_rq(cpu);
1209 1209
1210 if (cpu == smp_processor_id()) 1210 if (cpu == smp_processor_id())
1211 return; 1211 return;
1212 1212
1213 /* 1213 /*
1214 * This is safe, as this function is called with the timer 1214 * This is safe, as this function is called with the timer
1215 * wheel base lock of (cpu) held. When the CPU is on the way 1215 * wheel base lock of (cpu) held. When the CPU is on the way
1216 * to idle and has not yet set rq->curr to idle then it will 1216 * to idle and has not yet set rq->curr to idle then it will
1217 * be serialized on the timer wheel base lock and take the new 1217 * be serialized on the timer wheel base lock and take the new
1218 * timer into account automatically. 1218 * timer into account automatically.
1219 */ 1219 */
1220 if (rq->curr != rq->idle) 1220 if (rq->curr != rq->idle)
1221 return; 1221 return;
1222 1222
1223 /* 1223 /*
1224 * We can set TIF_RESCHED on the idle task of the other CPU 1224 * We can set TIF_RESCHED on the idle task of the other CPU
1225 * lockless. The worst case is that the other CPU runs the 1225 * lockless. The worst case is that the other CPU runs the
1226 * idle task through an additional NOOP schedule() 1226 * idle task through an additional NOOP schedule()
1227 */ 1227 */
1228 set_tsk_need_resched(rq->idle); 1228 set_tsk_need_resched(rq->idle);
1229 1229
1230 /* NEED_RESCHED must be visible before we test polling */ 1230 /* NEED_RESCHED must be visible before we test polling */
1231 smp_mb(); 1231 smp_mb();
1232 if (!tsk_is_polling(rq->idle)) 1232 if (!tsk_is_polling(rq->idle))
1233 smp_send_reschedule(cpu); 1233 smp_send_reschedule(cpu);
1234 } 1234 }
1235 1235
1236 int nohz_ratelimit(int cpu) 1236 int nohz_ratelimit(int cpu)
1237 { 1237 {
1238 struct rq *rq = cpu_rq(cpu); 1238 struct rq *rq = cpu_rq(cpu);
1239 u64 diff = rq->clock - rq->nohz_stamp; 1239 u64 diff = rq->clock - rq->nohz_stamp;
1240 1240
1241 rq->nohz_stamp = rq->clock; 1241 rq->nohz_stamp = rq->clock;
1242 1242
1243 return diff < (NSEC_PER_SEC / HZ) >> 1; 1243 return diff < (NSEC_PER_SEC / HZ) >> 1;
1244 } 1244 }
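
A short userspace sketch of the arithmetic above, for clarity: nohz_ratelimit() answers "has less than half a tick passed since the last check?", which callers can use to rate-limit how often a CPU tries to enter NO_HZ idle. HZ and the timestamps here are stand-ins for the example, not the kernel's values.

#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL
#define HZ 1000 /* assumed tick rate for the example */

/* Returns nonzero when less than half a tick elapsed since the last call. */
static int nohz_ratelimit_example(unsigned long long *stamp, unsigned long long now)
{
    unsigned long long diff = now - *stamp;

    *stamp = now;
    return diff < (NSEC_PER_SEC / HZ) >> 1;
}

int main(void)
{
    unsigned long long stamp = 0;

    printf("%d\n", nohz_ratelimit_example(&stamp, 1000000));          /* 1ms elapsed: not limited */
    printf("%d\n", nohz_ratelimit_example(&stamp, 1000000 + 400000)); /* 0.4ms later: limited */
    return 0;
}
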
1245 1245
1246 #endif /* CONFIG_NO_HZ */ 1246 #endif /* CONFIG_NO_HZ */
1247 1247
1248 static u64 sched_avg_period(void) 1248 static u64 sched_avg_period(void)
1249 { 1249 {
1250 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 1250 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1251 } 1251 }
1252 1252
1253 static void sched_avg_update(struct rq *rq) 1253 static void sched_avg_update(struct rq *rq)
1254 { 1254 {
1255 s64 period = sched_avg_period(); 1255 s64 period = sched_avg_period();
1256 1256
1257 while ((s64)(rq->clock - rq->age_stamp) > period) { 1257 while ((s64)(rq->clock - rq->age_stamp) > period) {
1258 rq->age_stamp += period; 1258 rq->age_stamp += period;
1259 rq->rt_avg /= 2; 1259 rq->rt_avg /= 2;
1260 } 1260 }
1261 } 1261 }
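
sched_avg_update() ages rq->rt_avg by halving it once for every elapsed sched_avg_period(), so old RT runtime decays geometrically. A hedged userspace sketch of that decay; the sysctl value below is an example, not necessarily the kernel default.

#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

static unsigned long long sysctl_sched_time_avg = 1000; /* ms, example value */

static unsigned long long sched_avg_period_example(void)
{
    return sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}

/* Halve rt_avg once for every full period between age_stamp and clock. */
static void sched_avg_update_example(unsigned long long clock,
                                     unsigned long long *age_stamp,
                                     unsigned long long *rt_avg)
{
    unsigned long long period = sched_avg_period_example();

    while (clock - *age_stamp > period) {
        *age_stamp += period;
        *rt_avg /= 2;
    }
}

int main(void)
{
    unsigned long long age_stamp = 0, rt_avg = 800;

    /* Three full periods elapse: 800 -> 400 -> 200 -> 100. */
    sched_avg_update_example(3 * sched_avg_period_example() + 1, &age_stamp, &rt_avg);
    printf("rt_avg = %llu\n", rt_avg);
    return 0;
}
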
1262 1262
1263 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1263 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1264 { 1264 {
1265 rq->rt_avg += rt_delta; 1265 rq->rt_avg += rt_delta;
1266 sched_avg_update(rq); 1266 sched_avg_update(rq);
1267 } 1267 }
1268 1268
1269 #else /* !CONFIG_SMP */ 1269 #else /* !CONFIG_SMP */
1270 static void resched_task(struct task_struct *p) 1270 static void resched_task(struct task_struct *p)
1271 { 1271 {
1272 assert_raw_spin_locked(&task_rq(p)->lock); 1272 assert_raw_spin_locked(&task_rq(p)->lock);
1273 set_tsk_need_resched(p); 1273 set_tsk_need_resched(p);
1274 } 1274 }
1275 1275
1276 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1276 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1277 { 1277 {
1278 } 1278 }
1279 #endif /* CONFIG_SMP */ 1279 #endif /* CONFIG_SMP */
1280 1280
1281 #if BITS_PER_LONG == 32 1281 #if BITS_PER_LONG == 32
1282 # define WMULT_CONST (~0UL) 1282 # define WMULT_CONST (~0UL)
1283 #else 1283 #else
1284 # define WMULT_CONST (1UL << 32) 1284 # define WMULT_CONST (1UL << 32)
1285 #endif 1285 #endif
1286 1286
1287 #define WMULT_SHIFT 32 1287 #define WMULT_SHIFT 32
1288 1288
1289 /* 1289 /*
1290 * Shift right and round: 1290 * Shift right and round:
1291 */ 1291 */
1292 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1292 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1293 1293
1294 /* 1294 /*
1295 * delta *= weight / lw 1295 * delta *= weight / lw
1296 */ 1296 */
1297 static unsigned long 1297 static unsigned long
1298 calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1298 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1299 struct load_weight *lw) 1299 struct load_weight *lw)
1300 { 1300 {
1301 u64 tmp; 1301 u64 tmp;
1302 1302
1303 if (!lw->inv_weight) { 1303 if (!lw->inv_weight) {
1304 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1304 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1305 lw->inv_weight = 1; 1305 lw->inv_weight = 1;
1306 else 1306 else
1307 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1307 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1308 / (lw->weight+1); 1308 / (lw->weight+1);
1309 } 1309 }
1310 1310
1311 tmp = (u64)delta_exec * weight; 1311 tmp = (u64)delta_exec * weight;
1312 /* 1312 /*
1313 * Check whether we'd overflow the 64-bit multiplication: 1313 * Check whether we'd overflow the 64-bit multiplication:
1314 */ 1314 */
1315 if (unlikely(tmp > WMULT_CONST)) 1315 if (unlikely(tmp > WMULT_CONST))
1316 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 1316 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1317 WMULT_SHIFT/2); 1317 WMULT_SHIFT/2);
1318 else 1318 else
1319 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); 1319 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1320 1320
1321 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1321 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1322 } 1322 }
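
calc_delta_mine() scales delta_exec by weight/lw->weight using a precomputed approximate inverse of lw->weight and the rounding shift SRR(), so no 64-bit division is needed on the hot path. A minimal standalone sketch of the same fixed-point trick (it omits the overflow guard the kernel applies by splitting the shift when tmp exceeds WMULT_CONST):

#include <stdio.h>

#define WMULT_CONST (1ULL << 32)
#define WMULT_SHIFT 32

/* Shift right and round. */
#define SRR(x, y) (((x) + (1ULL << ((y) - 1))) >> (y))

/* delta * weight / lw_weight, via an approximate 2^32 / lw_weight inverse. */
static unsigned long long calc_delta_example(unsigned long long delta,
                                             unsigned long weight,
                                             unsigned long lw_weight)
{
    unsigned long long inv = 1 + (WMULT_CONST - lw_weight / 2) / (lw_weight + 1);
    unsigned long long tmp = delta * weight;

    return SRR(tmp * inv, WMULT_SHIFT);
}

int main(void)
{
    /* A nice-0 task (weight 1024) on a runqueue of total weight 2048
     * is credited roughly half of a 6ms slice. */
    printf("%llu\n", calc_delta_example(6000000, 1024, 2048));

    /* Exact division for comparison; the cached inverse is slightly low. */
    printf("%llu\n", 6000000ULL * 1024 / 2048);
    return 0;
}
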
1323 1323
1324 static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1324 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1325 { 1325 {
1326 lw->weight += inc; 1326 lw->weight += inc;
1327 lw->inv_weight = 0; 1327 lw->inv_weight = 0;
1328 } 1328 }
1329 1329
1330 static inline void update_load_sub(struct load_weight *lw, unsigned long dec) 1330 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1331 { 1331 {
1332 lw->weight -= dec; 1332 lw->weight -= dec;
1333 lw->inv_weight = 0; 1333 lw->inv_weight = 0;
1334 } 1334 }
1335 1335
1336 /* 1336 /*
1337 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1337 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1338 * of tasks with abnormal "nice" values across CPUs the contribution that 1338 * of tasks with abnormal "nice" values across CPUs the contribution that
1339 * each task makes to its run queue's load is weighted according to its 1339 * each task makes to its run queue's load is weighted according to its
1340 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a 1340 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1341 * scaled version of the new time slice allocation that they receive on time 1341 * scaled version of the new time slice allocation that they receive on time
1342 * slice expiry etc. 1342 * slice expiry etc.
1343 */ 1343 */
1344 1344
1345 #define WEIGHT_IDLEPRIO 3 1345 #define WEIGHT_IDLEPRIO 3
1346 #define WMULT_IDLEPRIO 1431655765 1346 #define WMULT_IDLEPRIO 1431655765
1347 1347
1348 /* 1348 /*
1349 * Nice levels are multiplicative, with a gentle 10% change for every 1349 * Nice levels are multiplicative, with a gentle 10% change for every
1350 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to 1350 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1351 * nice 1, it will get ~10% less CPU time than another CPU-bound task 1351 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1352 * that remained on nice 0. 1352 * that remained on nice 0.
1353 * 1353 *
1354 * The "10% effect" is relative and cumulative: from _any_ nice level, 1354 * The "10% effect" is relative and cumulative: from _any_ nice level,
1355 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 1355 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1356 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. 1356 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1357 * If a task goes up by ~10% and another task goes down by ~10% then 1357 * If a task goes up by ~10% and another task goes down by ~10% then
1358 * the relative distance between them is ~25%.) 1358 * the relative distance between them is ~25%.)
1359 */ 1359 */
1360 static const int prio_to_weight[40] = { 1360 static const int prio_to_weight[40] = {
1361 /* -20 */ 88761, 71755, 56483, 46273, 36291, 1361 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1362 /* -15 */ 29154, 23254, 18705, 14949, 11916, 1362 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1363 /* -10 */ 9548, 7620, 6100, 4904, 3906, 1363 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1364 /* -5 */ 3121, 2501, 1991, 1586, 1277, 1364 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1365 /* 0 */ 1024, 820, 655, 526, 423, 1365 /* 0 */ 1024, 820, 655, 526, 423,
1366 /* 5 */ 335, 272, 215, 172, 137, 1366 /* 5 */ 335, 272, 215, 172, 137,
1367 /* 10 */ 110, 87, 70, 56, 45, 1367 /* 10 */ 110, 87, 70, 56, 45,
1368 /* 15 */ 36, 29, 23, 18, 15, 1368 /* 15 */ 36, 29, 23, 18, 15,
1369 }; 1369 };
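
The table is essentially 1024 scaled by ~1.25 per nice step in either direction, which is what produces the "~10% CPU per nice level" behaviour described in the comment above. A quick way to see where the numbers come from; the printed values match the table only approximately because the table entries are hand-rounded.

#include <stdio.h>
#include <math.h>

int main(void)
{
    int nice;

    /* weight(nice) ~= 1024 / 1.25^nice, for nice in [-20, 19] */
    for (nice = -20; nice <= 19; nice++)
        printf("nice %3d -> ~%.0f\n", nice, 1024.0 / pow(1.25, nice));
    return 0;
}
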
1370 1370
1371 /* 1371 /*
1372 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 1372 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1373 * 1373 *
1374 * In cases where the weight does not change often, we can use the 1374 * In cases where the weight does not change often, we can use the
1375 * precalculated inverse to speed up arithmetic by turning divisions 1375 * precalculated inverse to speed up arithmetic by turning divisions
1376 * into multiplications: 1376 * into multiplications:
1377 */ 1377 */
1378 static const u32 prio_to_wmult[40] = { 1378 static const u32 prio_to_wmult[40] = {
1379 /* -20 */ 48388, 59856, 76040, 92818, 118348, 1379 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1380 /* -15 */ 147320, 184698, 229616, 287308, 360437, 1380 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1381 /* -10 */ 449829, 563644, 704093, 875809, 1099582, 1381 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1382 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, 1382 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1383 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, 1383 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1384 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, 1384 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1385 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 1385 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1386 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1386 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1387 }; 1387 };
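
Each prio_to_wmult[] entry is 2^32 divided by the corresponding prio_to_weight[] entry, which is what lets calc_delta_mine() replace a division by weight with a multiply and a shift. A small verification sketch over a few rows, with the values copied from the arrays above:

#include <stdio.h>

int main(void)
{
    static const unsigned long weight[] = { 88761, 9548, 1024, 110, 15 };
    static const unsigned long wmult[]  = { 48388, 449829, 4194304, 39045157, 286331153 };
    int i;

    for (i = 0; i < 5; i++)
        printf("2^32/%lu = %llu (table: %lu)\n",
               weight[i], (1ULL << 32) / weight[i], wmult[i]);
    return 0;
}
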
1388 1388
1389 /* Time spent by the tasks of the cpu accounting group executing in ... */ 1389 /* Time spent by the tasks of the cpu accounting group executing in ... */
1390 enum cpuacct_stat_index { 1390 enum cpuacct_stat_index {
1391 CPUACCT_STAT_USER, /* ... user mode */ 1391 CPUACCT_STAT_USER, /* ... user mode */
1392 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 1392 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1393 1393
1394 CPUACCT_STAT_NSTATS, 1394 CPUACCT_STAT_NSTATS,
1395 }; 1395 };
1396 1396
1397 #ifdef CONFIG_CGROUP_CPUACCT 1397 #ifdef CONFIG_CGROUP_CPUACCT
1398 static void cpuacct_charge(struct task_struct *tsk, u64 cputime); 1398 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1399 static void cpuacct_update_stats(struct task_struct *tsk, 1399 static void cpuacct_update_stats(struct task_struct *tsk,
1400 enum cpuacct_stat_index idx, cputime_t val); 1400 enum cpuacct_stat_index idx, cputime_t val);
1401 #else 1401 #else
1402 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1402 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1403 static inline void cpuacct_update_stats(struct task_struct *tsk, 1403 static inline void cpuacct_update_stats(struct task_struct *tsk,
1404 enum cpuacct_stat_index idx, cputime_t val) {} 1404 enum cpuacct_stat_index idx, cputime_t val) {}
1405 #endif 1405 #endif
1406 1406
1407 static inline void inc_cpu_load(struct rq *rq, unsigned long load) 1407 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1408 { 1408 {
1409 update_load_add(&rq->load, load); 1409 update_load_add(&rq->load, load);
1410 } 1410 }
1411 1411
1412 static inline void dec_cpu_load(struct rq *rq, unsigned long load) 1412 static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1413 { 1413 {
1414 update_load_sub(&rq->load, load); 1414 update_load_sub(&rq->load, load);
1415 } 1415 }
1416 1416
1417 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) 1417 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1418 typedef int (*tg_visitor)(struct task_group *, void *); 1418 typedef int (*tg_visitor)(struct task_group *, void *);
1419 1419
1420 /* 1420 /*
1421 * Iterate the full tree, calling @down when first entering a node and @up when 1421 * Iterate the full tree, calling @down when first entering a node and @up when
1422 * leaving it for the final time. 1422 * leaving it for the final time.
1423 */ 1423 */
1424 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 1424 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1425 { 1425 {
1426 struct task_group *parent, *child; 1426 struct task_group *parent, *child;
1427 int ret; 1427 int ret;
1428 1428
1429 rcu_read_lock(); 1429 rcu_read_lock();
1430 parent = &root_task_group; 1430 parent = &root_task_group;
1431 down: 1431 down:
1432 ret = (*down)(parent, data); 1432 ret = (*down)(parent, data);
1433 if (ret) 1433 if (ret)
1434 goto out_unlock; 1434 goto out_unlock;
1435 list_for_each_entry_rcu(child, &parent->children, siblings) { 1435 list_for_each_entry_rcu(child, &parent->children, siblings) {
1436 parent = child; 1436 parent = child;
1437 goto down; 1437 goto down;
1438 1438
1439 up: 1439 up:
1440 continue; 1440 continue;
1441 } 1441 }
1442 ret = (*up)(parent, data); 1442 ret = (*up)(parent, data);
1443 if (ret) 1443 if (ret)
1444 goto out_unlock; 1444 goto out_unlock;
1445 1445
1446 child = parent; 1446 child = parent;
1447 parent = parent->parent; 1447 parent = parent->parent;
1448 if (parent) 1448 if (parent)
1449 goto up; 1449 goto up;
1450 out_unlock: 1450 out_unlock:
1451 rcu_read_unlock(); 1451 rcu_read_unlock();
1452 1452
1453 return ret; 1453 return ret;
1454 } 1454 }
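
The goto-based loop in walk_tg_tree() is an iterative depth-first traversal: @down fires when a group is first entered, @up when it is left for the final time, and a nonzero return from either aborts the walk. The same visiting order can be written recursively; this is a hedged userspace sketch with made-up node types, not the kernel's task_group machinery.

#include <stdio.h>
#include <stddef.h>

struct node {
    const char *name;
    struct node *child[4];  /* NULL-terminated list of children */
};

typedef int (*visitor)(struct node *, void *);

/* Recursive equivalent of the down/up traversal: down on entry, children
 * in order, up on the final exit; any nonzero return stops the walk. */
static int walk_tree(struct node *n, visitor down, visitor up, void *data)
{
    int i, ret = down(n, data);

    if (ret)
        return ret;
    for (i = 0; n->child[i]; i++) {
        ret = walk_tree(n->child[i], down, up, data);
        if (ret)
            return ret;
    }
    return up(n, data);
}

static int print_down(struct node *n, void *data) { printf("down %s\n", n->name); return 0; }
static int print_up(struct node *n, void *data)   { printf("up   %s\n", n->name); return 0; }

int main(void)
{
    struct node b = { "B", { NULL } }, c = { "C", { NULL } };
    struct node root = { "root", { &b, &c, NULL } };

    walk_tree(&root, print_down, print_up, NULL);
    return 0;
}
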
1455 1455
1456 static int tg_nop(struct task_group *tg, void *data) 1456 static int tg_nop(struct task_group *tg, void *data)
1457 { 1457 {
1458 return 0; 1458 return 0;
1459 } 1459 }
1460 #endif 1460 #endif
1461 1461
1462 #ifdef CONFIG_SMP 1462 #ifdef CONFIG_SMP
1463 /* Used instead of source_load when we know the type == 0 */ 1463 /* Used instead of source_load when we know the type == 0 */
1464 static unsigned long weighted_cpuload(const int cpu) 1464 static unsigned long weighted_cpuload(const int cpu)
1465 { 1465 {
1466 return cpu_rq(cpu)->load.weight; 1466 return cpu_rq(cpu)->load.weight;
1467 } 1467 }
1468 1468
1469 /* 1469 /*
1470 * Return a low guess at the load of a migration-source cpu weighted 1470 * Return a low guess at the load of a migration-source cpu weighted
1471 * according to the scheduling class and "nice" value. 1471 * according to the scheduling class and "nice" value.
1472 * 1472 *
1473 * We want to under-estimate the load of migration sources, to 1473 * We want to under-estimate the load of migration sources, to
1474 * balance conservatively. 1474 * balance conservatively.
1475 */ 1475 */
1476 static unsigned long source_load(int cpu, int type) 1476 static unsigned long source_load(int cpu, int type)
1477 { 1477 {
1478 struct rq *rq = cpu_rq(cpu); 1478 struct rq *rq = cpu_rq(cpu);
1479 unsigned long total = weighted_cpuload(cpu); 1479 unsigned long total = weighted_cpuload(cpu);
1480 1480
1481 if (type == 0 || !sched_feat(LB_BIAS)) 1481 if (type == 0 || !sched_feat(LB_BIAS))
1482 return total; 1482 return total;
1483 1483
1484 return min(rq->cpu_load[type-1], total); 1484 return min(rq->cpu_load[type-1], total);
1485 } 1485 }
1486 1486
1487 /* 1487 /*
1488 * Return a high guess at the load of a migration-target cpu weighted 1488 * Return a high guess at the load of a migration-target cpu weighted
1489 * according to the scheduling class and "nice" value. 1489 * according to the scheduling class and "nice" value.
1490 */ 1490 */
1491 static unsigned long target_load(int cpu, int type) 1491 static unsigned long target_load(int cpu, int type)
1492 { 1492 {
1493 struct rq *rq = cpu_rq(cpu); 1493 struct rq *rq = cpu_rq(cpu);
1494 unsigned long total = weighted_cpuload(cpu); 1494 unsigned long total = weighted_cpuload(cpu);
1495 1495
1496 if (type == 0 || !sched_feat(LB_BIAS)) 1496 if (type == 0 || !sched_feat(LB_BIAS))
1497 return total; 1497 return total;
1498 1498
1499 return max(rq->cpu_load[type-1], total); 1499 return max(rq->cpu_load[type-1], total);
1500 } 1500 }
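
When load biasing is enabled, source_load() and target_load() skew the same weighted_cpuload() figure in opposite directions: the source takes the minimum of the decayed cpu_load[] history and the instantaneous load (a low guess), the target takes the maximum (a high guess), which makes the balancer reluctant to migrate on short spikes. A tiny numeric sketch of the asymmetry, with made-up load values:

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

int main(void)
{
    unsigned long history = 2048;   /* decayed cpu_load[type-1] */
    unsigned long instant = 1024;   /* current rq->load.weight */

    /* The same CPU looks lighter as a migration source ... */
    printf("source_load ~ %lu\n", min_ul(history, instant));
    /* ... and heavier as a migration target. */
    printf("target_load ~ %lu\n", max_ul(history, instant));
    return 0;
}
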
1501 1501
1502 static struct sched_group *group_of(int cpu) 1502 static struct sched_group *group_of(int cpu)
1503 { 1503 {
1504 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd); 1504 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1505 1505
1506 if (!sd) 1506 if (!sd)
1507 return NULL; 1507 return NULL;
1508 1508
1509 return sd->groups; 1509 return sd->groups;
1510 } 1510 }
1511 1511
1512 static unsigned long power_of(int cpu) 1512 static unsigned long power_of(int cpu)
1513 { 1513 {
1514 struct sched_group *group = group_of(cpu); 1514 struct sched_group *group = group_of(cpu);
1515 1515
1516 if (!group) 1516 if (!group)
1517 return SCHED_LOAD_SCALE; 1517 return SCHED_LOAD_SCALE;
1518 1518
1519 return group->cpu_power; 1519 return group->cpu_power;
1520 } 1520 }
1521 1521
1522 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1522 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1523 1523
1524 static unsigned long cpu_avg_load_per_task(int cpu) 1524 static unsigned long cpu_avg_load_per_task(int cpu)
1525 { 1525 {
1526 struct rq *rq = cpu_rq(cpu); 1526 struct rq *rq = cpu_rq(cpu);
1527 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 1527 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1528 1528
1529 if (nr_running) 1529 if (nr_running)
1530 rq->avg_load_per_task = rq->load.weight / nr_running; 1530 rq->avg_load_per_task = rq->load.weight / nr_running;
1531 else 1531 else
1532 rq->avg_load_per_task = 0; 1532 rq->avg_load_per_task = 0;
1533 1533
1534 return rq->avg_load_per_task; 1534 return rq->avg_load_per_task;
1535 } 1535 }
1536 1536
1537 #ifdef CONFIG_FAIR_GROUP_SCHED 1537 #ifdef CONFIG_FAIR_GROUP_SCHED
1538 1538
1539 static __read_mostly unsigned long __percpu *update_shares_data; 1539 static __read_mostly unsigned long __percpu *update_shares_data;
1540 1540
1541 static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1541 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1542 1542
1543 /* 1543 /*
1544 * Calculate and set the cpu's group shares. 1544 * Calculate and set the cpu's group shares.
1545 */ 1545 */
1546 static void update_group_shares_cpu(struct task_group *tg, int cpu, 1546 static void update_group_shares_cpu(struct task_group *tg, int cpu,
1547 unsigned long sd_shares, 1547 unsigned long sd_shares,
1548 unsigned long sd_rq_weight, 1548 unsigned long sd_rq_weight,
1549 unsigned long *usd_rq_weight) 1549 unsigned long *usd_rq_weight)
1550 { 1550 {
1551 unsigned long shares, rq_weight; 1551 unsigned long shares, rq_weight;
1552 int boost = 0; 1552 int boost = 0;
1553 1553
1554 rq_weight = usd_rq_weight[cpu]; 1554 rq_weight = usd_rq_weight[cpu];
1555 if (!rq_weight) { 1555 if (!rq_weight) {
1556 boost = 1; 1556 boost = 1;
1557 rq_weight = NICE_0_LOAD; 1557 rq_weight = NICE_0_LOAD;
1558 } 1558 }
1559 1559
1560 /* 1560 /*
1561 * \Sum_j shares_j * rq_weight_i 1561 * \Sum_j shares_j * rq_weight_i
1562 * shares_i = ----------------------------- 1562 * shares_i = -----------------------------
1563 * \Sum_j rq_weight_j 1563 * \Sum_j rq_weight_j
1564 */ 1564 */
1565 shares = (sd_shares * rq_weight) / sd_rq_weight; 1565 shares = (sd_shares * rq_weight) / sd_rq_weight;
1566 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1566 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1567 1567
1568 if (abs(shares - tg->se[cpu]->load.weight) > 1568 if (abs(shares - tg->se[cpu]->load.weight) >
1569 sysctl_sched_shares_thresh) { 1569 sysctl_sched_shares_thresh) {
1570 struct rq *rq = cpu_rq(cpu); 1570 struct rq *rq = cpu_rq(cpu);
1571 unsigned long flags; 1571 unsigned long flags;
1572 1572
1573 raw_spin_lock_irqsave(&rq->lock, flags); 1573 raw_spin_lock_irqsave(&rq->lock, flags);
1574 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; 1574 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1575 tg->cfs_rq[cpu]->shares = boost ? 0 : shares; 1575 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1576 __set_se_shares(tg->se[cpu], shares); 1576 __set_se_shares(tg->se[cpu], shares);
1577 raw_spin_unlock_irqrestore(&rq->lock, flags); 1577 raw_spin_unlock_irqrestore(&rq->lock, flags);
1578 } 1578 }
1579 } 1579 }
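
The ASCII formula above distributes a group's shares across CPUs in proportion to the group's runqueue weight on each CPU, then clamps the result. A worked userspace sketch for a two-CPU group; the clamp bounds here are placeholders for the example, not necessarily the kernel's MIN_SHARES/MAX_SHARES.

#include <stdio.h>

#define MIN_SHARES_EX 2
#define MAX_SHARES_EX (1UL << 18)

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
    return v < lo ? lo : (v > hi ? hi : v);
}

/* shares_i = sd_shares * rq_weight_i / sd_rq_weight */
static unsigned long group_share(unsigned long sd_shares,
                                 unsigned long rq_weight_i,
                                 unsigned long sd_rq_weight)
{
    return clamp_ul(sd_shares * rq_weight_i / sd_rq_weight,
                    MIN_SHARES_EX, MAX_SHARES_EX);
}

int main(void)
{
    /* Group with 1024 shares, 3072 total weight: 2048 on CPU0, 1024 on CPU1. */
    printf("cpu0: %lu\n", group_share(1024, 2048, 3072));  /* ~682 */
    printf("cpu1: %lu\n", group_share(1024, 1024, 3072));  /* ~341 */
    return 0;
}
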
1580 1580
1581 /* 1581 /*
1582 * Re-compute each task group's per-cpu shares over the given domain. 1582 * Re-compute each task group's per-cpu shares over the given domain.
1583 * This needs to be done in a bottom-up fashion because the rq weight of a 1583 * This needs to be done in a bottom-up fashion because the rq weight of a
1584 * parent group depends on the shares of its child groups. 1584 * parent group depends on the shares of its child groups.
1585 */ 1585 */
1586 static int tg_shares_up(struct task_group *tg, void *data) 1586 static int tg_shares_up(struct task_group *tg, void *data)
1587 { 1587 {
1588 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; 1588 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1589 unsigned long *usd_rq_weight; 1589 unsigned long *usd_rq_weight;
1590 struct sched_domain *sd = data; 1590 struct sched_domain *sd = data;
1591 unsigned long flags; 1591 unsigned long flags;
1592 int i; 1592 int i;
1593 1593
1594 if (!tg->se[0]) 1594 if (!tg->se[0])
1595 return 0; 1595 return 0;
1596 1596
1597 local_irq_save(flags); 1597 local_irq_save(flags);
1598 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); 1598 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1599 1599
1600 for_each_cpu(i, sched_domain_span(sd)) { 1600 for_each_cpu(i, sched_domain_span(sd)) {
1601 weight = tg->cfs_rq[i]->load.weight; 1601 weight = tg->cfs_rq[i]->load.weight;
1602 usd_rq_weight[i] = weight; 1602 usd_rq_weight[i] = weight;
1603 1603
1604 rq_weight += weight; 1604 rq_weight += weight;
1605 /* 1605 /*
1606 * If there are currently no tasks on the cpu pretend there 1606 * If there are currently no tasks on the cpu pretend there
1607 * is one of average load so that when a new task gets to 1607 * is one of average load so that when a new task gets to
1608 * run here it will not get delayed by group starvation. 1608 * run here it will not get delayed by group starvation.
1609 */ 1609 */
1610 if (!weight) 1610 if (!weight)
1611 weight = NICE_0_LOAD; 1611 weight = NICE_0_LOAD;
1612 1612
1613 sum_weight += weight; 1613 sum_weight += weight;
1614 shares += tg->cfs_rq[i]->shares; 1614 shares += tg->cfs_rq[i]->shares;
1615 } 1615 }
1616 1616
1617 if (!rq_weight) 1617 if (!rq_weight)
1618 rq_weight = sum_weight; 1618 rq_weight = sum_weight;
1619 1619
1620 if ((!shares && rq_weight) || shares > tg->shares) 1620 if ((!shares && rq_weight) || shares > tg->shares)
1621 shares = tg->shares; 1621 shares = tg->shares;
1622 1622
1623 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1623 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1624 shares = tg->shares; 1624 shares = tg->shares;
1625 1625
1626 for_each_cpu(i, sched_domain_span(sd)) 1626 for_each_cpu(i, sched_domain_span(sd))
1627 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); 1627 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1628 1628
1629 local_irq_restore(flags); 1629 local_irq_restore(flags);
1630 1630
1631 return 0; 1631 return 0;
1632 } 1632 }
1633 1633
1634 /* 1634 /*
1635 * Compute the cpu's hierarchical load factor for each task group. 1635 * Compute the cpu's hierarchical load factor for each task group.
1636 * This needs to be done in a top-down fashion because the load of a child 1636 * This needs to be done in a top-down fashion because the load of a child
1637 * group is a fraction of its parent's load. 1637 * group is a fraction of its parent's load.
1638 */ 1638 */
1639 static int tg_load_down(struct task_group *tg, void *data) 1639 static int tg_load_down(struct task_group *tg, void *data)
1640 { 1640 {
1641 unsigned long load; 1641 unsigned long load;
1642 long cpu = (long)data; 1642 long cpu = (long)data;
1643 1643
1644 if (!tg->parent) { 1644 if (!tg->parent) {
1645 load = cpu_rq(cpu)->load.weight; 1645 load = cpu_rq(cpu)->load.weight;
1646 } else { 1646 } else {
1647 load = tg->parent->cfs_rq[cpu]->h_load; 1647 load = tg->parent->cfs_rq[cpu]->h_load;
1648 load *= tg->cfs_rq[cpu]->shares; 1648 load *= tg->cfs_rq[cpu]->shares;
1649 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1649 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1650 } 1650 }
1651 1651
1652 tg->cfs_rq[cpu]->h_load = load; 1652 tg->cfs_rq[cpu]->h_load = load;
1653 1653
1654 return 0; 1654 return 0;
1655 } 1655 }
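
tg_load_down() propagates load top-down: a child group's h_load is the parent's h_load scaled by the fraction of the parent cfs_rq weight that the child's entity contributes (the +1 guards against a zero weight). A worked two-level sketch with made-up numbers:

#include <stdio.h>

/* h_load(child) = h_load(parent) * child_shares / parent_cfs_weight */
static unsigned long h_load(unsigned long parent_h_load,
                            unsigned long child_shares,
                            unsigned long parent_cfs_weight)
{
    unsigned long load = parent_h_load * child_shares;

    return load / (parent_cfs_weight + 1);
}

int main(void)
{
    /* Root sees 3072 of runqueue weight; a child holds 1024 shares on a
     * parent cfs_rq of total weight 3072, a grandchild 512 out of 1024. */
    unsigned long root = 3072;
    unsigned long child = h_load(root, 1024, 3072);
    unsigned long grandchild = h_load(child, 512, 1024);

    printf("child %lu, grandchild %lu\n", child, grandchild);
    return 0;
}
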
1656 1656
1657 static void update_shares(struct sched_domain *sd) 1657 static void update_shares(struct sched_domain *sd)
1658 { 1658 {
1659 s64 elapsed; 1659 s64 elapsed;
1660 u64 now; 1660 u64 now;
1661 1661
1662 if (root_task_group_empty()) 1662 if (root_task_group_empty())
1663 return; 1663 return;
1664 1664
1665 now = cpu_clock(raw_smp_processor_id()); 1665 now = cpu_clock(raw_smp_processor_id());
1666 elapsed = now - sd->last_update; 1666 elapsed = now - sd->last_update;
1667 1667
1668 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1668 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1669 sd->last_update = now; 1669 sd->last_update = now;
1670 walk_tg_tree(tg_nop, tg_shares_up, sd); 1670 walk_tg_tree(tg_nop, tg_shares_up, sd);
1671 } 1671 }
1672 } 1672 }
1673 1673
1674 static void update_h_load(long cpu) 1674 static void update_h_load(long cpu)
1675 { 1675 {
1676 if (root_task_group_empty()) 1676 if (root_task_group_empty())
1677 return; 1677 return;
1678 1678
1679 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1679 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1680 } 1680 }
1681 1681
1682 #else 1682 #else
1683 1683
1684 static inline void update_shares(struct sched_domain *sd) 1684 static inline void update_shares(struct sched_domain *sd)
1685 { 1685 {
1686 } 1686 }
1687 1687
1688 #endif 1688 #endif
1689 1689
1690 #ifdef CONFIG_PREEMPT 1690 #ifdef CONFIG_PREEMPT
1691 1691
1692 static void double_rq_lock(struct rq *rq1, struct rq *rq2); 1692 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1693 1693
1694 /* 1694 /*
1695 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1695 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1696 * way at the expense of forcing extra atomic operations in all 1696 * way at the expense of forcing extra atomic operations in all
1697 * invocations. This assures that the double_lock is acquired using the 1697 * invocations. This assures that the double_lock is acquired using the
1698 * same underlying policy as the spinlock_t on this architecture, which 1698 * same underlying policy as the spinlock_t on this architecture, which
1699 * reduces latency compared to the unfair variant below. However, it 1699 * reduces latency compared to the unfair variant below. However, it
1700 * also adds more overhead and therefore may reduce throughput. 1700 * also adds more overhead and therefore may reduce throughput.
1701 */ 1701 */
1702 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1702 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1703 __releases(this_rq->lock) 1703 __releases(this_rq->lock)
1704 __acquires(busiest->lock) 1704 __acquires(busiest->lock)
1705 __acquires(this_rq->lock) 1705 __acquires(this_rq->lock)
1706 { 1706 {
1707 raw_spin_unlock(&this_rq->lock); 1707 raw_spin_unlock(&this_rq->lock);
1708 double_rq_lock(this_rq, busiest); 1708 double_rq_lock(this_rq, busiest);
1709 1709
1710 return 1; 1710 return 1;
1711 } 1711 }
1712 1712
1713 #else 1713 #else
1714 /* 1714 /*
1715 * Unfair double_lock_balance: Optimizes throughput at the expense of 1715 * Unfair double_lock_balance: Optimizes throughput at the expense of
1716 * latency by eliminating extra atomic operations when the locks are 1716 * latency by eliminating extra atomic operations when the locks are
1717 * already in proper order on entry. This favors lower cpu-ids and will 1717 * already in proper order on entry. This favors lower cpu-ids and will
1718 * grant the double lock to lower cpus over higher ids under contention, 1718 * grant the double lock to lower cpus over higher ids under contention,
1719 * regardless of entry order into the function. 1719 * regardless of entry order into the function.
1720 */ 1720 */
1721 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1721 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1722 __releases(this_rq->lock) 1722 __releases(this_rq->lock)
1723 __acquires(busiest->lock) 1723 __acquires(busiest->lock)
1724 __acquires(this_rq->lock) 1724 __acquires(this_rq->lock)
1725 { 1725 {
1726 int ret = 0; 1726 int ret = 0;
1727 1727
1728 if (unlikely(!raw_spin_trylock(&busiest->lock))) { 1728 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1729 if (busiest < this_rq) { 1729 if (busiest < this_rq) {
1730 raw_spin_unlock(&this_rq->lock); 1730 raw_spin_unlock(&this_rq->lock);
1731 raw_spin_lock(&busiest->lock); 1731 raw_spin_lock(&busiest->lock);
1732 raw_spin_lock_nested(&this_rq->lock, 1732 raw_spin_lock_nested(&this_rq->lock,
1733 SINGLE_DEPTH_NESTING); 1733 SINGLE_DEPTH_NESTING);
1734 ret = 1; 1734 ret = 1;
1735 } else 1735 } else
1736 raw_spin_lock_nested(&busiest->lock, 1736 raw_spin_lock_nested(&busiest->lock,
1737 SINGLE_DEPTH_NESTING); 1737 SINGLE_DEPTH_NESTING);
1738 } 1738 }
1739 return ret; 1739 return ret;
1740 } 1740 }
1741 1741
1742 #endif /* CONFIG_PREEMPT */ 1742 #endif /* CONFIG_PREEMPT */
1743 1743
1744 /* 1744 /*
1745 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1745 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1746 */ 1746 */
1747 static int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1747 static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1748 { 1748 {
1749 if (unlikely(!irqs_disabled())) { 1749 if (unlikely(!irqs_disabled())) {
1750 /* printk() doesn't work well under rq->lock */ 1750 /* printk() doesn't work well under rq->lock */
1751 raw_spin_unlock(&this_rq->lock); 1751 raw_spin_unlock(&this_rq->lock);
1752 BUG_ON(1); 1752 BUG_ON(1);
1753 } 1753 }
1754 1754
1755 return _double_lock_balance(this_rq, busiest); 1755 return _double_lock_balance(this_rq, busiest);
1756 } 1756 }
1757 1757
1758 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1758 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1759 __releases(busiest->lock) 1759 __releases(busiest->lock)
1760 { 1760 {
1761 raw_spin_unlock(&busiest->lock); 1761 raw_spin_unlock(&busiest->lock);
1762 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1762 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1763 } 1763 }
1764 1764
1765 /* 1765 /*
1766 * double_rq_lock - safely lock two runqueues 1766 * double_rq_lock - safely lock two runqueues
1767 * 1767 *
1768 * Note this does not disable interrupts like task_rq_lock, 1768 * Note this does not disable interrupts like task_rq_lock,
1769 * you need to do so manually before calling. 1769 * you need to do so manually before calling.
1770 */ 1770 */
1771 static void double_rq_lock(struct rq *rq1, struct rq *rq2) 1771 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1772 __acquires(rq1->lock) 1772 __acquires(rq1->lock)
1773 __acquires(rq2->lock) 1773 __acquires(rq2->lock)
1774 { 1774 {
1775 BUG_ON(!irqs_disabled()); 1775 BUG_ON(!irqs_disabled());
1776 if (rq1 == rq2) { 1776 if (rq1 == rq2) {
1777 raw_spin_lock(&rq1->lock); 1777 raw_spin_lock(&rq1->lock);
1778 __acquire(rq2->lock); /* Fake it out ;) */ 1778 __acquire(rq2->lock); /* Fake it out ;) */
1779 } else { 1779 } else {
1780 if (rq1 < rq2) { 1780 if (rq1 < rq2) {
1781 raw_spin_lock(&rq1->lock); 1781 raw_spin_lock(&rq1->lock);
1782 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); 1782 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1783 } else { 1783 } else {
1784 raw_spin_lock(&rq2->lock); 1784 raw_spin_lock(&rq2->lock);
1785 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 1785 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1786 } 1786 }
1787 } 1787 }
1788 } 1788 }
1789 1789
1790 /* 1790 /*
1791 * double_rq_unlock - safely unlock two runqueues 1791 * double_rq_unlock - safely unlock two runqueues
1792 * 1792 *
1793 * Note this does not restore interrupts like task_rq_unlock, 1793 * Note this does not restore interrupts like task_rq_unlock,
1794 * you need to do so manually after calling. 1794 * you need to do so manually after calling.
1795 */ 1795 */
1796 static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 1796 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1797 __releases(rq1->lock) 1797 __releases(rq1->lock)
1798 __releases(rq2->lock) 1798 __releases(rq2->lock)
1799 { 1799 {
1800 raw_spin_unlock(&rq1->lock); 1800 raw_spin_unlock(&rq1->lock);
1801 if (rq1 != rq2) 1801 if (rq1 != rq2)
1802 raw_spin_unlock(&rq2->lock); 1802 raw_spin_unlock(&rq2->lock);
1803 else 1803 else
1804 __release(rq2->lock); 1804 __release(rq2->lock);
1805 } 1805 }
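
Both the unfair _double_lock_balance() variant and double_rq_lock() rely on the same deadlock-avoidance rule: when two runqueue locks must be held, the one at the lower address is taken first, so two CPUs locking the same pair in opposite order cannot wait on each other. The idea is not scheduler-specific; this is a hedged pthread analogy in userspace, not kernel code.

#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

/* Acquire two mutexes in address order, mirroring the rq1 < rq2 check above. */
static void double_lock(pthread_mutex_t *m1, pthread_mutex_t *m2)
{
    if (m1 == m2) {
        pthread_mutex_lock(m1);
    } else if (m1 < m2) {
        pthread_mutex_lock(m1);
        pthread_mutex_lock(m2);
    } else {
        pthread_mutex_lock(m2);
        pthread_mutex_lock(m1);
    }
}

static void double_unlock(pthread_mutex_t *m1, pthread_mutex_t *m2)
{
    pthread_mutex_unlock(m1);
    if (m1 != m2)
        pthread_mutex_unlock(m2);
}

int main(void)
{
    /* Either argument order ends up acquiring the locks in the same
     * (address) order, so no lock-order inversion is possible. */
    double_lock(&lock_a, &lock_b);
    double_unlock(&lock_a, &lock_b);

    double_lock(&lock_b, &lock_a);
    double_unlock(&lock_b, &lock_a);

    printf("no deadlock\n");
    return 0;
}
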
1806 1806
1807 #endif 1807 #endif
1808 1808
1809 #ifdef CONFIG_FAIR_GROUP_SCHED 1809 #ifdef CONFIG_FAIR_GROUP_SCHED
1810 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1810 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1811 { 1811 {
1812 #ifdef CONFIG_SMP 1812 #ifdef CONFIG_SMP
1813 cfs_rq->shares = shares; 1813 cfs_rq->shares = shares;
1814 #endif 1814 #endif
1815 } 1815 }
1816 #endif 1816 #endif
1817 1817
1818 static void calc_load_account_active(struct rq *this_rq); 1818 static void calc_load_account_idle(struct rq *this_rq);
1819 static void update_sysctl(void); 1819 static void update_sysctl(void);
1820 static int get_update_sysctl_factor(void); 1820 static int get_update_sysctl_factor(void);
1821 1821
1822 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1822 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1823 { 1823 {
1824 set_task_rq(p, cpu); 1824 set_task_rq(p, cpu);
1825 #ifdef CONFIG_SMP 1825 #ifdef CONFIG_SMP
1826 /* 1826 /*
1827 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 1827 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1828 * successfully executed on another CPU. We must ensure that updates of 1828 * successfully executed on another CPU. We must ensure that updates of
1829 * per-task data have been completed by this moment. 1829 * per-task data have been completed by this moment.
1830 */ 1830 */
1831 smp_wmb(); 1831 smp_wmb();
1832 task_thread_info(p)->cpu = cpu; 1832 task_thread_info(p)->cpu = cpu;
1833 #endif 1833 #endif
1834 } 1834 }
1835 1835
1836 static const struct sched_class rt_sched_class; 1836 static const struct sched_class rt_sched_class;
1837 1837
1838 #define sched_class_highest (&rt_sched_class) 1838 #define sched_class_highest (&rt_sched_class)
1839 #define for_each_class(class) \ 1839 #define for_each_class(class) \
1840 for (class = sched_class_highest; class; class = class->next) 1840 for (class = sched_class_highest; class; class = class->next)
1841 1841
1842 #include "sched_stats.h" 1842 #include "sched_stats.h"
1843 1843
1844 static void inc_nr_running(struct rq *rq) 1844 static void inc_nr_running(struct rq *rq)
1845 { 1845 {
1846 rq->nr_running++; 1846 rq->nr_running++;
1847 } 1847 }
1848 1848
1849 static void dec_nr_running(struct rq *rq) 1849 static void dec_nr_running(struct rq *rq)
1850 { 1850 {
1851 rq->nr_running--; 1851 rq->nr_running--;
1852 } 1852 }
1853 1853
1854 static void set_load_weight(struct task_struct *p) 1854 static void set_load_weight(struct task_struct *p)
1855 { 1855 {
1856 if (task_has_rt_policy(p)) { 1856 if (task_has_rt_policy(p)) {
1857 p->se.load.weight = prio_to_weight[0] * 2; 1857 p->se.load.weight = prio_to_weight[0] * 2;
1858 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 1858 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1859 return; 1859 return;
1860 } 1860 }
1861 1861
1862 /* 1862 /*
1863 * SCHED_IDLE tasks get minimal weight: 1863 * SCHED_IDLE tasks get minimal weight:
1864 */ 1864 */
1865 if (p->policy == SCHED_IDLE) { 1865 if (p->policy == SCHED_IDLE) {
1866 p->se.load.weight = WEIGHT_IDLEPRIO; 1866 p->se.load.weight = WEIGHT_IDLEPRIO;
1867 p->se.load.inv_weight = WMULT_IDLEPRIO; 1867 p->se.load.inv_weight = WMULT_IDLEPRIO;
1868 return; 1868 return;
1869 } 1869 }
1870 1870
1871 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 1871 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1872 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1872 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1873 } 1873 }
1874 1874
1875 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 1875 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1876 { 1876 {
1877 update_rq_clock(rq); 1877 update_rq_clock(rq);
1878 sched_info_queued(p); 1878 sched_info_queued(p);
1879 p->sched_class->enqueue_task(rq, p, flags); 1879 p->sched_class->enqueue_task(rq, p, flags);
1880 p->se.on_rq = 1; 1880 p->se.on_rq = 1;
1881 } 1881 }
1882 1882
1883 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1883 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1884 { 1884 {
1885 update_rq_clock(rq); 1885 update_rq_clock(rq);
1886 sched_info_dequeued(p); 1886 sched_info_dequeued(p);
1887 p->sched_class->dequeue_task(rq, p, flags); 1887 p->sched_class->dequeue_task(rq, p, flags);
1888 p->se.on_rq = 0; 1888 p->se.on_rq = 0;
1889 } 1889 }
1890 1890
1891 /* 1891 /*
1892 * activate_task - move a task to the runqueue. 1892 * activate_task - move a task to the runqueue.
1893 */ 1893 */
1894 static void activate_task(struct rq *rq, struct task_struct *p, int flags) 1894 static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1895 { 1895 {
1896 if (task_contributes_to_load(p)) 1896 if (task_contributes_to_load(p))
1897 rq->nr_uninterruptible--; 1897 rq->nr_uninterruptible--;
1898 1898
1899 enqueue_task(rq, p, flags); 1899 enqueue_task(rq, p, flags);
1900 inc_nr_running(rq); 1900 inc_nr_running(rq);
1901 } 1901 }
1902 1902
1903 /* 1903 /*
1904 * deactivate_task - remove a task from the runqueue. 1904 * deactivate_task - remove a task from the runqueue.
1905 */ 1905 */
1906 static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 1906 static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1907 { 1907 {
1908 if (task_contributes_to_load(p)) 1908 if (task_contributes_to_load(p))
1909 rq->nr_uninterruptible++; 1909 rq->nr_uninterruptible++;
1910 1910
1911 dequeue_task(rq, p, flags); 1911 dequeue_task(rq, p, flags);
1912 dec_nr_running(rq); 1912 dec_nr_running(rq);
1913 } 1913 }
1914 1914
1915 #include "sched_idletask.c" 1915 #include "sched_idletask.c"
1916 #include "sched_fair.c" 1916 #include "sched_fair.c"
1917 #include "sched_rt.c" 1917 #include "sched_rt.c"
1918 #ifdef CONFIG_SCHED_DEBUG 1918 #ifdef CONFIG_SCHED_DEBUG
1919 # include "sched_debug.c" 1919 # include "sched_debug.c"
1920 #endif 1920 #endif
1921 1921
1922 /* 1922 /*
1923 * __normal_prio - return the priority that is based on the static prio 1923 * __normal_prio - return the priority that is based on the static prio
1924 */ 1924 */
1925 static inline int __normal_prio(struct task_struct *p) 1925 static inline int __normal_prio(struct task_struct *p)
1926 { 1926 {
1927 return p->static_prio; 1927 return p->static_prio;
1928 } 1928 }
1929 1929
1930 /* 1930 /*
1931 * Calculate the expected normal priority: i.e. priority 1931 * Calculate the expected normal priority: i.e. priority
1932 * without taking RT-inheritance into account. Might be 1932 * without taking RT-inheritance into account. Might be
1933 * boosted by interactivity modifiers. Changes upon fork, 1933 * boosted by interactivity modifiers. Changes upon fork,
1934 * setprio syscalls, and whenever the interactivity 1934 * setprio syscalls, and whenever the interactivity
1935 * estimator recalculates. 1935 * estimator recalculates.
1936 */ 1936 */
1937 static inline int normal_prio(struct task_struct *p) 1937 static inline int normal_prio(struct task_struct *p)
1938 { 1938 {
1939 int prio; 1939 int prio;
1940 1940
1941 if (task_has_rt_policy(p)) 1941 if (task_has_rt_policy(p))
1942 prio = MAX_RT_PRIO-1 - p->rt_priority; 1942 prio = MAX_RT_PRIO-1 - p->rt_priority;
1943 else 1943 else
1944 prio = __normal_prio(p); 1944 prio = __normal_prio(p);
1945 return prio; 1945 return prio;
1946 } 1946 }
1947 1947
1948 /* 1948 /*
1949 * Calculate the current priority, i.e. the priority 1949 * Calculate the current priority, i.e. the priority
1950 * taken into account by the scheduler. This value might 1950 * taken into account by the scheduler. This value might
1951 * be boosted by RT tasks, or might be boosted by 1951 * be boosted by RT tasks, or might be boosted by
1952 * interactivity modifiers. Will be RT if the task got 1952 * interactivity modifiers. Will be RT if the task got
1953 * RT-boosted. If not then it returns p->normal_prio. 1953 * RT-boosted. If not then it returns p->normal_prio.
1954 */ 1954 */
1955 static int effective_prio(struct task_struct *p) 1955 static int effective_prio(struct task_struct *p)
1956 { 1956 {
1957 p->normal_prio = normal_prio(p); 1957 p->normal_prio = normal_prio(p);
1958 /* 1958 /*
1959 * If we are RT tasks or we were boosted to RT priority, 1959 * If we are RT tasks or we were boosted to RT priority,
1960 * keep the priority unchanged. Otherwise, update priority 1960 * keep the priority unchanged. Otherwise, update priority
1961 * to the normal priority: 1961 * to the normal priority:
1962 */ 1962 */
1963 if (!rt_prio(p->prio)) 1963 if (!rt_prio(p->prio))
1964 return p->normal_prio; 1964 return p->normal_prio;
1965 return p->prio; 1965 return p->prio;
1966 } 1966 }
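
normal_prio() folds the two priority spaces into one scale: an RT task with rt_priority R gets prio MAX_RT_PRIO-1 - R (higher rt_priority means a numerically lower, i.e. stronger, prio), while a SCHED_NORMAL task keeps its static_prio; effective_prio() then preserves the current prio if the task has been RT-boosted. A small sketch of the mapping, assuming the usual MAX_RT_PRIO of 100 and the conventional nice-to-prio offset of 120; both are assumptions for the example.

#include <stdio.h>

#define MAX_RT_PRIO_EX 100                              /* assumed */
#define NICE_TO_PRIO_EX(nice) (MAX_RT_PRIO_EX + (nice) + 20)

static int normal_prio_example(int is_rt, int rt_priority, int static_prio)
{
    if (is_rt)
        return MAX_RT_PRIO_EX - 1 - rt_priority;
    return static_prio;
}

int main(void)
{
    /* rt_priority 99 -> prio 0 (strongest), rt_priority 1 -> prio 98. */
    printf("%d %d\n", normal_prio_example(1, 99, 0), normal_prio_example(1, 1, 0));

    /* nice 0 -> prio 120, nice -20 -> 100, nice 19 -> 139. */
    printf("%d %d %d\n",
           normal_prio_example(0, 0, NICE_TO_PRIO_EX(0)),
           normal_prio_example(0, 0, NICE_TO_PRIO_EX(-20)),
           normal_prio_example(0, 0, NICE_TO_PRIO_EX(19)));
    return 0;
}
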
1967 1967
1968 /** 1968 /**
1969 * task_curr - is this task currently executing on a CPU? 1969 * task_curr - is this task currently executing on a CPU?
1970 * @p: the task in question. 1970 * @p: the task in question.
1971 */ 1971 */
1972 inline int task_curr(const struct task_struct *p) 1972 inline int task_curr(const struct task_struct *p)
1973 { 1973 {
1974 return cpu_curr(task_cpu(p)) == p; 1974 return cpu_curr(task_cpu(p)) == p;
1975 } 1975 }
1976 1976
1977 static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1977 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1978 const struct sched_class *prev_class, 1978 const struct sched_class *prev_class,
1979 int oldprio, int running) 1979 int oldprio, int running)
1980 { 1980 {
1981 if (prev_class != p->sched_class) { 1981 if (prev_class != p->sched_class) {
1982 if (prev_class->switched_from) 1982 if (prev_class->switched_from)
1983 prev_class->switched_from(rq, p, running); 1983 prev_class->switched_from(rq, p, running);
1984 p->sched_class->switched_to(rq, p, running); 1984 p->sched_class->switched_to(rq, p, running);
1985 } else 1985 } else
1986 p->sched_class->prio_changed(rq, p, oldprio, running); 1986 p->sched_class->prio_changed(rq, p, oldprio, running);
1987 } 1987 }
1988 1988
1989 #ifdef CONFIG_SMP 1989 #ifdef CONFIG_SMP
1990 /* 1990 /*
1991 * Is this task likely cache-hot: 1991 * Is this task likely cache-hot:
1992 */ 1992 */
1993 static int 1993 static int
1994 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 1994 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1995 { 1995 {
1996 s64 delta; 1996 s64 delta;
1997 1997
1998 if (p->sched_class != &fair_sched_class) 1998 if (p->sched_class != &fair_sched_class)
1999 return 0; 1999 return 0;
2000 2000
2001 /* 2001 /*
2002 * Buddy candidates are cache hot: 2002 * Buddy candidates are cache hot:
2003 */ 2003 */
2004 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && 2004 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2005 (&p->se == cfs_rq_of(&p->se)->next || 2005 (&p->se == cfs_rq_of(&p->se)->next ||
2006 &p->se == cfs_rq_of(&p->se)->last)) 2006 &p->se == cfs_rq_of(&p->se)->last))
2007 return 1; 2007 return 1;
2008 2008
2009 if (sysctl_sched_migration_cost == -1) 2009 if (sysctl_sched_migration_cost == -1)
2010 return 1; 2010 return 1;
2011 if (sysctl_sched_migration_cost == 0) 2011 if (sysctl_sched_migration_cost == 0)
2012 return 0; 2012 return 0;
2013 2013
2014 delta = now - p->se.exec_start; 2014 delta = now - p->se.exec_start;
2015 2015
2016 return delta < (s64)sysctl_sched_migration_cost; 2016 return delta < (s64)sysctl_sched_migration_cost;
2017 } 2017 }
2018 2018
2019 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2019 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2020 { 2020 {
2021 #ifdef CONFIG_SCHED_DEBUG 2021 #ifdef CONFIG_SCHED_DEBUG
2022 /* 2022 /*
2023 * We should never call set_task_cpu() on a blocked task, 2023 * We should never call set_task_cpu() on a blocked task,
2024 * ttwu() will sort out the placement. 2024 * ttwu() will sort out the placement.
2025 */ 2025 */
2026 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2026 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2027 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2027 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2028 #endif 2028 #endif
2029 2029
2030 trace_sched_migrate_task(p, new_cpu); 2030 trace_sched_migrate_task(p, new_cpu);
2031 2031
2032 if (task_cpu(p) != new_cpu) { 2032 if (task_cpu(p) != new_cpu) {
2033 p->se.nr_migrations++; 2033 p->se.nr_migrations++;
2034 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); 2034 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2035 } 2035 }
2036 2036
2037 __set_task_cpu(p, new_cpu); 2037 __set_task_cpu(p, new_cpu);
2038 } 2038 }
2039 2039
2040 struct migration_req { 2040 struct migration_req {
2041 struct list_head list; 2041 struct list_head list;
2042 2042
2043 struct task_struct *task; 2043 struct task_struct *task;
2044 int dest_cpu; 2044 int dest_cpu;
2045 2045
2046 struct completion done; 2046 struct completion done;
2047 }; 2047 };
2048 2048
2049 /* 2049 /*
2050 * The task's runqueue lock must be held. 2050 * The task's runqueue lock must be held.
2051 * Returns true if you have to wait for migration thread. 2051 * Returns true if you have to wait for migration thread.
2052 */ 2052 */
2053 static int 2053 static int
2054 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) 2054 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2055 { 2055 {
2056 struct rq *rq = task_rq(p); 2056 struct rq *rq = task_rq(p);
2057 2057
2058 /* 2058 /*
2059 * If the task is not on a runqueue (and not running), then 2059 * If the task is not on a runqueue (and not running), then
2060 * the next wake-up will properly place the task. 2060 * the next wake-up will properly place the task.
2061 */ 2061 */
2062 if (!p->se.on_rq && !task_running(rq, p)) 2062 if (!p->se.on_rq && !task_running(rq, p))
2063 return 0; 2063 return 0;
2064 2064
2065 init_completion(&req->done); 2065 init_completion(&req->done);
2066 req->task = p; 2066 req->task = p;
2067 req->dest_cpu = dest_cpu; 2067 req->dest_cpu = dest_cpu;
2068 list_add(&req->list, &rq->migration_queue); 2068 list_add(&req->list, &rq->migration_queue);
2069 2069
2070 return 1; 2070 return 1;
2071 } 2071 }
2072 2072
2073 /* 2073 /*
2074 * wait_task_context_switch - wait for a thread to complete at least one 2074 * wait_task_context_switch - wait for a thread to complete at least one
2075 * context switch. 2075 * context switch.
2076 * 2076 *
2077 * @p must not be current. 2077 * @p must not be current.
2078 */ 2078 */
2079 void wait_task_context_switch(struct task_struct *p) 2079 void wait_task_context_switch(struct task_struct *p)
2080 { 2080 {
2081 unsigned long nvcsw, nivcsw, flags; 2081 unsigned long nvcsw, nivcsw, flags;
2082 int running; 2082 int running;
2083 struct rq *rq; 2083 struct rq *rq;
2084 2084
2085 nvcsw = p->nvcsw; 2085 nvcsw = p->nvcsw;
2086 nivcsw = p->nivcsw; 2086 nivcsw = p->nivcsw;
2087 for (;;) { 2087 for (;;) {
2088 /* 2088 /*
2089 * The runqueue is assigned before the actual context 2089 * The runqueue is assigned before the actual context
2090 * switch. We need to take the runqueue lock. 2090 * switch. We need to take the runqueue lock.
2091 * 2091 *
2092 * We could check initially without the lock but it is 2092 * We could check initially without the lock but it is
2093 * very likely that we need to take the lock in every 2093 * very likely that we need to take the lock in every
2094 * iteration. 2094 * iteration.
2095 */ 2095 */
2096 rq = task_rq_lock(p, &flags); 2096 rq = task_rq_lock(p, &flags);
2097 running = task_running(rq, p); 2097 running = task_running(rq, p);
2098 task_rq_unlock(rq, &flags); 2098 task_rq_unlock(rq, &flags);
2099 2099
2100 if (likely(!running)) 2100 if (likely(!running))
2101 break; 2101 break;
2102 /* 2102 /*
2103 * The switch count is incremented before the actual 2103 * The switch count is incremented before the actual
2104 * context switch. We thus wait for two switches to be 2104 * context switch. We thus wait for two switches to be
2105 * sure at least one completed. 2105 * sure at least one completed.
2106 */ 2106 */
2107 if ((p->nvcsw - nvcsw) > 1) 2107 if ((p->nvcsw - nvcsw) > 1)
2108 break; 2108 break;
2109 if ((p->nivcsw - nivcsw) > 1) 2109 if ((p->nivcsw - nivcsw) > 1)
2110 break; 2110 break;
2111 2111
2112 cpu_relax(); 2112 cpu_relax();
2113 } 2113 }
2114 } 2114 }
2115 2115
2116 /* 2116 /*
2117 * wait_task_inactive - wait for a thread to unschedule. 2117 * wait_task_inactive - wait for a thread to unschedule.
2118 * 2118 *
2119 * If @match_state is nonzero, it's the @p->state value just checked and 2119 * If @match_state is nonzero, it's the @p->state value just checked and
2120 * not expected to change. If it changes, i.e. @p might have woken up, 2120 * not expected to change. If it changes, i.e. @p might have woken up,
2121 * then return zero. When we succeed in waiting for @p to be off its CPU, 2121 * then return zero. When we succeed in waiting for @p to be off its CPU,
2122 * we return a positive number (its total switch count). If a second call 2122 * we return a positive number (its total switch count). If a second call
2123 * a short while later returns the same number, the caller can be sure that 2123 * a short while later returns the same number, the caller can be sure that
2124 * @p has remained unscheduled the whole time. 2124 * @p has remained unscheduled the whole time.
2125 * 2125 *
2126 * The caller must ensure that the task *will* unschedule sometime soon, 2126 * The caller must ensure that the task *will* unschedule sometime soon,
2127 * else this function might spin for a *long* time. This function can't 2127 * else this function might spin for a *long* time. This function can't
2128 * be called with interrupts off, or it may introduce deadlock with 2128 * be called with interrupts off, or it may introduce deadlock with
2129 * smp_call_function() if an IPI is sent by the same process we are 2129 * smp_call_function() if an IPI is sent by the same process we are
2130 * waiting to become inactive. 2130 * waiting to become inactive.
2131 */ 2131 */
2132 unsigned long wait_task_inactive(struct task_struct *p, long match_state) 2132 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2133 { 2133 {
2134 unsigned long flags; 2134 unsigned long flags;
2135 int running, on_rq; 2135 int running, on_rq;
2136 unsigned long ncsw; 2136 unsigned long ncsw;
2137 struct rq *rq; 2137 struct rq *rq;
2138 2138
2139 for (;;) { 2139 for (;;) {
2140 /* 2140 /*
2141 * We do the initial early heuristics without holding 2141 * We do the initial early heuristics without holding
2142 * any task-queue locks at all. We'll only try to get 2142 * any task-queue locks at all. We'll only try to get
2143 * the runqueue lock when things look like they will 2143 * the runqueue lock when things look like they will
2144 * work out! 2144 * work out!
2145 */ 2145 */
2146 rq = task_rq(p); 2146 rq = task_rq(p);
2147 2147
2148 /* 2148 /*
2149 * If the task is actively running on another CPU 2149 * If the task is actively running on another CPU
2150 * still, just relax and busy-wait without holding 2150 * still, just relax and busy-wait without holding
2151 * any locks. 2151 * any locks.
2152 * 2152 *
2153 * NOTE! Since we don't hold any locks, it's not 2153 * NOTE! Since we don't hold any locks, it's not
2154 * even sure that "rq" stays as the right runqueue! 2154 * even sure that "rq" stays as the right runqueue!
2155 * But we don't care, since "task_running()" will 2155 * But we don't care, since "task_running()" will
2156 * return false if the runqueue has changed and p 2156 * return false if the runqueue has changed and p
2157 * is actually now running somewhere else! 2157 * is actually now running somewhere else!
2158 */ 2158 */
2159 while (task_running(rq, p)) { 2159 while (task_running(rq, p)) {
2160 if (match_state && unlikely(p->state != match_state)) 2160 if (match_state && unlikely(p->state != match_state))
2161 return 0; 2161 return 0;
2162 cpu_relax(); 2162 cpu_relax();
2163 } 2163 }
2164 2164
2165 /* 2165 /*
2166 * Ok, time to look more closely! We need the rq 2166 * Ok, time to look more closely! We need the rq
2167 * lock now, to be *sure*. If we're wrong, we'll 2167 * lock now, to be *sure*. If we're wrong, we'll
2168 * just go back and repeat. 2168 * just go back and repeat.
2169 */ 2169 */
2170 rq = task_rq_lock(p, &flags); 2170 rq = task_rq_lock(p, &flags);
2171 trace_sched_wait_task(rq, p); 2171 trace_sched_wait_task(rq, p);
2172 running = task_running(rq, p); 2172 running = task_running(rq, p);
2173 on_rq = p->se.on_rq; 2173 on_rq = p->se.on_rq;
2174 ncsw = 0; 2174 ncsw = 0;
2175 if (!match_state || p->state == match_state) 2175 if (!match_state || p->state == match_state)
2176 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2176 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2177 task_rq_unlock(rq, &flags); 2177 task_rq_unlock(rq, &flags);
2178 2178
2179 /* 2179 /*
2180 * If it changed from the expected state, bail out now. 2180 * If it changed from the expected state, bail out now.
2181 */ 2181 */
2182 if (unlikely(!ncsw)) 2182 if (unlikely(!ncsw))
2183 break; 2183 break;
2184 2184
2185 /* 2185 /*
2186 * Was it really running after all now that we 2186 * Was it really running after all now that we
2187 * checked with the proper locks actually held? 2187 * checked with the proper locks actually held?
2188 * 2188 *
2189 * Oops. Go back and try again. 2189 * Oops. Go back and try again.
2190 */ 2190 */
2191 if (unlikely(running)) { 2191 if (unlikely(running)) {
2192 cpu_relax(); 2192 cpu_relax();
2193 continue; 2193 continue;
2194 } 2194 }
2195 2195
2196 /* 2196 /*
2197 * It's not enough that it's not actively running, 2197 * It's not enough that it's not actively running,
2198 * it must be off the runqueue _entirely_, and not 2198 * it must be off the runqueue _entirely_, and not
2199 * preempted! 2199 * preempted!
2200 * 2200 *
2201 * So if it was still runnable (but just not actively 2201 * So if it was still runnable (but just not actively
2202 * running right now), it's preempted, and we should 2202 * running right now), it's preempted, and we should
2203 * yield - it could be a while. 2203 * yield - it could be a while.
2204 */ 2204 */
2205 if (unlikely(on_rq)) { 2205 if (unlikely(on_rq)) {
2206 schedule_timeout_uninterruptible(1); 2206 schedule_timeout_uninterruptible(1);
2207 continue; 2207 continue;
2208 } 2208 }
2209 2209
2210 /* 2210 /*
2211 * Ahh, all good. It wasn't running, and it wasn't 2211 * Ahh, all good. It wasn't running, and it wasn't
2212 * runnable, which means that it will never become 2212 * runnable, which means that it will never become
2213 * running in the future either. We're all done! 2213 * running in the future either. We're all done!
2214 */ 2214 */
2215 break; 2215 break;
2216 } 2216 }
2217 2217
2218 return ncsw; 2218 return ncsw;
2219 } 2219 }
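The kerneldoc above implies a call-twice pattern: take the switch count, do whatever work assumes @p stays off its CPU, then call again and compare. A minimal sketch of that usage follows; do_while_task_inactive() and the work callback are hypothetical illustrations, not part of this file.

	static int do_while_task_inactive(struct task_struct *p, long state,
					  void (*work)(struct task_struct *))
	{
		unsigned long ncsw = wait_task_inactive(p, state);

		if (!ncsw)
			return -EAGAIN;	/* state changed; @p may have woken up */

		work(p);		/* assumes @p remains off its CPU */

		if (wait_task_inactive(p, state) != ncsw)
			return -EAGAIN;	/* @p was scheduled in between; retry */

		return 0;
	}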
2220 2220
2221 /*** 2221 /***
2222 * kick_process - kick a running thread to enter/exit the kernel 2222 * kick_process - kick a running thread to enter/exit the kernel
2223 * @p: the to-be-kicked thread 2223 * @p: the to-be-kicked thread
2224 * 2224 *
2225 * Cause a process which is running on another CPU to enter 2225 * Cause a process which is running on another CPU to enter
2226 * kernel-mode, without any delay. (to get signals handled.) 2226 * kernel-mode, without any delay. (to get signals handled.)
2227 * 2227 *
2228 * NOTE: this function doesn't have to take the runqueue lock, 2228 * NOTE: this function doesn't have to take the runqueue lock,
2229 * because all it wants to ensure is that the remote task enters 2229 * because all it wants to ensure is that the remote task enters
2230 * the kernel. If the IPI races and the task has been migrated 2230 * the kernel. If the IPI races and the task has been migrated
2231 * to another CPU then no harm is done and the purpose has been 2231 * to another CPU then no harm is done and the purpose has been
2232 * achieved as well. 2232 * achieved as well.
2233 */ 2233 */
2234 void kick_process(struct task_struct *p) 2234 void kick_process(struct task_struct *p)
2235 { 2235 {
2236 int cpu; 2236 int cpu;
2237 2237
2238 preempt_disable(); 2238 preempt_disable();
2239 cpu = task_cpu(p); 2239 cpu = task_cpu(p);
2240 if ((cpu != smp_processor_id()) && task_curr(p)) 2240 if ((cpu != smp_processor_id()) && task_curr(p))
2241 smp_send_reschedule(cpu); 2241 smp_send_reschedule(cpu);
2242 preempt_enable(); 2242 preempt_enable();
2243 } 2243 }
2244 EXPORT_SYMBOL_GPL(kick_process); 2244 EXPORT_SYMBOL_GPL(kick_process);
2245 #endif /* CONFIG_SMP */ 2245 #endif /* CONFIG_SMP */
2246 2246
2247 /** 2247 /**
2248 * task_oncpu_function_call - call a function on the cpu on which a task runs 2248 * task_oncpu_function_call - call a function on the cpu on which a task runs
2249 * @p: the task to evaluate 2249 * @p: the task to evaluate
2250 * @func: the function to be called 2250 * @func: the function to be called
2251 * @info: the function call argument 2251 * @info: the function call argument
2252 * 2252 *
2253 * Calls the function @func when the task is currently running. This might 2253 * Calls the function @func when the task is currently running. This might
2254 * be on the current CPU, which just calls the function directly 2254 * be on the current CPU, which just calls the function directly
2255 */ 2255 */
2256 void task_oncpu_function_call(struct task_struct *p, 2256 void task_oncpu_function_call(struct task_struct *p,
2257 void (*func) (void *info), void *info) 2257 void (*func) (void *info), void *info)
2258 { 2258 {
2259 int cpu; 2259 int cpu;
2260 2260
2261 preempt_disable(); 2261 preempt_disable();
2262 cpu = task_cpu(p); 2262 cpu = task_cpu(p);
2263 if (task_curr(p)) 2263 if (task_curr(p))
2264 smp_call_function_single(cpu, func, info, 1); 2264 smp_call_function_single(cpu, func, info, 1);
2265 preempt_enable(); 2265 preempt_enable();
2266 } 2266 }
2267 2267
2268 #ifdef CONFIG_SMP 2268 #ifdef CONFIG_SMP
2269 /* 2269 /*
2270 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2270 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
2271 */ 2271 */
2272 static int select_fallback_rq(int cpu, struct task_struct *p) 2272 static int select_fallback_rq(int cpu, struct task_struct *p)
2273 { 2273 {
2274 int dest_cpu; 2274 int dest_cpu;
2275 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 2275 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2276 2276
2277 /* Look for allowed, online CPU in same node. */ 2277 /* Look for allowed, online CPU in same node. */
2278 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 2278 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2279 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 2279 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2280 return dest_cpu; 2280 return dest_cpu;
2281 2281
2282 /* Any allowed, online CPU? */ 2282 /* Any allowed, online CPU? */
2283 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); 2283 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2284 if (dest_cpu < nr_cpu_ids) 2284 if (dest_cpu < nr_cpu_ids)
2285 return dest_cpu; 2285 return dest_cpu;
2286 2286
2287 /* No more Mr. Nice Guy. */ 2287 /* No more Mr. Nice Guy. */
2288 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2288 if (unlikely(dest_cpu >= nr_cpu_ids)) {
2289 dest_cpu = cpuset_cpus_allowed_fallback(p); 2289 dest_cpu = cpuset_cpus_allowed_fallback(p);
2290 /* 2290 /*
2291 * Don't tell them about moving exiting tasks or 2291 * Don't tell them about moving exiting tasks or
2292 * kernel threads (both mm NULL), since they never 2292 * kernel threads (both mm NULL), since they never
2293 * leave kernel. 2293 * leave kernel.
2294 */ 2294 */
2295 if (p->mm && printk_ratelimit()) { 2295 if (p->mm && printk_ratelimit()) {
2296 printk(KERN_INFO "process %d (%s) no " 2296 printk(KERN_INFO "process %d (%s) no "
2297 "longer affine to cpu%d\n", 2297 "longer affine to cpu%d\n",
2298 task_pid_nr(p), p->comm, cpu); 2298 task_pid_nr(p), p->comm, cpu);
2299 } 2299 }
2300 } 2300 }
2301 2301
2302 return dest_cpu; 2302 return dest_cpu;
2303 } 2303 }
2304 2304
2305 /* 2305 /*
2306 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. 2306 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
2307 */ 2307 */
2308 static inline 2308 static inline
2309 int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) 2309 int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
2310 { 2310 {
2311 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); 2311 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
2312 2312
2313 /* 2313 /*
2314 * In order not to call set_task_cpu() on a blocking task we need 2314 * In order not to call set_task_cpu() on a blocking task we need
2315 * to rely on ttwu() to place the task on a valid ->cpus_allowed 2315 * to rely on ttwu() to place the task on a valid ->cpus_allowed
2316 * cpu. 2316 * cpu.
2317 * 2317 *
2318 * Since this is common to all placement strategies, this lives here. 2318 * Since this is common to all placement strategies, this lives here.
2319 * 2319 *
2320 * [ this allows ->select_task_rq() to simply return task_cpu(p) and 2320 * [ this allows ->select_task_rq() to simply return task_cpu(p) and
2321 * not worry about this generic constraint ] 2321 * not worry about this generic constraint ]
2322 */ 2322 */
2323 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || 2323 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2324 !cpu_online(cpu))) 2324 !cpu_online(cpu)))
2325 cpu = select_fallback_rq(task_cpu(p), p); 2325 cpu = select_fallback_rq(task_cpu(p), p);
2326 2326
2327 return cpu; 2327 return cpu;
2328 } 2328 }
2329 2329
2330 static void update_avg(u64 *avg, u64 sample) 2330 static void update_avg(u64 *avg, u64 sample)
2331 { 2331 {
2332 s64 diff = sample - *avg; 2332 s64 diff = sample - *avg;
2333 *avg += diff >> 3; 2333 *avg += diff >> 3;
2334 } 2334 }
2335 #endif 2335 #endif
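update_avg() is a fixed-point exponential moving average: the >> 3 moves the stored value one eighth of the remaining distance toward each new sample, which is how rq->avg_idle tracks recent idle durations further down. A standalone illustration in ordinary user-space C (mirroring the helper above, not kernel code):

	#include <stdio.h>
	#include <stdint.h>

	static void update_avg(uint64_t *avg, uint64_t sample)
	{
		int64_t diff = sample - *avg;

		*avg += diff >> 3;	/* close 1/8 of the gap per sample */
	}

	int main(void)
	{
		uint64_t avg = 0;
		int i;

		for (i = 0; i < 4; i++)
			update_avg(&avg, 800);	/* constant "idle duration" samples */

		printf("%llu\n", (unsigned long long)avg);	/* prints 330, converging toward 800 */
		return 0;
	}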
2336 2336
2337 /*** 2337 /***
2338 * try_to_wake_up - wake up a thread 2338 * try_to_wake_up - wake up a thread
2339 * @p: the to-be-woken-up thread 2339 * @p: the to-be-woken-up thread
2340 * @state: the mask of task states that can be woken 2340 * @state: the mask of task states that can be woken
2341 * @sync: do a synchronous wakeup? 2341 * @sync: do a synchronous wakeup?
2342 * 2342 *
2343 * Put it on the run-queue if it's not already there. The "current" 2343 * Put it on the run-queue if it's not already there. The "current"
2344 * thread is always on the run-queue (except when the actual 2344 * thread is always on the run-queue (except when the actual
2345 * re-schedule is in progress), and as such you're allowed to do 2345 * re-schedule is in progress), and as such you're allowed to do
2346 * the simpler "current->state = TASK_RUNNING" to mark yourself 2346 * the simpler "current->state = TASK_RUNNING" to mark yourself
2347 * runnable without the overhead of this. 2347 * runnable without the overhead of this.
2348 * 2348 *
2349 * returns failure only if the task is already active. 2349 * returns failure only if the task is already active.
2350 */ 2350 */
2351 static int try_to_wake_up(struct task_struct *p, unsigned int state, 2351 static int try_to_wake_up(struct task_struct *p, unsigned int state,
2352 int wake_flags) 2352 int wake_flags)
2353 { 2353 {
2354 int cpu, orig_cpu, this_cpu, success = 0; 2354 int cpu, orig_cpu, this_cpu, success = 0;
2355 unsigned long flags; 2355 unsigned long flags;
2356 unsigned long en_flags = ENQUEUE_WAKEUP; 2356 unsigned long en_flags = ENQUEUE_WAKEUP;
2357 struct rq *rq; 2357 struct rq *rq;
2358 2358
2359 this_cpu = get_cpu(); 2359 this_cpu = get_cpu();
2360 2360
2361 smp_wmb(); 2361 smp_wmb();
2362 rq = task_rq_lock(p, &flags); 2362 rq = task_rq_lock(p, &flags);
2363 if (!(p->state & state)) 2363 if (!(p->state & state))
2364 goto out; 2364 goto out;
2365 2365
2366 if (p->se.on_rq) 2366 if (p->se.on_rq)
2367 goto out_running; 2367 goto out_running;
2368 2368
2369 cpu = task_cpu(p); 2369 cpu = task_cpu(p);
2370 orig_cpu = cpu; 2370 orig_cpu = cpu;
2371 2371
2372 #ifdef CONFIG_SMP 2372 #ifdef CONFIG_SMP
2373 if (unlikely(task_running(rq, p))) 2373 if (unlikely(task_running(rq, p)))
2374 goto out_activate; 2374 goto out_activate;
2375 2375
2376 /* 2376 /*
2377 * In order to handle concurrent wakeups and release the rq->lock 2377 * In order to handle concurrent wakeups and release the rq->lock
2378 * we put the task in TASK_WAKING state. 2378 * we put the task in TASK_WAKING state.
2379 * 2379 *
2380 * First fix up the nr_uninterruptible count: 2380 * First fix up the nr_uninterruptible count:
2381 */ 2381 */
2382 if (task_contributes_to_load(p)) { 2382 if (task_contributes_to_load(p)) {
2383 if (likely(cpu_online(orig_cpu))) 2383 if (likely(cpu_online(orig_cpu)))
2384 rq->nr_uninterruptible--; 2384 rq->nr_uninterruptible--;
2385 else 2385 else
2386 this_rq()->nr_uninterruptible--; 2386 this_rq()->nr_uninterruptible--;
2387 } 2387 }
2388 p->state = TASK_WAKING; 2388 p->state = TASK_WAKING;
2389 2389
2390 if (p->sched_class->task_waking) { 2390 if (p->sched_class->task_waking) {
2391 p->sched_class->task_waking(rq, p); 2391 p->sched_class->task_waking(rq, p);
2392 en_flags |= ENQUEUE_WAKING; 2392 en_flags |= ENQUEUE_WAKING;
2393 } 2393 }
2394 2394
2395 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); 2395 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2396 if (cpu != orig_cpu) 2396 if (cpu != orig_cpu)
2397 set_task_cpu(p, cpu); 2397 set_task_cpu(p, cpu);
2398 __task_rq_unlock(rq); 2398 __task_rq_unlock(rq);
2399 2399
2400 rq = cpu_rq(cpu); 2400 rq = cpu_rq(cpu);
2401 raw_spin_lock(&rq->lock); 2401 raw_spin_lock(&rq->lock);
2402 2402
2403 /* 2403 /*
2404 * We migrated the task without holding either rq->lock, however 2404 * We migrated the task without holding either rq->lock, however
2405 * since the task is not on the task list itself, nobody else 2405 * since the task is not on the task list itself, nobody else
2406 * will try and migrate the task, hence the rq should match the 2406 * will try and migrate the task, hence the rq should match the
2407 * cpu we just moved it to. 2407 * cpu we just moved it to.
2408 */ 2408 */
2409 WARN_ON(task_cpu(p) != cpu); 2409 WARN_ON(task_cpu(p) != cpu);
2410 WARN_ON(p->state != TASK_WAKING); 2410 WARN_ON(p->state != TASK_WAKING);
2411 2411
2412 #ifdef CONFIG_SCHEDSTATS 2412 #ifdef CONFIG_SCHEDSTATS
2413 schedstat_inc(rq, ttwu_count); 2413 schedstat_inc(rq, ttwu_count);
2414 if (cpu == this_cpu) 2414 if (cpu == this_cpu)
2415 schedstat_inc(rq, ttwu_local); 2415 schedstat_inc(rq, ttwu_local);
2416 else { 2416 else {
2417 struct sched_domain *sd; 2417 struct sched_domain *sd;
2418 for_each_domain(this_cpu, sd) { 2418 for_each_domain(this_cpu, sd) {
2419 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 2419 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2420 schedstat_inc(sd, ttwu_wake_remote); 2420 schedstat_inc(sd, ttwu_wake_remote);
2421 break; 2421 break;
2422 } 2422 }
2423 } 2423 }
2424 } 2424 }
2425 #endif /* CONFIG_SCHEDSTATS */ 2425 #endif /* CONFIG_SCHEDSTATS */
2426 2426
2427 out_activate: 2427 out_activate:
2428 #endif /* CONFIG_SMP */ 2428 #endif /* CONFIG_SMP */
2429 schedstat_inc(p, se.statistics.nr_wakeups); 2429 schedstat_inc(p, se.statistics.nr_wakeups);
2430 if (wake_flags & WF_SYNC) 2430 if (wake_flags & WF_SYNC)
2431 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2431 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2432 if (orig_cpu != cpu) 2432 if (orig_cpu != cpu)
2433 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2433 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2434 if (cpu == this_cpu) 2434 if (cpu == this_cpu)
2435 schedstat_inc(p, se.statistics.nr_wakeups_local); 2435 schedstat_inc(p, se.statistics.nr_wakeups_local);
2436 else 2436 else
2437 schedstat_inc(p, se.statistics.nr_wakeups_remote); 2437 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2438 activate_task(rq, p, en_flags); 2438 activate_task(rq, p, en_flags);
2439 success = 1; 2439 success = 1;
2440 2440
2441 out_running: 2441 out_running:
2442 trace_sched_wakeup(rq, p, success); 2442 trace_sched_wakeup(rq, p, success);
2443 check_preempt_curr(rq, p, wake_flags); 2443 check_preempt_curr(rq, p, wake_flags);
2444 2444
2445 p->state = TASK_RUNNING; 2445 p->state = TASK_RUNNING;
2446 #ifdef CONFIG_SMP 2446 #ifdef CONFIG_SMP
2447 if (p->sched_class->task_woken) 2447 if (p->sched_class->task_woken)
2448 p->sched_class->task_woken(rq, p); 2448 p->sched_class->task_woken(rq, p);
2449 2449
2450 if (unlikely(rq->idle_stamp)) { 2450 if (unlikely(rq->idle_stamp)) {
2451 u64 delta = rq->clock - rq->idle_stamp; 2451 u64 delta = rq->clock - rq->idle_stamp;
2452 u64 max = 2*sysctl_sched_migration_cost; 2452 u64 max = 2*sysctl_sched_migration_cost;
2453 2453
2454 if (delta > max) 2454 if (delta > max)
2455 rq->avg_idle = max; 2455 rq->avg_idle = max;
2456 else 2456 else
2457 update_avg(&rq->avg_idle, delta); 2457 update_avg(&rq->avg_idle, delta);
2458 rq->idle_stamp = 0; 2458 rq->idle_stamp = 0;
2459 } 2459 }
2460 #endif 2460 #endif
2461 out: 2461 out:
2462 task_rq_unlock(rq, &flags); 2462 task_rq_unlock(rq, &flags);
2463 put_cpu(); 2463 put_cpu();
2464 2464
2465 return success; 2465 return success;
2466 } 2466 }
2467 2467
2468 /** 2468 /**
2469 * wake_up_process - Wake up a specific process 2469 * wake_up_process - Wake up a specific process
2470 * @p: The process to be woken up. 2470 * @p: The process to be woken up.
2471 * 2471 *
2472 * Attempt to wake up the nominated process and move it to the set of runnable 2472 * Attempt to wake up the nominated process and move it to the set of runnable
2473 * processes. Returns 1 if the process was woken up, 0 if it was already 2473 * processes. Returns 1 if the process was woken up, 0 if it was already
2474 * running. 2474 * running.
2475 * 2475 *
2476 * It may be assumed that this function implies a write memory barrier before 2476 * It may be assumed that this function implies a write memory barrier before
2477 * changing the task state if and only if any tasks are woken up. 2477 * changing the task state if and only if any tasks are woken up.
2478 */ 2478 */
2479 int wake_up_process(struct task_struct *p) 2479 int wake_up_process(struct task_struct *p)
2480 { 2480 {
2481 return try_to_wake_up(p, TASK_ALL, 0); 2481 return try_to_wake_up(p, TASK_ALL, 0);
2482 } 2482 }
2483 EXPORT_SYMBOL(wake_up_process); 2483 EXPORT_SYMBOL(wake_up_process);
2484 2484
2485 int wake_up_state(struct task_struct *p, unsigned int state) 2485 int wake_up_state(struct task_struct *p, unsigned int state)
2486 { 2486 {
2487 return try_to_wake_up(p, state, 0); 2487 return try_to_wake_up(p, state, 0);
2488 } 2488 }
2489 2489
2490 /* 2490 /*
2491 * Perform scheduler related setup for a newly forked process p. 2491 * Perform scheduler related setup for a newly forked process p.
2492 * p is forked by current. 2492 * p is forked by current.
2493 * 2493 *
2494 * __sched_fork() is basic setup used by init_idle() too: 2494 * __sched_fork() is basic setup used by init_idle() too:
2495 */ 2495 */
2496 static void __sched_fork(struct task_struct *p) 2496 static void __sched_fork(struct task_struct *p)
2497 { 2497 {
2498 p->se.exec_start = 0; 2498 p->se.exec_start = 0;
2499 p->se.sum_exec_runtime = 0; 2499 p->se.sum_exec_runtime = 0;
2500 p->se.prev_sum_exec_runtime = 0; 2500 p->se.prev_sum_exec_runtime = 0;
2501 p->se.nr_migrations = 0; 2501 p->se.nr_migrations = 0;
2502 2502
2503 #ifdef CONFIG_SCHEDSTATS 2503 #ifdef CONFIG_SCHEDSTATS
2504 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2504 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2505 #endif 2505 #endif
2506 2506
2507 INIT_LIST_HEAD(&p->rt.run_list); 2507 INIT_LIST_HEAD(&p->rt.run_list);
2508 p->se.on_rq = 0; 2508 p->se.on_rq = 0;
2509 INIT_LIST_HEAD(&p->se.group_node); 2509 INIT_LIST_HEAD(&p->se.group_node);
2510 2510
2511 #ifdef CONFIG_PREEMPT_NOTIFIERS 2511 #ifdef CONFIG_PREEMPT_NOTIFIERS
2512 INIT_HLIST_HEAD(&p->preempt_notifiers); 2512 INIT_HLIST_HEAD(&p->preempt_notifiers);
2513 #endif 2513 #endif
2514 } 2514 }
2515 2515
2516 /* 2516 /*
2517 * fork()/clone()-time setup: 2517 * fork()/clone()-time setup:
2518 */ 2518 */
2519 void sched_fork(struct task_struct *p, int clone_flags) 2519 void sched_fork(struct task_struct *p, int clone_flags)
2520 { 2520 {
2521 int cpu = get_cpu(); 2521 int cpu = get_cpu();
2522 2522
2523 __sched_fork(p); 2523 __sched_fork(p);
2524 /* 2524 /*
2525 * We mark the process as running here. This guarantees that 2525 * We mark the process as running here. This guarantees that
2526 * nobody will actually run it, and a signal or other external 2526 * nobody will actually run it, and a signal or other external
2527 * event cannot wake it up and insert it on the runqueue either. 2527 * event cannot wake it up and insert it on the runqueue either.
2528 */ 2528 */
2529 p->state = TASK_RUNNING; 2529 p->state = TASK_RUNNING;
2530 2530
2531 /* 2531 /*
2532 * Revert to default priority/policy on fork if requested. 2532 * Revert to default priority/policy on fork if requested.
2533 */ 2533 */
2534 if (unlikely(p->sched_reset_on_fork)) { 2534 if (unlikely(p->sched_reset_on_fork)) {
2535 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 2535 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2536 p->policy = SCHED_NORMAL; 2536 p->policy = SCHED_NORMAL;
2537 p->normal_prio = p->static_prio; 2537 p->normal_prio = p->static_prio;
2538 } 2538 }
2539 2539
2540 if (PRIO_TO_NICE(p->static_prio) < 0) { 2540 if (PRIO_TO_NICE(p->static_prio) < 0) {
2541 p->static_prio = NICE_TO_PRIO(0); 2541 p->static_prio = NICE_TO_PRIO(0);
2542 p->normal_prio = p->static_prio; 2542 p->normal_prio = p->static_prio;
2543 set_load_weight(p); 2543 set_load_weight(p);
2544 } 2544 }
2545 2545
2546 /* 2546 /*
2547 * We don't need the reset flag anymore after the fork. It has 2547 * We don't need the reset flag anymore after the fork. It has
2548 * fulfilled its duty: 2548 * fulfilled its duty:
2549 */ 2549 */
2550 p->sched_reset_on_fork = 0; 2550 p->sched_reset_on_fork = 0;
2551 } 2551 }
2552 2552
2553 /* 2553 /*
2554 * Make sure we do not leak PI boosting priority to the child. 2554 * Make sure we do not leak PI boosting priority to the child.
2555 */ 2555 */
2556 p->prio = current->normal_prio; 2556 p->prio = current->normal_prio;
2557 2557
2558 if (!rt_prio(p->prio)) 2558 if (!rt_prio(p->prio))
2559 p->sched_class = &fair_sched_class; 2559 p->sched_class = &fair_sched_class;
2560 2560
2561 if (p->sched_class->task_fork) 2561 if (p->sched_class->task_fork)
2562 p->sched_class->task_fork(p); 2562 p->sched_class->task_fork(p);
2563 2563
2564 set_task_cpu(p, cpu); 2564 set_task_cpu(p, cpu);
2565 2565
2566 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2566 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2567 if (likely(sched_info_on())) 2567 if (likely(sched_info_on()))
2568 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2568 memset(&p->sched_info, 0, sizeof(p->sched_info));
2569 #endif 2569 #endif
2570 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2570 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2571 p->oncpu = 0; 2571 p->oncpu = 0;
2572 #endif 2572 #endif
2573 #ifdef CONFIG_PREEMPT 2573 #ifdef CONFIG_PREEMPT
2574 /* Want to start with kernel preemption disabled. */ 2574 /* Want to start with kernel preemption disabled. */
2575 task_thread_info(p)->preempt_count = 1; 2575 task_thread_info(p)->preempt_count = 1;
2576 #endif 2576 #endif
2577 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2577 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2578 2578
2579 put_cpu(); 2579 put_cpu();
2580 } 2580 }
2581 2581
2582 /* 2582 /*
2583 * wake_up_new_task - wake up a newly created task for the first time. 2583 * wake_up_new_task - wake up a newly created task for the first time.
2584 * 2584 *
2585 * This function will do some initial scheduler statistics housekeeping 2585 * This function will do some initial scheduler statistics housekeeping
2586 * that must be done for every newly created context, then puts the task 2586 * that must be done for every newly created context, then puts the task
2587 * on the runqueue and wakes it. 2587 * on the runqueue and wakes it.
2588 */ 2588 */
2589 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2589 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2590 { 2590 {
2591 unsigned long flags; 2591 unsigned long flags;
2592 struct rq *rq; 2592 struct rq *rq;
2593 int cpu __maybe_unused = get_cpu(); 2593 int cpu __maybe_unused = get_cpu();
2594 2594
2595 #ifdef CONFIG_SMP 2595 #ifdef CONFIG_SMP
2596 rq = task_rq_lock(p, &flags); 2596 rq = task_rq_lock(p, &flags);
2597 p->state = TASK_WAKING; 2597 p->state = TASK_WAKING;
2598 2598
2599 /* 2599 /*
2600 * Fork balancing, do it here and not earlier because: 2600 * Fork balancing, do it here and not earlier because:
2601 * - cpus_allowed can change in the fork path 2601 * - cpus_allowed can change in the fork path
2602 * - any previously selected cpu might disappear through hotplug 2602 * - any previously selected cpu might disappear through hotplug
2603 * 2603 *
2604 * We set TASK_WAKING so that select_task_rq() can drop rq->lock 2604 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2605 * without people poking at ->cpus_allowed. 2605 * without people poking at ->cpus_allowed.
2606 */ 2606 */
2607 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); 2607 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
2608 set_task_cpu(p, cpu); 2608 set_task_cpu(p, cpu);
2609 2609
2610 p->state = TASK_RUNNING; 2610 p->state = TASK_RUNNING;
2611 task_rq_unlock(rq, &flags); 2611 task_rq_unlock(rq, &flags);
2612 #endif 2612 #endif
2613 2613
2614 rq = task_rq_lock(p, &flags); 2614 rq = task_rq_lock(p, &flags);
2615 activate_task(rq, p, 0); 2615 activate_task(rq, p, 0);
2616 trace_sched_wakeup_new(rq, p, 1); 2616 trace_sched_wakeup_new(rq, p, 1);
2617 check_preempt_curr(rq, p, WF_FORK); 2617 check_preempt_curr(rq, p, WF_FORK);
2618 #ifdef CONFIG_SMP 2618 #ifdef CONFIG_SMP
2619 if (p->sched_class->task_woken) 2619 if (p->sched_class->task_woken)
2620 p->sched_class->task_woken(rq, p); 2620 p->sched_class->task_woken(rq, p);
2621 #endif 2621 #endif
2622 task_rq_unlock(rq, &flags); 2622 task_rq_unlock(rq, &flags);
2623 put_cpu(); 2623 put_cpu();
2624 } 2624 }
2625 2625
2626 #ifdef CONFIG_PREEMPT_NOTIFIERS 2626 #ifdef CONFIG_PREEMPT_NOTIFIERS
2627 2627
2628 /** 2628 /**
2629 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2629 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2630 * @notifier: notifier struct to register 2630 * @notifier: notifier struct to register
2631 */ 2631 */
2632 void preempt_notifier_register(struct preempt_notifier *notifier) 2632 void preempt_notifier_register(struct preempt_notifier *notifier)
2633 { 2633 {
2634 hlist_add_head(&notifier->link, &current->preempt_notifiers); 2634 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2635 } 2635 }
2636 EXPORT_SYMBOL_GPL(preempt_notifier_register); 2636 EXPORT_SYMBOL_GPL(preempt_notifier_register);
2637 2637
2638 /** 2638 /**
2639 * preempt_notifier_unregister - no longer interested in preemption notifications 2639 * preempt_notifier_unregister - no longer interested in preemption notifications
2640 * @notifier: notifier struct to unregister 2640 * @notifier: notifier struct to unregister
2641 * 2641 *
2642 * This is safe to call from within a preemption notifier. 2642 * This is safe to call from within a preemption notifier.
2643 */ 2643 */
2644 void preempt_notifier_unregister(struct preempt_notifier *notifier) 2644 void preempt_notifier_unregister(struct preempt_notifier *notifier)
2645 { 2645 {
2646 hlist_del(&notifier->link); 2646 hlist_del(&notifier->link);
2647 } 2647 }
2648 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2648 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2649 2649
2650 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2650 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2651 { 2651 {
2652 struct preempt_notifier *notifier; 2652 struct preempt_notifier *notifier;
2653 struct hlist_node *node; 2653 struct hlist_node *node;
2654 2654
2655 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2655 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2656 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2656 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2657 } 2657 }
2658 2658
2659 static void 2659 static void
2660 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2660 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2661 struct task_struct *next) 2661 struct task_struct *next)
2662 { 2662 {
2663 struct preempt_notifier *notifier; 2663 struct preempt_notifier *notifier;
2664 struct hlist_node *node; 2664 struct hlist_node *node;
2665 2665
2666 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2666 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2667 notifier->ops->sched_out(notifier, next); 2667 notifier->ops->sched_out(notifier, next);
2668 } 2668 }
2669 2669
2670 #else /* !CONFIG_PREEMPT_NOTIFIERS */ 2670 #else /* !CONFIG_PREEMPT_NOTIFIERS */
2671 2671
2672 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2672 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2673 { 2673 {
2674 } 2674 }
2675 2675
2676 static void 2676 static void
2677 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2677 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2678 struct task_struct *next) 2678 struct task_struct *next)
2679 { 2679 {
2680 } 2680 }
2681 2681
2682 #endif /* CONFIG_PREEMPT_NOTIFIERS */ 2682 #endif /* CONFIG_PREEMPT_NOTIFIERS */
2683 2683
2684 /** 2684 /**
2685 * prepare_task_switch - prepare to switch tasks 2685 * prepare_task_switch - prepare to switch tasks
2686 * @rq: the runqueue preparing to switch 2686 * @rq: the runqueue preparing to switch
2687 * @prev: the current task that is being switched out 2687 * @prev: the current task that is being switched out
2688 * @next: the task we are going to switch to. 2688 * @next: the task we are going to switch to.
2689 * 2689 *
2690 * This is called with the rq lock held and interrupts off. It must 2690 * This is called with the rq lock held and interrupts off. It must
2691 * be paired with a subsequent finish_task_switch after the context 2691 * be paired with a subsequent finish_task_switch after the context
2692 * switch. 2692 * switch.
2693 * 2693 *
2694 * prepare_task_switch sets up locking and calls architecture specific 2694 * prepare_task_switch sets up locking and calls architecture specific
2695 * hooks. 2695 * hooks.
2696 */ 2696 */
2697 static inline void 2697 static inline void
2698 prepare_task_switch(struct rq *rq, struct task_struct *prev, 2698 prepare_task_switch(struct rq *rq, struct task_struct *prev,
2699 struct task_struct *next) 2699 struct task_struct *next)
2700 { 2700 {
2701 fire_sched_out_preempt_notifiers(prev, next); 2701 fire_sched_out_preempt_notifiers(prev, next);
2702 prepare_lock_switch(rq, next); 2702 prepare_lock_switch(rq, next);
2703 prepare_arch_switch(next); 2703 prepare_arch_switch(next);
2704 } 2704 }
2705 2705
2706 /** 2706 /**
2707 * finish_task_switch - clean up after a task-switch 2707 * finish_task_switch - clean up after a task-switch
2708 * @rq: runqueue associated with task-switch 2708 * @rq: runqueue associated with task-switch
2709 * @prev: the thread we just switched away from. 2709 * @prev: the thread we just switched away from.
2710 * 2710 *
2711 * finish_task_switch must be called after the context switch, paired 2711 * finish_task_switch must be called after the context switch, paired
2712 * with a prepare_task_switch call before the context switch. 2712 * with a prepare_task_switch call before the context switch.
2713 * finish_task_switch will reconcile locking set up by prepare_task_switch, 2713 * finish_task_switch will reconcile locking set up by prepare_task_switch,
2714 * and do any other architecture-specific cleanup actions. 2714 * and do any other architecture-specific cleanup actions.
2715 * 2715 *
2716 * Note that we may have delayed dropping an mm in context_switch(). If 2716 * Note that we may have delayed dropping an mm in context_switch(). If
2717 * so, we finish that here outside of the runqueue lock. (Doing it 2717 * so, we finish that here outside of the runqueue lock. (Doing it
2718 * with the lock held can cause deadlocks; see schedule() for 2718 * with the lock held can cause deadlocks; see schedule() for
2719 * details.) 2719 * details.)
2720 */ 2720 */
2721 static void finish_task_switch(struct rq *rq, struct task_struct *prev) 2721 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2722 __releases(rq->lock) 2722 __releases(rq->lock)
2723 { 2723 {
2724 struct mm_struct *mm = rq->prev_mm; 2724 struct mm_struct *mm = rq->prev_mm;
2725 long prev_state; 2725 long prev_state;
2726 2726
2727 rq->prev_mm = NULL; 2727 rq->prev_mm = NULL;
2728 2728
2729 /* 2729 /*
2730 * A task struct has one reference for the use as "current". 2730 * A task struct has one reference for the use as "current".
2731 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 2731 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2732 * schedule one last time. The schedule call will never return, and 2732 * schedule one last time. The schedule call will never return, and
2733 * the scheduled task must drop that reference. 2733 * the scheduled task must drop that reference.
2734 * The test for TASK_DEAD must occur while the runqueue locks are 2734 * The test for TASK_DEAD must occur while the runqueue locks are
2735 * still held, otherwise prev could be scheduled on another cpu, die 2735 * still held, otherwise prev could be scheduled on another cpu, die
2736 * there before we look at prev->state, and then the reference would 2736 * there before we look at prev->state, and then the reference would
2737 * be dropped twice. 2737 * be dropped twice.
2738 * Manfred Spraul <manfred@colorfullife.com> 2738 * Manfred Spraul <manfred@colorfullife.com>
2739 */ 2739 */
2740 prev_state = prev->state; 2740 prev_state = prev->state;
2741 finish_arch_switch(prev); 2741 finish_arch_switch(prev);
2742 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2742 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2743 local_irq_disable(); 2743 local_irq_disable();
2744 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 2744 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2745 perf_event_task_sched_in(current); 2745 perf_event_task_sched_in(current);
2746 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2746 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2747 local_irq_enable(); 2747 local_irq_enable();
2748 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 2748 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2749 finish_lock_switch(rq, prev); 2749 finish_lock_switch(rq, prev);
2750 2750
2751 fire_sched_in_preempt_notifiers(current); 2751 fire_sched_in_preempt_notifiers(current);
2752 if (mm) 2752 if (mm)
2753 mmdrop(mm); 2753 mmdrop(mm);
2754 if (unlikely(prev_state == TASK_DEAD)) { 2754 if (unlikely(prev_state == TASK_DEAD)) {
2755 /* 2755 /*
2756 * Remove function-return probe instances associated with this 2756 * Remove function-return probe instances associated with this
2757 * task and put them back on the free list. 2757 * task and put them back on the free list.
2758 */ 2758 */
2759 kprobe_flush_task(prev); 2759 kprobe_flush_task(prev);
2760 put_task_struct(prev); 2760 put_task_struct(prev);
2761 } 2761 }
2762 } 2762 }
2763 2763
2764 #ifdef CONFIG_SMP 2764 #ifdef CONFIG_SMP
2765 2765
2766 /* assumes rq->lock is held */ 2766 /* assumes rq->lock is held */
2767 static inline void pre_schedule(struct rq *rq, struct task_struct *prev) 2767 static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2768 { 2768 {
2769 if (prev->sched_class->pre_schedule) 2769 if (prev->sched_class->pre_schedule)
2770 prev->sched_class->pre_schedule(rq, prev); 2770 prev->sched_class->pre_schedule(rq, prev);
2771 } 2771 }
2772 2772
2773 /* rq->lock is NOT held, but preemption is disabled */ 2773 /* rq->lock is NOT held, but preemption is disabled */
2774 static inline void post_schedule(struct rq *rq) 2774 static inline void post_schedule(struct rq *rq)
2775 { 2775 {
2776 if (rq->post_schedule) { 2776 if (rq->post_schedule) {
2777 unsigned long flags; 2777 unsigned long flags;
2778 2778
2779 raw_spin_lock_irqsave(&rq->lock, flags); 2779 raw_spin_lock_irqsave(&rq->lock, flags);
2780 if (rq->curr->sched_class->post_schedule) 2780 if (rq->curr->sched_class->post_schedule)
2781 rq->curr->sched_class->post_schedule(rq); 2781 rq->curr->sched_class->post_schedule(rq);
2782 raw_spin_unlock_irqrestore(&rq->lock, flags); 2782 raw_spin_unlock_irqrestore(&rq->lock, flags);
2783 2783
2784 rq->post_schedule = 0; 2784 rq->post_schedule = 0;
2785 } 2785 }
2786 } 2786 }
2787 2787
2788 #else 2788 #else
2789 2789
2790 static inline void pre_schedule(struct rq *rq, struct task_struct *p) 2790 static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2791 { 2791 {
2792 } 2792 }
2793 2793
2794 static inline void post_schedule(struct rq *rq) 2794 static inline void post_schedule(struct rq *rq)
2795 { 2795 {
2796 } 2796 }
2797 2797
2798 #endif 2798 #endif
2799 2799
2800 /** 2800 /**
2801 * schedule_tail - first thing a freshly forked thread must call. 2801 * schedule_tail - first thing a freshly forked thread must call.
2802 * @prev: the thread we just switched away from. 2802 * @prev: the thread we just switched away from.
2803 */ 2803 */
2804 asmlinkage void schedule_tail(struct task_struct *prev) 2804 asmlinkage void schedule_tail(struct task_struct *prev)
2805 __releases(rq->lock) 2805 __releases(rq->lock)
2806 { 2806 {
2807 struct rq *rq = this_rq(); 2807 struct rq *rq = this_rq();
2808 2808
2809 finish_task_switch(rq, prev); 2809 finish_task_switch(rq, prev);
2810 2810
2811 /* 2811 /*
2812 * FIXME: do we need to worry about rq being invalidated by the 2812 * FIXME: do we need to worry about rq being invalidated by the
2813 * task_switch? 2813 * task_switch?
2814 */ 2814 */
2815 post_schedule(rq); 2815 post_schedule(rq);
2816 2816
2817 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 2817 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
2818 /* In this case, finish_task_switch does not reenable preemption */ 2818 /* In this case, finish_task_switch does not reenable preemption */
2819 preempt_enable(); 2819 preempt_enable();
2820 #endif 2820 #endif
2821 if (current->set_child_tid) 2821 if (current->set_child_tid)
2822 put_user(task_pid_vnr(current), current->set_child_tid); 2822 put_user(task_pid_vnr(current), current->set_child_tid);
2823 } 2823 }
2824 2824
2825 /* 2825 /*
2826 * context_switch - switch to the new MM and the new 2826 * context_switch - switch to the new MM and the new
2827 * thread's register state. 2827 * thread's register state.
2828 */ 2828 */
2829 static inline void 2829 static inline void
2830 context_switch(struct rq *rq, struct task_struct *prev, 2830 context_switch(struct rq *rq, struct task_struct *prev,
2831 struct task_struct *next) 2831 struct task_struct *next)
2832 { 2832 {
2833 struct mm_struct *mm, *oldmm; 2833 struct mm_struct *mm, *oldmm;
2834 2834
2835 prepare_task_switch(rq, prev, next); 2835 prepare_task_switch(rq, prev, next);
2836 trace_sched_switch(rq, prev, next); 2836 trace_sched_switch(rq, prev, next);
2837 mm = next->mm; 2837 mm = next->mm;
2838 oldmm = prev->active_mm; 2838 oldmm = prev->active_mm;
2839 /* 2839 /*
2840 * For paravirt, this is coupled with an exit in switch_to to 2840 * For paravirt, this is coupled with an exit in switch_to to
2841 * combine the page table reload and the switch backend into 2841 * combine the page table reload and the switch backend into
2842 * one hypercall. 2842 * one hypercall.
2843 */ 2843 */
2844 arch_start_context_switch(prev); 2844 arch_start_context_switch(prev);
2845 2845
2846 if (likely(!mm)) { 2846 if (likely(!mm)) {
2847 next->active_mm = oldmm; 2847 next->active_mm = oldmm;
2848 atomic_inc(&oldmm->mm_count); 2848 atomic_inc(&oldmm->mm_count);
2849 enter_lazy_tlb(oldmm, next); 2849 enter_lazy_tlb(oldmm, next);
2850 } else 2850 } else
2851 switch_mm(oldmm, mm, next); 2851 switch_mm(oldmm, mm, next);
2852 2852
2853 if (likely(!prev->mm)) { 2853 if (likely(!prev->mm)) {
2854 prev->active_mm = NULL; 2854 prev->active_mm = NULL;
2855 rq->prev_mm = oldmm; 2855 rq->prev_mm = oldmm;
2856 } 2856 }
2857 /* 2857 /*
2858 * Since the runqueue lock will be released by the next 2858 * Since the runqueue lock will be released by the next
2859 * task (which is an invalid locking op but in the case 2859 * task (which is an invalid locking op but in the case
2860 * of the scheduler it's an obvious special-case), so we 2860 * of the scheduler it's an obvious special-case), so we
2861 * do an early lockdep release here: 2861 * do an early lockdep release here:
2862 */ 2862 */
2863 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 2863 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
2864 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2864 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2865 #endif 2865 #endif
2866 2866
2867 /* Here we just switch the register state and the stack. */ 2867 /* Here we just switch the register state and the stack. */
2868 switch_to(prev, next, prev); 2868 switch_to(prev, next, prev);
2869 2869
2870 barrier(); 2870 barrier();
2871 /* 2871 /*
2872 * this_rq must be evaluated again because prev may have moved 2872 * this_rq must be evaluated again because prev may have moved
2873 * CPUs since it called schedule(), thus the 'rq' on its stack 2873 * CPUs since it called schedule(), thus the 'rq' on its stack
2874 * frame will be invalid. 2874 * frame will be invalid.
2875 */ 2875 */
2876 finish_task_switch(this_rq(), prev); 2876 finish_task_switch(this_rq(), prev);
2877 } 2877 }
2878 2878
2879 /* 2879 /*
2880 * nr_running, nr_uninterruptible and nr_context_switches: 2880 * nr_running, nr_uninterruptible and nr_context_switches:
2881 * 2881 *
2882 * externally visible scheduler statistics: current number of runnable 2882 * externally visible scheduler statistics: current number of runnable
2883 * threads, current number of uninterruptible-sleeping threads, total 2883 * threads, current number of uninterruptible-sleeping threads, total
2884 * number of context switches performed since bootup. 2884 * number of context switches performed since bootup.
2885 */ 2885 */
2886 unsigned long nr_running(void) 2886 unsigned long nr_running(void)
2887 { 2887 {
2888 unsigned long i, sum = 0; 2888 unsigned long i, sum = 0;
2889 2889
2890 for_each_online_cpu(i) 2890 for_each_online_cpu(i)
2891 sum += cpu_rq(i)->nr_running; 2891 sum += cpu_rq(i)->nr_running;
2892 2892
2893 return sum; 2893 return sum;
2894 } 2894 }
2895 2895
2896 unsigned long nr_uninterruptible(void) 2896 unsigned long nr_uninterruptible(void)
2897 { 2897 {
2898 unsigned long i, sum = 0; 2898 unsigned long i, sum = 0;
2899 2899
2900 for_each_possible_cpu(i) 2900 for_each_possible_cpu(i)
2901 sum += cpu_rq(i)->nr_uninterruptible; 2901 sum += cpu_rq(i)->nr_uninterruptible;
2902 2902
2903 /* 2903 /*
2904 * Since we read the counters lockless, it might be slightly 2904 * Since we read the counters lockless, it might be slightly
2905 * inaccurate. Do not allow it to go below zero though: 2905 * inaccurate. Do not allow it to go below zero though:
2906 */ 2906 */
2907 if (unlikely((long)sum < 0)) 2907 if (unlikely((long)sum < 0))
2908 sum = 0; 2908 sum = 0;
2909 2909
2910 return sum; 2910 return sum;
2911 } 2911 }
2912 2912
2913 unsigned long long nr_context_switches(void) 2913 unsigned long long nr_context_switches(void)
2914 { 2914 {
2915 int i; 2915 int i;
2916 unsigned long long sum = 0; 2916 unsigned long long sum = 0;
2917 2917
2918 for_each_possible_cpu(i) 2918 for_each_possible_cpu(i)
2919 sum += cpu_rq(i)->nr_switches; 2919 sum += cpu_rq(i)->nr_switches;
2920 2920
2921 return sum; 2921 return sum;
2922 } 2922 }
2923 2923
2924 unsigned long nr_iowait(void) 2924 unsigned long nr_iowait(void)
2925 { 2925 {
2926 unsigned long i, sum = 0; 2926 unsigned long i, sum = 0;
2927 2927
2928 for_each_possible_cpu(i) 2928 for_each_possible_cpu(i)
2929 sum += atomic_read(&cpu_rq(i)->nr_iowait); 2929 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2930 2930
2931 return sum; 2931 return sum;
2932 } 2932 }
2933 2933
2934 unsigned long nr_iowait_cpu(void) 2934 unsigned long nr_iowait_cpu(void)
2935 { 2935 {
2936 struct rq *this = this_rq(); 2936 struct rq *this = this_rq();
2937 return atomic_read(&this->nr_iowait); 2937 return atomic_read(&this->nr_iowait);
2938 } 2938 }
2939 2939
2940 unsigned long this_cpu_load(void) 2940 unsigned long this_cpu_load(void)
2941 { 2941 {
2942 struct rq *this = this_rq(); 2942 struct rq *this = this_rq();
2943 return this->cpu_load[0]; 2943 return this->cpu_load[0];
2944 } 2944 }
2945 2945
2946 2946
2947 /* Variables and functions for calc_load */ 2947 /* Variables and functions for calc_load */
2948 static atomic_long_t calc_load_tasks; 2948 static atomic_long_t calc_load_tasks;
2949 static unsigned long calc_load_update; 2949 static unsigned long calc_load_update;
2950 unsigned long avenrun[3]; 2950 unsigned long avenrun[3];
2951 EXPORT_SYMBOL(avenrun); 2951 EXPORT_SYMBOL(avenrun);
2952 2952
2953 static long calc_load_fold_active(struct rq *this_rq)
2954 {
2955 long nr_active, delta = 0;
2956
2957 nr_active = this_rq->nr_running;
2958 nr_active += (long) this_rq->nr_uninterruptible;
2959
2960 if (nr_active != this_rq->calc_load_active) {
2961 delta = nr_active - this_rq->calc_load_active;
2962 this_rq->calc_load_active = nr_active;
2963 }
2964
2965 return delta;
2966 }
2967
2968 #ifdef CONFIG_NO_HZ
2969 /*
2970 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
2971 *
2972 * When making the ILB scale, we should try to pull this in as well.
2973 */
2974 static atomic_long_t calc_load_tasks_idle;
2975
2976 static void calc_load_account_idle(struct rq *this_rq)
2977 {
2978 long delta;
2979
2980 delta = calc_load_fold_active(this_rq);
2981 if (delta)
2982 atomic_long_add(delta, &calc_load_tasks_idle);
2983 }
2984
2985 static long calc_load_fold_idle(void)
2986 {
2987 long delta = 0;
2988
2989 /*
2990 * It's got a race, but we don't care...
2991 */
2992 if (atomic_long_read(&calc_load_tasks_idle))
2993 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
2994
2995 return delta;
2996 }
2997 #else
2998 static void calc_load_account_idle(struct rq *this_rq)
2999 {
3000 }
3001
3002 static inline long calc_load_fold_idle(void)
3003 {
3004 return 0;
3005 }
3006 #endif
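These two helpers carry the fix: a CPU entering NO_HZ idle parks its delta in calc_load_tasks_idle instead of touching calc_load_tasks mid-period, and the next CPU that reaches its LOAD_FREQ window folds the parked amount in together with its own delta. Below is a single-threaded mock of that bookkeeping, with plain longs standing in for the atomic_long_t counters and illustrative function names; it is not the kernel code paths themselves.

	#include <stdio.h>

	static long calc_load_tasks;		/* global active count sampled later */
	static long calc_load_tasks_idle;	/* deltas parked by CPUs going NO_HZ idle */

	/* A CPU going NO_HZ idle parks its delta instead of applying it now. */
	static void cpu_goes_nohz_idle(long delta)
	{
		calc_load_tasks_idle += delta;
	}

	/* A busy CPU hitting its LOAD_FREQ window folds its delta plus the parked ones. */
	static void cpu_load_freq_sample(long own_delta)
	{
		long delta = own_delta + calc_load_tasks_idle;

		calc_load_tasks_idle = 0;
		calc_load_tasks += delta;
	}

	int main(void)
	{
		calc_load_tasks = 4;		/* 4 tasks currently contributing to load */

		cpu_goes_nohz_idle(-2);		/* CPU1 idles; 2 tasks left its runqueue */
		printf("%ld\n", calc_load_tasks);	/* still 4: no premature decrement */

		cpu_load_freq_sample(+1);	/* CPU0's LOAD_FREQ tick folds everything */
		printf("%ld\n", calc_load_tasks);	/* now 3 */
		return 0;
	}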
3007
2953 /** 3008 /**
2954 * get_avenrun - get the load average array 3009 * get_avenrun - get the load average array
2955 * @loads: pointer to dest load array 3010 * @loads: pointer to dest load array
2956 * @offset: offset to add 3011 * @offset: offset to add
2957 * @shift: shift count to shift the result left 3012 * @shift: shift count to shift the result left
2958 * 3013 *
2959 * These values are estimates at best, so no need for locking. 3014 * These values are estimates at best, so no need for locking.
2960 */ 3015 */
2961 void get_avenrun(unsigned long *loads, unsigned long offset, int shift) 3016 void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2962 { 3017 {
2963 loads[0] = (avenrun[0] + offset) << shift; 3018 loads[0] = (avenrun[0] + offset) << shift;
2964 loads[1] = (avenrun[1] + offset) << shift; 3019 loads[1] = (avenrun[1] + offset) << shift;
2965 loads[2] = (avenrun[2] + offset) << shift; 3020 loads[2] = (avenrun[2] + offset) << shift;
2966 } 3021 }
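avenrun[] is kept in FIXED_1 fixed point (1 << FSHIFT), so callers pass an offset for rounding and a shift for their own scale. As a rough sketch of how a reader such as /proc/loadavg turns these into the familiar two-decimal numbers; the LOAD_INT/LOAD_FRAC helpers are defined locally here for illustration:

	#define LOAD_INT(x)  ((x) >> FSHIFT)
	#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

	unsigned long avnrun[3];

	get_avenrun(avnrun, FIXED_1 / 200, 0);	/* offset rounds to two decimals */
	printk(KERN_INFO "loadavg: %lu.%02lu %lu.%02lu %lu.%02lu\n",
	       LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
	       LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
	       LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));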
2967 3022
2968 static unsigned long 3023 static unsigned long
2969 calc_load(unsigned long load, unsigned long exp, unsigned long active) 3024 calc_load(unsigned long load, unsigned long exp, unsigned long active)
2970 { 3025 {
2971 load *= exp; 3026 load *= exp;
2972 load += active * (FIXED_1 - exp); 3027 load += active * (FIXED_1 - exp);
2973 return load >> FSHIFT; 3028 return load >> FSHIFT;
2974 } 3029 }
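calc_load() is the classic fixed-point exponentially weighted average: the old value decays by EXP/FIXED_1 per LOAD_FREQ sample and the current active count fills the remainder. A standalone worked example in ordinary C, using the stock EXP_1/FIXED_1 constants (1884 and 2048): with a 1-minute average of 1.00 and two active tasks, one sample moves the average to about 1.08.

	#include <stdio.h>

	#define FSHIFT	11
	#define FIXED_1	(1UL << FSHIFT)		/* 2048 == 1.00 in fixed point */
	#define EXP_1	1884			/* 1/exp(5sec/1min), same scale */

	int main(void)
	{
		unsigned long load = 1 * FIXED_1;	/* current 1-min average: 1.00 */
		unsigned long active = 2 * FIXED_1;	/* two runnable/uninterruptible tasks */

		load = (load * EXP_1 + active * (FIXED_1 - EXP_1)) >> FSHIFT;

		printf("%lu.%02lu\n", load >> FSHIFT,
		       ((load & (FIXED_1 - 1)) * 100) >> FSHIFT);	/* prints 1.08 */
		return 0;
	}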
2975 3030
2976 /* 3031 /*
2977 * calc_global_load - update the avenrun load estimates 10 ticks after the 3032 * calc_global_load - update the avenrun load estimates 10 ticks after the
2978 * CPUs have updated calc_load_tasks. 3033 * CPUs have updated calc_load_tasks.
2979 */ 3034 */
2980 void calc_global_load(void) 3035 void calc_global_load(void)
2981 { 3036 {
2982 unsigned long upd = calc_load_update + 10; 3037 unsigned long upd = calc_load_update + 10;
2983 long active; 3038 long active;
2984 3039
2985 if (time_before(jiffies, upd)) 3040 if (time_before(jiffies, upd))
2986 return; 3041 return;
2987 3042
2988 active = atomic_long_read(&calc_load_tasks); 3043 active = atomic_long_read(&calc_load_tasks);
2989 active = active > 0 ? active * FIXED_1 : 0; 3044 active = active > 0 ? active * FIXED_1 : 0;
2990 3045
2991 avenrun[0] = calc_load(avenrun[0], EXP_1, active); 3046 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2992 avenrun[1] = calc_load(avenrun[1], EXP_5, active); 3047 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2993 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 3048 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2994 3049
2995 calc_load_update += LOAD_FREQ; 3050 calc_load_update += LOAD_FREQ;
2996 } 3051 }
2997 3052
2998 /* 3053 /*
2999 * Either called from update_cpu_load() or from a cpu going idle 3054 * Called from update_cpu_load() to periodically update this CPU's
3055 * active count.
3000 */ 3056 */
3001 static void calc_load_account_active(struct rq *this_rq) 3057 static void calc_load_account_active(struct rq *this_rq)
3002 { 3058 {
3003 long nr_active, delta; 3059 long delta;
3004 3060
3005 nr_active = this_rq->nr_running; 3061 if (time_before(jiffies, this_rq->calc_load_update))
3006 nr_active += (long) this_rq->nr_uninterruptible; 3062 return;
3007 3063
3008 if (nr_active != this_rq->calc_load_active) { 3064 delta = calc_load_fold_active(this_rq);
3009 delta = nr_active - this_rq->calc_load_active; 3065 delta += calc_load_fold_idle();
3010 this_rq->calc_load_active = nr_active; 3066 if (delta)
3011 atomic_long_add(delta, &calc_load_tasks); 3067 atomic_long_add(delta, &calc_load_tasks);
3012 } 3068
3069 this_rq->calc_load_update += LOAD_FREQ;
3013 } 3070 }
3014 3071
3015 /* 3072 /*
3016 * Update rq->cpu_load[] statistics. This function is usually called every 3073 * Update rq->cpu_load[] statistics. This function is usually called every
3017 * scheduler tick (TICK_NSEC). 3074 * scheduler tick (TICK_NSEC).
3018 */ 3075 */
3019 static void update_cpu_load(struct rq *this_rq) 3076 static void update_cpu_load(struct rq *this_rq)
3020 { 3077 {
3021 unsigned long this_load = this_rq->load.weight; 3078 unsigned long this_load = this_rq->load.weight;
3022 int i, scale; 3079 int i, scale;
3023 3080
3024 this_rq->nr_load_updates++; 3081 this_rq->nr_load_updates++;
3025 3082
3026 /* Update our load: */ 3083 /* Update our load: */
3027 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3084 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3028 unsigned long old_load, new_load; 3085 unsigned long old_load, new_load;
3029 3086
3030 /* scale is effectively 1 << i now, and >> i divides by scale */ 3087 /* scale is effectively 1 << i now, and >> i divides by scale */
3031 3088
3032 old_load = this_rq->cpu_load[i]; 3089 old_load = this_rq->cpu_load[i];
3033 new_load = this_load; 3090 new_load = this_load;
3034 /* 3091 /*
3035 * Round up the averaging division if load is increasing. This 3092 * Round up the averaging division if load is increasing. This
3036 * prevents us from getting stuck on 9 if the load is 10, for 3093 * prevents us from getting stuck on 9 if the load is 10, for
3037 * example. 3094 * example.
3038 */ 3095 */
3039 if (new_load > old_load) 3096 if (new_load > old_load)
3040 new_load += scale-1; 3097 new_load += scale-1;
3041 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3098 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3042 } 3099 }
3043 3100
3044 if (time_after_eq(jiffies, this_rq->calc_load_update)) { 3101 calc_load_account_active(this_rq);
3045 this_rq->calc_load_update += LOAD_FREQ;
3046 calc_load_account_active(this_rq);
3047 }
3048 } 3102 }
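To see what the scale-1 rounding above buys (a worked example, not part of the patch): for i = 2 the update is cpu_load[2] = (old * 3 + new) >> 2.

	/*
	 * old = 9, steady new load of 10:
	 *
	 *   without rounding: (9*3 + 10) >> 2 = 37 >> 2 = 9   (stuck forever)
	 *   with new += 3:    (9*3 + 13) >> 2 = 40 >> 2 = 10  (converges)
	 */

Larger i decays more slowly, so cpu_load[0..4] give the load balancer views of the run-queue weight averaged over progressively longer windows.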
3049 3103
3050 #ifdef CONFIG_SMP 3104 #ifdef CONFIG_SMP
3051 3105
3052 /* 3106 /*
3053 * sched_exec - execve() is a valuable balancing opportunity, because at 3107 * sched_exec - execve() is a valuable balancing opportunity, because at
3054 * this point the task has the smallest effective memory and cache footprint. 3108 * this point the task has the smallest effective memory and cache footprint.
3055 */ 3109 */
3056 void sched_exec(void) 3110 void sched_exec(void)
3057 { 3111 {
3058 struct task_struct *p = current; 3112 struct task_struct *p = current;
3059 struct migration_req req; 3113 struct migration_req req;
3060 unsigned long flags; 3114 unsigned long flags;
3061 struct rq *rq; 3115 struct rq *rq;
3062 int dest_cpu; 3116 int dest_cpu;
3063 3117
3064 rq = task_rq_lock(p, &flags); 3118 rq = task_rq_lock(p, &flags);
3065 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); 3119 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
3066 if (dest_cpu == smp_processor_id()) 3120 if (dest_cpu == smp_processor_id())
3067 goto unlock; 3121 goto unlock;
3068 3122
3069 /* 3123 /*
3070 * select_task_rq() can race against ->cpus_allowed 3124 * select_task_rq() can race against ->cpus_allowed
3071 */ 3125 */
3072 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && 3126 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3073 likely(cpu_active(dest_cpu)) && 3127 likely(cpu_active(dest_cpu)) &&
3074 migrate_task(p, dest_cpu, &req)) { 3128 migrate_task(p, dest_cpu, &req)) {
3075 /* Need to wait for migration thread (might exit: take ref). */ 3129 /* Need to wait for migration thread (might exit: take ref). */
3076 struct task_struct *mt = rq->migration_thread; 3130 struct task_struct *mt = rq->migration_thread;
3077 3131
3078 get_task_struct(mt); 3132 get_task_struct(mt);
3079 task_rq_unlock(rq, &flags); 3133 task_rq_unlock(rq, &flags);
3080 wake_up_process(mt); 3134 wake_up_process(mt);
3081 put_task_struct(mt); 3135 put_task_struct(mt);
3082 wait_for_completion(&req.done); 3136 wait_for_completion(&req.done);
3083 3137
3084 return; 3138 return;
3085 } 3139 }
3086 unlock: 3140 unlock:
3087 task_rq_unlock(rq, &flags); 3141 task_rq_unlock(rq, &flags);
3088 } 3142 }
3089 3143
3090 #endif 3144 #endif
3091 3145
3092 DEFINE_PER_CPU(struct kernel_stat, kstat); 3146 DEFINE_PER_CPU(struct kernel_stat, kstat);
3093 3147
3094 EXPORT_PER_CPU_SYMBOL(kstat); 3148 EXPORT_PER_CPU_SYMBOL(kstat);
3095 3149
3096 /* 3150 /*
3097 * Return any ns on the sched_clock that have not yet been accounted in 3151 * Return any ns on the sched_clock that have not yet been accounted in
3098 * @p in case that task is currently running. 3152 * @p in case that task is currently running.
3099 * 3153 *
3100 * Called with task_rq_lock() held on @rq. 3154 * Called with task_rq_lock() held on @rq.
3101 */ 3155 */
3102 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) 3156 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3103 { 3157 {
3104 u64 ns = 0; 3158 u64 ns = 0;
3105 3159
3106 if (task_current(rq, p)) { 3160 if (task_current(rq, p)) {
3107 update_rq_clock(rq); 3161 update_rq_clock(rq);
3108 ns = rq->clock - p->se.exec_start; 3162 ns = rq->clock - p->se.exec_start;
3109 if ((s64)ns < 0) 3163 if ((s64)ns < 0)
3110 ns = 0; 3164 ns = 0;
3111 } 3165 }
3112 3166
3113 return ns; 3167 return ns;
3114 } 3168 }
3115 3169
3116 unsigned long long task_delta_exec(struct task_struct *p) 3170 unsigned long long task_delta_exec(struct task_struct *p)
3117 { 3171 {
3118 unsigned long flags; 3172 unsigned long flags;
3119 struct rq *rq; 3173 struct rq *rq;
3120 u64 ns = 0; 3174 u64 ns = 0;
3121 3175
3122 rq = task_rq_lock(p, &flags); 3176 rq = task_rq_lock(p, &flags);
3123 ns = do_task_delta_exec(p, rq); 3177 ns = do_task_delta_exec(p, rq);
3124 task_rq_unlock(rq, &flags); 3178 task_rq_unlock(rq, &flags);
3125 3179
3126 return ns; 3180 return ns;
3127 } 3181 }
3128 3182
3129 /* 3183 /*
3130 * Return accounted runtime for the task. 3184 * Return accounted runtime for the task.
3131 * In case the task is currently running, return the runtime plus current's 3185 * In case the task is currently running, return the runtime plus current's
3132 * pending runtime that have not been accounted yet. 3186 * pending runtime that have not been accounted yet.
3133 */ 3187 */
3134 unsigned long long task_sched_runtime(struct task_struct *p) 3188 unsigned long long task_sched_runtime(struct task_struct *p)
3135 { 3189 {
3136 unsigned long flags; 3190 unsigned long flags;
3137 struct rq *rq; 3191 struct rq *rq;
3138 u64 ns = 0; 3192 u64 ns = 0;
3139 3193
3140 rq = task_rq_lock(p, &flags); 3194 rq = task_rq_lock(p, &flags);
3141 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3195 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3142 task_rq_unlock(rq, &flags); 3196 task_rq_unlock(rq, &flags);
3143 3197
3144 return ns; 3198 return ns;
3145 } 3199 }
3146 3200
3147 /* 3201 /*
3148 * Return sum_exec_runtime for the thread group. 3202 * Return sum_exec_runtime for the thread group.
3149 * In case the task is currently running, return the sum plus current's 3203 * In case the task is currently running, return the sum plus current's
3150 * pending runtime that have not been accounted yet. 3204 * pending runtime that have not been accounted yet.
3151 * 3205 *
3152 * Note that the thread group might have other running tasks as well, 3206 * Note that the thread group might have other running tasks as well,
3153 * so the return value does not include other pending runtime that other 3207 * so the return value does not include other pending runtime that other
3154 * running tasks might have. 3208 * running tasks might have.
3155 */ 3209 */
3156 unsigned long long thread_group_sched_runtime(struct task_struct *p) 3210 unsigned long long thread_group_sched_runtime(struct task_struct *p)
3157 { 3211 {
3158 struct task_cputime totals; 3212 struct task_cputime totals;
3159 unsigned long flags; 3213 unsigned long flags;
3160 struct rq *rq; 3214 struct rq *rq;
3161 u64 ns; 3215 u64 ns;
3162 3216
3163 rq = task_rq_lock(p, &flags); 3217 rq = task_rq_lock(p, &flags);
3164 thread_group_cputime(p, &totals); 3218 thread_group_cputime(p, &totals);
3165 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3219 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3166 task_rq_unlock(rq, &flags); 3220 task_rq_unlock(rq, &flags);
3167 3221
3168 return ns; 3222 return ns;
3169 } 3223 }
3170 3224
3171 /* 3225 /*
3172 * Account user cpu time to a process. 3226 * Account user cpu time to a process.
3173 * @p: the process that the cpu time gets accounted to 3227 * @p: the process that the cpu time gets accounted to
3174 * @cputime: the cpu time spent in user space since the last update 3228 * @cputime: the cpu time spent in user space since the last update
3175 * @cputime_scaled: cputime scaled by cpu frequency 3229 * @cputime_scaled: cputime scaled by cpu frequency
3176 */ 3230 */
3177 void account_user_time(struct task_struct *p, cputime_t cputime, 3231 void account_user_time(struct task_struct *p, cputime_t cputime,
3178 cputime_t cputime_scaled) 3232 cputime_t cputime_scaled)
3179 { 3233 {
3180 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3234 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3181 cputime64_t tmp; 3235 cputime64_t tmp;
3182 3236
3183 /* Add user time to process. */ 3237 /* Add user time to process. */
3184 p->utime = cputime_add(p->utime, cputime); 3238 p->utime = cputime_add(p->utime, cputime);
3185 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 3239 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3186 account_group_user_time(p, cputime); 3240 account_group_user_time(p, cputime);
3187 3241
3188 /* Add user time to cpustat. */ 3242 /* Add user time to cpustat. */
3189 tmp = cputime_to_cputime64(cputime); 3243 tmp = cputime_to_cputime64(cputime);
3190 if (TASK_NICE(p) > 0) 3244 if (TASK_NICE(p) > 0)
3191 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3245 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3192 else 3246 else
3193 cpustat->user = cputime64_add(cpustat->user, tmp); 3247 cpustat->user = cputime64_add(cpustat->user, tmp);
3194 3248
3195 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); 3249 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3196 /* Account for user time used */ 3250 /* Account for user time used */
3197 acct_update_integrals(p); 3251 acct_update_integrals(p);
3198 } 3252 }
3199 3253
3200 /* 3254 /*
3201 * Account guest cpu time to a process. 3255 * Account guest cpu time to a process.
3202 * @p: the process that the cpu time gets accounted to 3256 * @p: the process that the cpu time gets accounted to
3203 * @cputime: the cpu time spent in virtual machine since the last update 3257 * @cputime: the cpu time spent in virtual machine since the last update
3204 * @cputime_scaled: cputime scaled by cpu frequency 3258 * @cputime_scaled: cputime scaled by cpu frequency
3205 */ 3259 */
3206 static void account_guest_time(struct task_struct *p, cputime_t cputime, 3260 static void account_guest_time(struct task_struct *p, cputime_t cputime,
3207 cputime_t cputime_scaled) 3261 cputime_t cputime_scaled)
3208 { 3262 {
3209 cputime64_t tmp; 3263 cputime64_t tmp;
3210 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3264 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3211 3265
3212 tmp = cputime_to_cputime64(cputime); 3266 tmp = cputime_to_cputime64(cputime);
3213 3267
3214 /* Add guest time to process. */ 3268 /* Add guest time to process. */
3215 p->utime = cputime_add(p->utime, cputime); 3269 p->utime = cputime_add(p->utime, cputime);
3216 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 3270 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3217 account_group_user_time(p, cputime); 3271 account_group_user_time(p, cputime);
3218 p->gtime = cputime_add(p->gtime, cputime); 3272 p->gtime = cputime_add(p->gtime, cputime);
3219 3273
3220 /* Add guest time to cpustat. */ 3274 /* Add guest time to cpustat. */
3221 if (TASK_NICE(p) > 0) { 3275 if (TASK_NICE(p) > 0) {
3222 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3276 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3223 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); 3277 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
3224 } else { 3278 } else {
3225 cpustat->user = cputime64_add(cpustat->user, tmp); 3279 cpustat->user = cputime64_add(cpustat->user, tmp);
3226 cpustat->guest = cputime64_add(cpustat->guest, tmp); 3280 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3227 } 3281 }
3228 } 3282 }
3229 3283
3230 /* 3284 /*
3231 * Account system cpu time to a process. 3285 * Account system cpu time to a process.
3232 * @p: the process that the cpu time gets accounted to 3286 * @p: the process that the cpu time gets accounted to
3233 * @hardirq_offset: the offset to subtract from hardirq_count() 3287 * @hardirq_offset: the offset to subtract from hardirq_count()
3234 * @cputime: the cpu time spent in kernel space since the last update 3288 * @cputime: the cpu time spent in kernel space since the last update
3235 * @cputime_scaled: cputime scaled by cpu frequency 3289 * @cputime_scaled: cputime scaled by cpu frequency
3236 */ 3290 */
3237 void account_system_time(struct task_struct *p, int hardirq_offset, 3291 void account_system_time(struct task_struct *p, int hardirq_offset,
3238 cputime_t cputime, cputime_t cputime_scaled) 3292 cputime_t cputime, cputime_t cputime_scaled)
3239 { 3293 {
3240 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3294 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3241 cputime64_t tmp; 3295 cputime64_t tmp;
3242 3296
3243 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 3297 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3244 account_guest_time(p, cputime, cputime_scaled); 3298 account_guest_time(p, cputime, cputime_scaled);
3245 return; 3299 return;
3246 } 3300 }
3247 3301
3248 /* Add system time to process. */ 3302 /* Add system time to process. */
3249 p->stime = cputime_add(p->stime, cputime); 3303 p->stime = cputime_add(p->stime, cputime);
3250 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); 3304 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3251 account_group_system_time(p, cputime); 3305 account_group_system_time(p, cputime);
3252 3306
3253 /* Add system time to cpustat. */ 3307 /* Add system time to cpustat. */
3254 tmp = cputime_to_cputime64(cputime); 3308 tmp = cputime_to_cputime64(cputime);
3255 if (hardirq_count() - hardirq_offset) 3309 if (hardirq_count() - hardirq_offset)
3256 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3310 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3257 else if (softirq_count()) 3311 else if (softirq_count())
3258 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3312 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3259 else 3313 else
3260 cpustat->system = cputime64_add(cpustat->system, tmp); 3314 cpustat->system = cputime64_add(cpustat->system, tmp);
3261 3315
3262 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); 3316 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3263 3317
3264 /* Account for system time used */ 3318 /* Account for system time used */
3265 acct_update_integrals(p); 3319 acct_update_integrals(p);
3266 } 3320 }
3267 3321
3268 /* 3322 /*
3269 * Account for involuntary wait time. 3323 * Account for involuntary wait time.
3270 * @cputime: the cpu time spent in involuntary wait 3324 * @cputime: the cpu time spent in involuntary wait
3271 */ 3325 */
3272 void account_steal_time(cputime_t cputime) 3326 void account_steal_time(cputime_t cputime)
3273 { 3327 {
3274 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3328 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3275 cputime64_t cputime64 = cputime_to_cputime64(cputime); 3329 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3276 3330
3277 cpustat->steal = cputime64_add(cpustat->steal, cputime64); 3331 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
3278 } 3332 }
3279 3333
3280 /* 3334 /*
3281 * Account for idle time. 3335 * Account for idle time.
3282 * @cputime: the cpu time spent in idle wait 3336 * @cputime: the cpu time spent in idle wait
3283 */ 3337 */
3284 void account_idle_time(cputime_t cputime) 3338 void account_idle_time(cputime_t cputime)
3285 { 3339 {
3286 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3340 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3287 cputime64_t cputime64 = cputime_to_cputime64(cputime); 3341 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3288 struct rq *rq = this_rq(); 3342 struct rq *rq = this_rq();
3289 3343
3290 if (atomic_read(&rq->nr_iowait) > 0) 3344 if (atomic_read(&rq->nr_iowait) > 0)
3291 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); 3345 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
3292 else 3346 else
3293 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 3347 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
3294 } 3348 }
3295 3349
3296 #ifndef CONFIG_VIRT_CPU_ACCOUNTING 3350 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
3297 3351
3298 /* 3352 /*
3299 * Account a single tick of cpu time. 3353 * Account a single tick of cpu time.
3300 * @p: the process that the cpu time gets accounted to 3354 * @p: the process that the cpu time gets accounted to
3301 * @user_tick: indicates if the tick is a user or a system tick 3355 * @user_tick: indicates if the tick is a user or a system tick
3302 */ 3356 */
3303 void account_process_tick(struct task_struct *p, int user_tick) 3357 void account_process_tick(struct task_struct *p, int user_tick)
3304 { 3358 {
3305 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3359 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3306 struct rq *rq = this_rq(); 3360 struct rq *rq = this_rq();
3307 3361
3308 if (user_tick) 3362 if (user_tick)
3309 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 3363 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3310 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 3364 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
3311 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, 3365 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
3312 one_jiffy_scaled); 3366 one_jiffy_scaled);
3313 else 3367 else
3314 account_idle_time(cputime_one_jiffy); 3368 account_idle_time(cputime_one_jiffy);
3315 } 3369 }
3316 3370
3317 /* 3371 /*
3318 * Account multiple ticks of steal time. 3372 * Account multiple ticks of steal time.
3319 * @p: the process from which the cpu time has been stolen 3373 * @p: the process from which the cpu time has been stolen
3320 * @ticks: number of stolen ticks 3374 * @ticks: number of stolen ticks
3321 */ 3375 */
3322 void account_steal_ticks(unsigned long ticks) 3376 void account_steal_ticks(unsigned long ticks)
3323 { 3377 {
3324 account_steal_time(jiffies_to_cputime(ticks)); 3378 account_steal_time(jiffies_to_cputime(ticks));
3325 } 3379 }
3326 3380
3327 /* 3381 /*
3328 * Account multiple ticks of idle time. 3382 * Account multiple ticks of idle time.
3329 * @ticks: number of idle ticks 3383 * @ticks: number of idle ticks
3330 */ 3384 */
3331 void account_idle_ticks(unsigned long ticks) 3385 void account_idle_ticks(unsigned long ticks)
3332 { 3386 {
3333 account_idle_time(jiffies_to_cputime(ticks)); 3387 account_idle_time(jiffies_to_cputime(ticks));
3334 } 3388 }
3335 3389
3336 #endif 3390 #endif
3337 3391
3338 /* 3392 /*
3339 * Use precise platform statistics if available: 3393 * Use precise platform statistics if available:
3340 */ 3394 */
3341 #ifdef CONFIG_VIRT_CPU_ACCOUNTING 3395 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
3342 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3396 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3343 { 3397 {
3344 *ut = p->utime; 3398 *ut = p->utime;
3345 *st = p->stime; 3399 *st = p->stime;
3346 } 3400 }
3347 3401
3348 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3402 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3349 { 3403 {
3350 struct task_cputime cputime; 3404 struct task_cputime cputime;
3351 3405
3352 thread_group_cputime(p, &cputime); 3406 thread_group_cputime(p, &cputime);
3353 3407
3354 *ut = cputime.utime; 3408 *ut = cputime.utime;
3355 *st = cputime.stime; 3409 *st = cputime.stime;
3356 } 3410 }
3357 #else 3411 #else
3358 3412
3359 #ifndef nsecs_to_cputime 3413 #ifndef nsecs_to_cputime
3360 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) 3414 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
3361 #endif 3415 #endif
3362 3416
3363 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3417 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3364 { 3418 {
3365 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); 3419 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
3366 3420
3367 /* 3421 /*
3368 * Use CFS's precise accounting: 3422 * Use CFS's precise accounting:
3369 */ 3423 */
3370 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3424 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3371 3425
3372 if (total) { 3426 if (total) {
3373 u64 temp; 3427 u64 temp;
3374 3428
3375 temp = (u64)(rtime * utime); 3429 temp = (u64)(rtime * utime);
3376 do_div(temp, total); 3430 do_div(temp, total);
3377 utime = (cputime_t)temp; 3431 utime = (cputime_t)temp;
3378 } else 3432 } else
3379 utime = rtime; 3433 utime = rtime;
3380 3434
3381 /* 3435 /*
3382 * Compare with previous values, to keep monotonicity: 3436 * Compare with previous values, to keep monotonicity:
3383 */ 3437 */
3384 p->prev_utime = max(p->prev_utime, utime); 3438 p->prev_utime = max(p->prev_utime, utime);
3385 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); 3439 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
3386 3440
3387 *ut = p->prev_utime; 3441 *ut = p->prev_utime;
3388 *st = p->prev_stime; 3442 *st = p->prev_stime;
3389 } 3443 }
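task_times() above splits the precise CFS runtime rtime in the ratio of the tick-sampled utime:stime, then uses max() so neither component ever moves backwards between calls. A worked example with illustrative numbers only:

	/*
	 * p->utime = 30, p->stime = 90  =>  total = 120
	 * rtime (from sum_exec_runtime) = 100
	 *
	 *   utime = 100 * 30 / 120        = 25
	 *   stime = rtime - prev_utime    = 100 - 25 = 75
	 *
	 * prev_utime/prev_stime only ever grow, so a later call with a
	 * different utime:stime ratio cannot make either value shrink.
	 */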
3390 3444
3391 /* 3445 /*
3392 * Must be called with siglock held. 3446 * Must be called with siglock held.
3393 */ 3447 */
3394 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3448 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3395 { 3449 {
3396 struct signal_struct *sig = p->signal; 3450 struct signal_struct *sig = p->signal;
3397 struct task_cputime cputime; 3451 struct task_cputime cputime;
3398 cputime_t rtime, utime, total; 3452 cputime_t rtime, utime, total;
3399 3453
3400 thread_group_cputime(p, &cputime); 3454 thread_group_cputime(p, &cputime);
3401 3455
3402 total = cputime_add(cputime.utime, cputime.stime); 3456 total = cputime_add(cputime.utime, cputime.stime);
3403 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 3457 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3404 3458
3405 if (total) { 3459 if (total) {
3406 u64 temp; 3460 u64 temp;
3407 3461
3408 temp = (u64)(rtime * cputime.utime); 3462 temp = (u64)(rtime * cputime.utime);
3409 do_div(temp, total); 3463 do_div(temp, total);
3410 utime = (cputime_t)temp; 3464 utime = (cputime_t)temp;
3411 } else 3465 } else
3412 utime = rtime; 3466 utime = rtime;
3413 3467
3414 sig->prev_utime = max(sig->prev_utime, utime); 3468 sig->prev_utime = max(sig->prev_utime, utime);
3415 sig->prev_stime = max(sig->prev_stime, 3469 sig->prev_stime = max(sig->prev_stime,
3416 cputime_sub(rtime, sig->prev_utime)); 3470 cputime_sub(rtime, sig->prev_utime));
3417 3471
3418 *ut = sig->prev_utime; 3472 *ut = sig->prev_utime;
3419 *st = sig->prev_stime; 3473 *st = sig->prev_stime;
3420 } 3474 }
3421 #endif 3475 #endif
3422 3476
3423 /* 3477 /*
3424 * This function gets called by the timer code, with HZ frequency. 3478 * This function gets called by the timer code, with HZ frequency.
3425 * We call it with interrupts disabled. 3479 * We call it with interrupts disabled.
3426 * 3480 *
3427 * It also gets called by the fork code, when changing the parent's 3481 * It also gets called by the fork code, when changing the parent's
3428 * timeslices. 3482 * timeslices.
3429 */ 3483 */
3430 void scheduler_tick(void) 3484 void scheduler_tick(void)
3431 { 3485 {
3432 int cpu = smp_processor_id(); 3486 int cpu = smp_processor_id();
3433 struct rq *rq = cpu_rq(cpu); 3487 struct rq *rq = cpu_rq(cpu);
3434 struct task_struct *curr = rq->curr; 3488 struct task_struct *curr = rq->curr;
3435 3489
3436 sched_clock_tick(); 3490 sched_clock_tick();
3437 3491
3438 raw_spin_lock(&rq->lock); 3492 raw_spin_lock(&rq->lock);
3439 update_rq_clock(rq); 3493 update_rq_clock(rq);
3440 update_cpu_load(rq); 3494 update_cpu_load(rq);
3441 curr->sched_class->task_tick(rq, curr, 0); 3495 curr->sched_class->task_tick(rq, curr, 0);
3442 raw_spin_unlock(&rq->lock); 3496 raw_spin_unlock(&rq->lock);
3443 3497
3444 perf_event_task_tick(curr); 3498 perf_event_task_tick(curr);
3445 3499
3446 #ifdef CONFIG_SMP 3500 #ifdef CONFIG_SMP
3447 rq->idle_at_tick = idle_cpu(cpu); 3501 rq->idle_at_tick = idle_cpu(cpu);
3448 trigger_load_balance(rq, cpu); 3502 trigger_load_balance(rq, cpu);
3449 #endif 3503 #endif
3450 } 3504 }
3451 3505
3452 notrace unsigned long get_parent_ip(unsigned long addr) 3506 notrace unsigned long get_parent_ip(unsigned long addr)
3453 { 3507 {
3454 if (in_lock_functions(addr)) { 3508 if (in_lock_functions(addr)) {
3455 addr = CALLER_ADDR2; 3509 addr = CALLER_ADDR2;
3456 if (in_lock_functions(addr)) 3510 if (in_lock_functions(addr))
3457 addr = CALLER_ADDR3; 3511 addr = CALLER_ADDR3;
3458 } 3512 }
3459 return addr; 3513 return addr;
3460 } 3514 }
3461 3515
3462 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 3516 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3463 defined(CONFIG_PREEMPT_TRACER)) 3517 defined(CONFIG_PREEMPT_TRACER))
3464 3518
3465 void __kprobes add_preempt_count(int val) 3519 void __kprobes add_preempt_count(int val)
3466 { 3520 {
3467 #ifdef CONFIG_DEBUG_PREEMPT 3521 #ifdef CONFIG_DEBUG_PREEMPT
3468 /* 3522 /*
3469 * Underflow? 3523 * Underflow?
3470 */ 3524 */
3471 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 3525 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3472 return; 3526 return;
3473 #endif 3527 #endif
3474 preempt_count() += val; 3528 preempt_count() += val;
3475 #ifdef CONFIG_DEBUG_PREEMPT 3529 #ifdef CONFIG_DEBUG_PREEMPT
3476 /* 3530 /*
3477 * Spinlock count overflowing soon? 3531 * Spinlock count overflowing soon?
3478 */ 3532 */
3479 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 3533 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3480 PREEMPT_MASK - 10); 3534 PREEMPT_MASK - 10);
3481 #endif 3535 #endif
3482 if (preempt_count() == val) 3536 if (preempt_count() == val)
3483 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 3537 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3484 } 3538 }
3485 EXPORT_SYMBOL(add_preempt_count); 3539 EXPORT_SYMBOL(add_preempt_count);
3486 3540
3487 void __kprobes sub_preempt_count(int val) 3541 void __kprobes sub_preempt_count(int val)
3488 { 3542 {
3489 #ifdef CONFIG_DEBUG_PREEMPT 3543 #ifdef CONFIG_DEBUG_PREEMPT
3490 /* 3544 /*
3491 * Underflow? 3545 * Underflow?
3492 */ 3546 */
3493 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 3547 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3494 return; 3548 return;
3495 /* 3549 /*
3496 * Is the spinlock portion underflowing? 3550 * Is the spinlock portion underflowing?
3497 */ 3551 */
3498 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 3552 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3499 !(preempt_count() & PREEMPT_MASK))) 3553 !(preempt_count() & PREEMPT_MASK)))
3500 return; 3554 return;
3501 #endif 3555 #endif
3502 3556
3503 if (preempt_count() == val) 3557 if (preempt_count() == val)
3504 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 3558 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3505 preempt_count() -= val; 3559 preempt_count() -= val;
3506 } 3560 }
3507 EXPORT_SYMBOL(sub_preempt_count); 3561 EXPORT_SYMBOL(sub_preempt_count);
3508 3562
3509 #endif 3563 #endif
3510 3564
3511 /* 3565 /*
3512 * Print scheduling while atomic bug: 3566 * Print scheduling while atomic bug:
3513 */ 3567 */
3514 static noinline void __schedule_bug(struct task_struct *prev) 3568 static noinline void __schedule_bug(struct task_struct *prev)
3515 { 3569 {
3516 struct pt_regs *regs = get_irq_regs(); 3570 struct pt_regs *regs = get_irq_regs();
3517 3571
3518 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 3572 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3519 prev->comm, prev->pid, preempt_count()); 3573 prev->comm, prev->pid, preempt_count());
3520 3574
3521 debug_show_held_locks(prev); 3575 debug_show_held_locks(prev);
3522 print_modules(); 3576 print_modules();
3523 if (irqs_disabled()) 3577 if (irqs_disabled())
3524 print_irqtrace_events(prev); 3578 print_irqtrace_events(prev);
3525 3579
3526 if (regs) 3580 if (regs)
3527 show_regs(regs); 3581 show_regs(regs);
3528 else 3582 else
3529 dump_stack(); 3583 dump_stack();
3530 } 3584 }
3531 3585
3532 /* 3586 /*
3533 * Various schedule()-time debugging checks and statistics: 3587 * Various schedule()-time debugging checks and statistics:
3534 */ 3588 */
3535 static inline void schedule_debug(struct task_struct *prev) 3589 static inline void schedule_debug(struct task_struct *prev)
3536 { 3590 {
3537 /* 3591 /*
3538 * Test if we are atomic. Since do_exit() needs to call into 3592 * Test if we are atomic. Since do_exit() needs to call into
3539 * schedule() atomically, we ignore that path for now. 3593 * schedule() atomically, we ignore that path for now.
3540 * Otherwise, whine if we are scheduling when we should not be. 3594 * Otherwise, whine if we are scheduling when we should not be.
3541 */ 3595 */
3542 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 3596 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
3543 __schedule_bug(prev); 3597 __schedule_bug(prev);
3544 3598
3545 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3599 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3546 3600
3547 schedstat_inc(this_rq(), sched_count); 3601 schedstat_inc(this_rq(), sched_count);
3548 #ifdef CONFIG_SCHEDSTATS 3602 #ifdef CONFIG_SCHEDSTATS
3549 if (unlikely(prev->lock_depth >= 0)) { 3603 if (unlikely(prev->lock_depth >= 0)) {
3550 schedstat_inc(this_rq(), bkl_count); 3604 schedstat_inc(this_rq(), bkl_count);
3551 schedstat_inc(prev, sched_info.bkl_count); 3605 schedstat_inc(prev, sched_info.bkl_count);
3552 } 3606 }
3553 #endif 3607 #endif
3554 } 3608 }
3555 3609
3556 static void put_prev_task(struct rq *rq, struct task_struct *prev) 3610 static void put_prev_task(struct rq *rq, struct task_struct *prev)
3557 { 3611 {
3558 if (prev->se.on_rq) 3612 if (prev->se.on_rq)
3559 update_rq_clock(rq); 3613 update_rq_clock(rq);
3560 rq->skip_clock_update = 0; 3614 rq->skip_clock_update = 0;
3561 prev->sched_class->put_prev_task(rq, prev); 3615 prev->sched_class->put_prev_task(rq, prev);
3562 } 3616 }
3563 3617
3564 /* 3618 /*
3565 * Pick up the highest-prio task: 3619 * Pick up the highest-prio task:
3566 */ 3620 */
3567 static inline struct task_struct * 3621 static inline struct task_struct *
3568 pick_next_task(struct rq *rq) 3622 pick_next_task(struct rq *rq)
3569 { 3623 {
3570 const struct sched_class *class; 3624 const struct sched_class *class;
3571 struct task_struct *p; 3625 struct task_struct *p;
3572 3626
3573 /* 3627 /*
3574 * Optimization: we know that if all tasks are in 3628 * Optimization: we know that if all tasks are in
3575 * the fair class we can call that function directly: 3629 * the fair class we can call that function directly:
3576 */ 3630 */
3577 if (likely(rq->nr_running == rq->cfs.nr_running)) { 3631 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3578 p = fair_sched_class.pick_next_task(rq); 3632 p = fair_sched_class.pick_next_task(rq);
3579 if (likely(p)) 3633 if (likely(p))
3580 return p; 3634 return p;
3581 } 3635 }
3582 3636
3583 class = sched_class_highest; 3637 class = sched_class_highest;
3584 for ( ; ; ) { 3638 for ( ; ; ) {
3585 p = class->pick_next_task(rq); 3639 p = class->pick_next_task(rq);
3586 if (p) 3640 if (p)
3587 return p; 3641 return p;
3588 /* 3642 /*
3589 * Will never be NULL as the idle class always 3643 * Will never be NULL as the idle class always
3590 * returns a non-NULL p: 3644 * returns a non-NULL p:
3591 */ 3645 */
3592 class = class->next; 3646 class = class->next;
3593 } 3647 }
3594 } 3648 }
3595 3649
3596 /* 3650 /*
3597 * schedule() is the main scheduler function. 3651 * schedule() is the main scheduler function.
3598 */ 3652 */
3599 asmlinkage void __sched schedule(void) 3653 asmlinkage void __sched schedule(void)
3600 { 3654 {
3601 struct task_struct *prev, *next; 3655 struct task_struct *prev, *next;
3602 unsigned long *switch_count; 3656 unsigned long *switch_count;
3603 struct rq *rq; 3657 struct rq *rq;
3604 int cpu; 3658 int cpu;
3605 3659
3606 need_resched: 3660 need_resched:
3607 preempt_disable(); 3661 preempt_disable();
3608 cpu = smp_processor_id(); 3662 cpu = smp_processor_id();
3609 rq = cpu_rq(cpu); 3663 rq = cpu_rq(cpu);
3610 rcu_sched_qs(cpu); 3664 rcu_sched_qs(cpu);
3611 prev = rq->curr; 3665 prev = rq->curr;
3612 switch_count = &prev->nivcsw; 3666 switch_count = &prev->nivcsw;
3613 3667
3614 release_kernel_lock(prev); 3668 release_kernel_lock(prev);
3615 need_resched_nonpreemptible: 3669 need_resched_nonpreemptible:
3616 3670
3617 schedule_debug(prev); 3671 schedule_debug(prev);
3618 3672
3619 if (sched_feat(HRTICK)) 3673 if (sched_feat(HRTICK))
3620 hrtick_clear(rq); 3674 hrtick_clear(rq);
3621 3675
3622 raw_spin_lock_irq(&rq->lock); 3676 raw_spin_lock_irq(&rq->lock);
3623 clear_tsk_need_resched(prev); 3677 clear_tsk_need_resched(prev);
3624 3678
3625 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3679 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3626 if (unlikely(signal_pending_state(prev->state, prev))) 3680 if (unlikely(signal_pending_state(prev->state, prev)))
3627 prev->state = TASK_RUNNING; 3681 prev->state = TASK_RUNNING;
3628 else 3682 else
3629 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3683 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3630 switch_count = &prev->nvcsw; 3684 switch_count = &prev->nvcsw;
3631 } 3685 }
3632 3686
3633 pre_schedule(rq, prev); 3687 pre_schedule(rq, prev);
3634 3688
3635 if (unlikely(!rq->nr_running)) 3689 if (unlikely(!rq->nr_running))
3636 idle_balance(cpu, rq); 3690 idle_balance(cpu, rq);
3637 3691
3638 put_prev_task(rq, prev); 3692 put_prev_task(rq, prev);
3639 next = pick_next_task(rq); 3693 next = pick_next_task(rq);
3640 3694
3641 if (likely(prev != next)) { 3695 if (likely(prev != next)) {
3642 sched_info_switch(prev, next); 3696 sched_info_switch(prev, next);
3643 perf_event_task_sched_out(prev, next); 3697 perf_event_task_sched_out(prev, next);
3644 3698
3645 rq->nr_switches++; 3699 rq->nr_switches++;
3646 rq->curr = next; 3700 rq->curr = next;
3647 ++*switch_count; 3701 ++*switch_count;
3648 3702
3649 context_switch(rq, prev, next); /* unlocks the rq */ 3703 context_switch(rq, prev, next); /* unlocks the rq */
3650 /* 3704 /*
3651 * the context switch might have flipped the stack from under 3705 * the context switch might have flipped the stack from under
3652 * us, hence refresh the local variables. 3706 * us, hence refresh the local variables.
3653 */ 3707 */
3654 cpu = smp_processor_id(); 3708 cpu = smp_processor_id();
3655 rq = cpu_rq(cpu); 3709 rq = cpu_rq(cpu);
3656 } else 3710 } else
3657 raw_spin_unlock_irq(&rq->lock); 3711 raw_spin_unlock_irq(&rq->lock);
3658 3712
3659 post_schedule(rq); 3713 post_schedule(rq);
3660 3714
3661 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3715 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3662 prev = rq->curr; 3716 prev = rq->curr;
3663 switch_count = &prev->nivcsw; 3717 switch_count = &prev->nivcsw;
3664 goto need_resched_nonpreemptible; 3718 goto need_resched_nonpreemptible;
3665 } 3719 }
3666 3720
3667 preempt_enable_no_resched(); 3721 preempt_enable_no_resched();
3668 if (need_resched()) 3722 if (need_resched())
3669 goto need_resched; 3723 goto need_resched;
3670 } 3724 }
3671 EXPORT_SYMBOL(schedule); 3725 EXPORT_SYMBOL(schedule);
3672 3726
3673 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER 3727 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
3674 /* 3728 /*
3675 * Look out! "owner" is an entirely speculative pointer 3729 * Look out! "owner" is an entirely speculative pointer
3676 * access and not reliable. 3730 * access and not reliable.
3677 */ 3731 */
3678 int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) 3732 int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3679 { 3733 {
3680 unsigned int cpu; 3734 unsigned int cpu;
3681 struct rq *rq; 3735 struct rq *rq;
3682 3736
3683 if (!sched_feat(OWNER_SPIN)) 3737 if (!sched_feat(OWNER_SPIN))
3684 return 0; 3738 return 0;
3685 3739
3686 #ifdef CONFIG_DEBUG_PAGEALLOC 3740 #ifdef CONFIG_DEBUG_PAGEALLOC
3687 /* 3741 /*
3688 * Need to access the cpu field knowing that 3742 * Need to access the cpu field knowing that
3689 * DEBUG_PAGEALLOC could have unmapped it if 3743 * DEBUG_PAGEALLOC could have unmapped it if
3690 * the mutex owner just released it and exited. 3744 * the mutex owner just released it and exited.
3691 */ 3745 */
3692 if (probe_kernel_address(&owner->cpu, cpu)) 3746 if (probe_kernel_address(&owner->cpu, cpu))
3693 goto out; 3747 goto out;
3694 #else 3748 #else
3695 cpu = owner->cpu; 3749 cpu = owner->cpu;
3696 #endif 3750 #endif
3697 3751
3698 /* 3752 /*
3699 * Even if the access succeeded (likely case), 3753 * Even if the access succeeded (likely case),
3700 * the cpu field may no longer be valid. 3754 * the cpu field may no longer be valid.
3701 */ 3755 */
3702 if (cpu >= nr_cpumask_bits) 3756 if (cpu >= nr_cpumask_bits)
3703 goto out; 3757 goto out;
3704 3758
3705 /* 3759 /*
3706 * We need to validate that we can do a 3760 * We need to validate that we can do a
3707 * get_cpu() and that we have the percpu area. 3761 * get_cpu() and that we have the percpu area.
3708 */ 3762 */
3709 if (!cpu_online(cpu)) 3763 if (!cpu_online(cpu))
3710 goto out; 3764 goto out;
3711 3765
3712 rq = cpu_rq(cpu); 3766 rq = cpu_rq(cpu);
3713 3767
3714 for (;;) { 3768 for (;;) {
3715 /* 3769 /*
3716 * Owner changed, break to re-assess state. 3770 * Owner changed, break to re-assess state.
3717 */ 3771 */
3718 if (lock->owner != owner) 3772 if (lock->owner != owner)
3719 break; 3773 break;
3720 3774
3721 /* 3775 /*
3722 * Is that owner really running on that cpu? 3776 * Is that owner really running on that cpu?
3723 */ 3777 */
3724 if (task_thread_info(rq->curr) != owner || need_resched()) 3778 if (task_thread_info(rq->curr) != owner || need_resched())
3725 return 0; 3779 return 0;
3726 3780
3727 cpu_relax(); 3781 cpu_relax();
3728 } 3782 }
3729 out: 3783 out:
3730 return 1; 3784 return 1;
3731 } 3785 }
3732 #endif 3786 #endif
3733 3787
3734 #ifdef CONFIG_PREEMPT 3788 #ifdef CONFIG_PREEMPT
3735 /* 3789 /*
3736 * this is the entry point to schedule() from in-kernel preemption 3790 * this is the entry point to schedule() from in-kernel preemption
3737 * off of preempt_enable. Kernel preemptions off return from interrupt 3791 * off of preempt_enable. Kernel preemptions off return from interrupt
3738 * occur there and call schedule directly. 3792 * occur there and call schedule directly.
3739 */ 3793 */
3740 asmlinkage void __sched preempt_schedule(void) 3794 asmlinkage void __sched preempt_schedule(void)
3741 { 3795 {
3742 struct thread_info *ti = current_thread_info(); 3796 struct thread_info *ti = current_thread_info();
3743 3797
3744 /* 3798 /*
3745 * If there is a non-zero preempt_count or interrupts are disabled, 3799 * If there is a non-zero preempt_count or interrupts are disabled,
3746 * we do not want to preempt the current task. Just return.. 3800 * we do not want to preempt the current task. Just return..
3747 */ 3801 */
3748 if (likely(ti->preempt_count || irqs_disabled())) 3802 if (likely(ti->preempt_count || irqs_disabled()))
3749 return; 3803 return;
3750 3804
3751 do { 3805 do {
3752 add_preempt_count(PREEMPT_ACTIVE); 3806 add_preempt_count(PREEMPT_ACTIVE);
3753 schedule(); 3807 schedule();
3754 sub_preempt_count(PREEMPT_ACTIVE); 3808 sub_preempt_count(PREEMPT_ACTIVE);
3755 3809
3756 /* 3810 /*
3757 * Check again in case we missed a preemption opportunity 3811 * Check again in case we missed a preemption opportunity
3758 * between schedule and now. 3812 * between schedule and now.
3759 */ 3813 */
3760 barrier(); 3814 barrier();
3761 } while (need_resched()); 3815 } while (need_resched());
3762 } 3816 }
3763 EXPORT_SYMBOL(preempt_schedule); 3817 EXPORT_SYMBOL(preempt_schedule);
3764 3818
3765 /* 3819 /*
3766 * this is the entry point to schedule() from kernel preemption 3820 * this is the entry point to schedule() from kernel preemption
3767 * off of irq context. 3821 * off of irq context.
3768 * Note that this is called and returns with irqs disabled. This will 3822 * Note that this is called and returns with irqs disabled. This will
3769 * protect us against recursive calling from irq. 3823 * protect us against recursive calling from irq.
3770 */ 3824 */
3771 asmlinkage void __sched preempt_schedule_irq(void) 3825 asmlinkage void __sched preempt_schedule_irq(void)
3772 { 3826 {
3773 struct thread_info *ti = current_thread_info(); 3827 struct thread_info *ti = current_thread_info();
3774 3828
3775 /* Catch callers which need to be fixed */ 3829 /* Catch callers which need to be fixed */
3776 BUG_ON(ti->preempt_count || !irqs_disabled()); 3830 BUG_ON(ti->preempt_count || !irqs_disabled());
3777 3831
3778 do { 3832 do {
3779 add_preempt_count(PREEMPT_ACTIVE); 3833 add_preempt_count(PREEMPT_ACTIVE);
3780 local_irq_enable(); 3834 local_irq_enable();
3781 schedule(); 3835 schedule();
3782 local_irq_disable(); 3836 local_irq_disable();
3783 sub_preempt_count(PREEMPT_ACTIVE); 3837 sub_preempt_count(PREEMPT_ACTIVE);
3784 3838
3785 /* 3839 /*
3786 * Check again in case we missed a preemption opportunity 3840 * Check again in case we missed a preemption opportunity
3787 * between schedule and now. 3841 * between schedule and now.
3788 */ 3842 */
3789 barrier(); 3843 barrier();
3790 } while (need_resched()); 3844 } while (need_resched());
3791 } 3845 }
3792 3846
3793 #endif /* CONFIG_PREEMPT */ 3847 #endif /* CONFIG_PREEMPT */
3794 3848
3795 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 3849 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3796 void *key) 3850 void *key)
3797 { 3851 {
3798 return try_to_wake_up(curr->private, mode, wake_flags); 3852 return try_to_wake_up(curr->private, mode, wake_flags);
3799 } 3853 }
3800 EXPORT_SYMBOL(default_wake_function); 3854 EXPORT_SYMBOL(default_wake_function);
3801 3855
3802 /* 3856 /*
3803 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 3857 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3804 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 3858 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3805 * number) then we wake all the non-exclusive tasks and one exclusive task. 3859 * number) then we wake all the non-exclusive tasks and one exclusive task.
3806 * 3860 *
3807 * There are circumstances in which we can try to wake a task which has already 3861 * There are circumstances in which we can try to wake a task which has already
3808 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 3862 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3809 * zero in this (rare) case, and we handle it by continuing to scan the queue. 3863 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3810 */ 3864 */
3811 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 3865 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3812 int nr_exclusive, int wake_flags, void *key) 3866 int nr_exclusive, int wake_flags, void *key)
3813 { 3867 {
3814 wait_queue_t *curr, *next; 3868 wait_queue_t *curr, *next;
3815 3869
3816 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 3870 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
3817 unsigned flags = curr->flags; 3871 unsigned flags = curr->flags;
3818 3872
3819 if (curr->func(curr, mode, wake_flags, key) && 3873 if (curr->func(curr, mode, wake_flags, key) &&
3820 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 3874 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3821 break; 3875 break;
3822 } 3876 }
3823 } 3877 }
3824 3878
3825 /** 3879 /**
3826 * __wake_up - wake up threads blocked on a waitqueue. 3880 * __wake_up - wake up threads blocked on a waitqueue.
3827 * @q: the waitqueue 3881 * @q: the waitqueue
3828 * @mode: which threads 3882 * @mode: which threads
3829 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3883 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3830 * @key: is directly passed to the wakeup function 3884 * @key: is directly passed to the wakeup function
3831 * 3885 *
3832 * It may be assumed that this function implies a write memory barrier before 3886 * It may be assumed that this function implies a write memory barrier before
3833 * changing the task state if and only if any tasks are woken up. 3887 * changing the task state if and only if any tasks are woken up.
3834 */ 3888 */
3835 void __wake_up(wait_queue_head_t *q, unsigned int mode, 3889 void __wake_up(wait_queue_head_t *q, unsigned int mode,
3836 int nr_exclusive, void *key) 3890 int nr_exclusive, void *key)
3837 { 3891 {
3838 unsigned long flags; 3892 unsigned long flags;
3839 3893
3840 spin_lock_irqsave(&q->lock, flags); 3894 spin_lock_irqsave(&q->lock, flags);
3841 __wake_up_common(q, mode, nr_exclusive, 0, key); 3895 __wake_up_common(q, mode, nr_exclusive, 0, key);
3842 spin_unlock_irqrestore(&q->lock, flags); 3896 spin_unlock_irqrestore(&q->lock, flags);
3843 } 3897 }
3844 EXPORT_SYMBOL(__wake_up); 3898 EXPORT_SYMBOL(__wake_up);
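A minimal usage sketch of the exclusive-wakeup behaviour described above (my_wq and condition are hypothetical, and my_wq is assumed to have been set up with init_waitqueue_head()); wake_up(&my_wq) expands to __wake_up(&my_wq, TASK_NORMAL, 1, NULL), so exactly one exclusive waiter is woken per call:

	static wait_queue_head_t my_wq;		/* hypothetical */
	static int condition;			/* hypothetical */

	static void wait_for_it(void)
	{
		DEFINE_WAIT(wait);

		for (;;) {
			prepare_to_wait_exclusive(&my_wq, &wait,
						  TASK_UNINTERRUPTIBLE);
			if (condition)
				break;
			schedule();
		}
		finish_wait(&my_wq, &wait);
	}

	static void make_it_so(void)
	{
		condition = 1;
		wake_up(&my_wq);	/* wakes one exclusive waiter */
	}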
3845 3899
3846 /* 3900 /*
3847 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 3901 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3848 */ 3902 */
3849 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 3903 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3850 { 3904 {
3851 __wake_up_common(q, mode, 1, 0, NULL); 3905 __wake_up_common(q, mode, 1, 0, NULL);
3852 } 3906 }
3853 3907
3854 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 3908 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3855 { 3909 {
3856 __wake_up_common(q, mode, 1, 0, key); 3910 __wake_up_common(q, mode, 1, 0, key);
3857 } 3911 }
3858 3912
3859 /** 3913 /**
3860 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 3914 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
3861 * @q: the waitqueue 3915 * @q: the waitqueue
3862 * @mode: which threads 3916 * @mode: which threads
3863 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3917 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3864 * @key: opaque value to be passed to wakeup targets 3918 * @key: opaque value to be passed to wakeup targets
3865 * 3919 *
3866 * The sync wakeup differs in that the waker knows that it will schedule 3920 * The sync wakeup differs in that the waker knows that it will schedule
3867 * away soon, so while the target thread will be woken up, it will not 3921 * away soon, so while the target thread will be woken up, it will not
3868 * be migrated to another CPU - ie. the two threads are 'synchronized' 3922 * be migrated to another CPU - ie. the two threads are 'synchronized'
3869 * with each other. This can prevent needless bouncing between CPUs. 3923 * with each other. This can prevent needless bouncing between CPUs.
3870 * 3924 *
3871 * On UP it can prevent extra preemption. 3925 * On UP it can prevent extra preemption.
3872 * 3926 *
3873 * It may be assumed that this function implies a write memory barrier before 3927 * It may be assumed that this function implies a write memory barrier before
3874 * changing the task state if and only if any tasks are woken up. 3928 * changing the task state if and only if any tasks are woken up.
3875 */ 3929 */
3876 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 3930 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
3877 int nr_exclusive, void *key) 3931 int nr_exclusive, void *key)
3878 { 3932 {
3879 unsigned long flags; 3933 unsigned long flags;
3880 int wake_flags = WF_SYNC; 3934 int wake_flags = WF_SYNC;
3881 3935
3882 if (unlikely(!q)) 3936 if (unlikely(!q))
3883 return; 3937 return;
3884 3938
3885 if (unlikely(!nr_exclusive)) 3939 if (unlikely(!nr_exclusive))
3886 wake_flags = 0; 3940 wake_flags = 0;
3887 3941
3888 spin_lock_irqsave(&q->lock, flags); 3942 spin_lock_irqsave(&q->lock, flags);
3889 __wake_up_common(q, mode, nr_exclusive, wake_flags, key); 3943 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
3890 spin_unlock_irqrestore(&q->lock, flags); 3944 spin_unlock_irqrestore(&q->lock, flags);
3891 } 3945 }
3892 EXPORT_SYMBOL_GPL(__wake_up_sync_key); 3946 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
3893 3947
3894 /* 3948 /*
3895 * __wake_up_sync - see __wake_up_sync_key() 3949 * __wake_up_sync - see __wake_up_sync_key()
3896 */ 3950 */
3897 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 3951 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3898 { 3952 {
3899 __wake_up_sync_key(q, mode, nr_exclusive, NULL); 3953 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
3900 } 3954 }
3901 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 3955 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3902 3956
3903 /** 3957 /**
3904 * complete: - signals a single thread waiting on this completion 3958 * complete: - signals a single thread waiting on this completion
3905 * @x: holds the state of this particular completion 3959 * @x: holds the state of this particular completion
3906 * 3960 *
3907 * This will wake up a single thread waiting on this completion. Threads will be 3961 * This will wake up a single thread waiting on this completion. Threads will be
3908 * awakened in the same order in which they were queued. 3962 * awakened in the same order in which they were queued.
3909 * 3963 *
3910 * See also complete_all(), wait_for_completion() and related routines. 3964 * See also complete_all(), wait_for_completion() and related routines.
3911 * 3965 *
3912 * It may be assumed that this function implies a write memory barrier before 3966 * It may be assumed that this function implies a write memory barrier before
3913 * changing the task state if and only if any tasks are woken up. 3967 * changing the task state if and only if any tasks are woken up.
3914 */ 3968 */
3915 void complete(struct completion *x) 3969 void complete(struct completion *x)
3916 { 3970 {
3917 unsigned long flags; 3971 unsigned long flags;
3918 3972
3919 spin_lock_irqsave(&x->wait.lock, flags); 3973 spin_lock_irqsave(&x->wait.lock, flags);
3920 x->done++; 3974 x->done++;
3921 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); 3975 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
3922 spin_unlock_irqrestore(&x->wait.lock, flags); 3976 spin_unlock_irqrestore(&x->wait.lock, flags);
3923 } 3977 }
3924 EXPORT_SYMBOL(complete); 3978 EXPORT_SYMBOL(complete);
3925 3979
3926 /** 3980 /**
3927 * complete_all: - signals all threads waiting on this completion 3981 * complete_all: - signals all threads waiting on this completion
3928 * @x: holds the state of this particular completion 3982 * @x: holds the state of this particular completion
3929 * 3983 *
3930 * This will wake up all threads waiting on this particular completion event. 3984 * This will wake up all threads waiting on this particular completion event.
3931 * 3985 *
3932 * It may be assumed that this function implies a write memory barrier before 3986 * It may be assumed that this function implies a write memory barrier before
3933 * changing the task state if and only if any tasks are woken up. 3987 * changing the task state if and only if any tasks are woken up.
3934 */ 3988 */
3935 void complete_all(struct completion *x) 3989 void complete_all(struct completion *x)
3936 { 3990 {
3937 unsigned long flags; 3991 unsigned long flags;
3938 3992
3939 spin_lock_irqsave(&x->wait.lock, flags); 3993 spin_lock_irqsave(&x->wait.lock, flags);
3940 x->done += UINT_MAX/2; 3994 x->done += UINT_MAX/2;
3941 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); 3995 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
3942 spin_unlock_irqrestore(&x->wait.lock, flags); 3996 spin_unlock_irqrestore(&x->wait.lock, flags);
3943 } 3997 }
3944 EXPORT_SYMBOL(complete_all); 3998 EXPORT_SYMBOL(complete_all);
3945 3999
3946 static inline long __sched 4000 static inline long __sched
3947 do_wait_for_common(struct completion *x, long timeout, int state) 4001 do_wait_for_common(struct completion *x, long timeout, int state)
3948 { 4002 {
3949 if (!x->done) { 4003 if (!x->done) {
3950 DECLARE_WAITQUEUE(wait, current); 4004 DECLARE_WAITQUEUE(wait, current);
3951 4005
3952 wait.flags |= WQ_FLAG_EXCLUSIVE; 4006 wait.flags |= WQ_FLAG_EXCLUSIVE;
3953 __add_wait_queue_tail(&x->wait, &wait); 4007 __add_wait_queue_tail(&x->wait, &wait);
3954 do { 4008 do {
3955 if (signal_pending_state(state, current)) { 4009 if (signal_pending_state(state, current)) {
3956 timeout = -ERESTARTSYS; 4010 timeout = -ERESTARTSYS;
3957 break; 4011 break;
3958 } 4012 }
3959 __set_current_state(state); 4013 __set_current_state(state);
3960 spin_unlock_irq(&x->wait.lock); 4014 spin_unlock_irq(&x->wait.lock);
3961 timeout = schedule_timeout(timeout); 4015 timeout = schedule_timeout(timeout);
3962 spin_lock_irq(&x->wait.lock); 4016 spin_lock_irq(&x->wait.lock);
3963 } while (!x->done && timeout); 4017 } while (!x->done && timeout);
3964 __remove_wait_queue(&x->wait, &wait); 4018 __remove_wait_queue(&x->wait, &wait);
3965 if (!x->done) 4019 if (!x->done)
3966 return timeout; 4020 return timeout;
3967 } 4021 }
3968 x->done--; 4022 x->done--;
3969 return timeout ?: 1; 4023 return timeout ?: 1;
3970 } 4024 }
3971 4025
3972 static long __sched 4026 static long __sched
3973 wait_for_common(struct completion *x, long timeout, int state) 4027 wait_for_common(struct completion *x, long timeout, int state)
3974 { 4028 {
3975 might_sleep(); 4029 might_sleep();
3976 4030
3977 spin_lock_irq(&x->wait.lock); 4031 spin_lock_irq(&x->wait.lock);
3978 timeout = do_wait_for_common(x, timeout, state); 4032 timeout = do_wait_for_common(x, timeout, state);
3979 spin_unlock_irq(&x->wait.lock); 4033 spin_unlock_irq(&x->wait.lock);
3980 return timeout; 4034 return timeout;
3981 } 4035 }
3982 4036
3983 /** 4037 /**
3984 * wait_for_completion: - waits for completion of a task 4038 * wait_for_completion: - waits for completion of a task
3985 * @x: holds the state of this particular completion 4039 * @x: holds the state of this particular completion
3986 * 4040 *
3987 * This waits to be signaled for completion of a specific task. It is NOT 4041 * This waits to be signaled for completion of a specific task. It is NOT
3988 * interruptible and there is no timeout. 4042 * interruptible and there is no timeout.
3989 * 4043 *
3990 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout 4044 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
3991 * and interrupt capability. Also see complete(). 4045 * and interrupt capability. Also see complete().
3992 */ 4046 */
3993 void __sched wait_for_completion(struct completion *x) 4047 void __sched wait_for_completion(struct completion *x)
3994 { 4048 {
3995 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 4049 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3996 } 4050 }
3997 EXPORT_SYMBOL(wait_for_completion); 4051 EXPORT_SYMBOL(wait_for_completion);
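A minimal usage sketch of the completion API documented above (setup_done and the function names are hypothetical):

	static DECLARE_COMPLETION(setup_done);	/* or init_completion() at runtime */

	static int waiter_thread(void *unused)
	{
		/* Sleeps in TASK_UNINTERRUPTIBLE until complete() is called. */
		wait_for_completion(&setup_done);
		return 0;
	}

	static void finish_setup(void)
	{
		/* ... work the waiter depends on ... */
		complete(&setup_done);	/* wakes exactly one waiter, in queue order */
	}

complete_all() would instead release every waiter currently queued on setup_done.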
3998 4052
3999 /** 4053 /**
4000 * wait_for_completion_timeout: - waits for completion of a task (w/timeout) 4054 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4001 * @x: holds the state of this particular completion 4055 * @x: holds the state of this particular completion
4002 * @timeout: timeout value in jiffies 4056 * @timeout: timeout value in jiffies
4003 * 4057 *
4004 * This waits for either a completion of a specific task to be signaled or for a 4058 * This waits for either a completion of a specific task to be signaled or for a
4005 * specified timeout to expire. The timeout is in jiffies. It is not 4059 * specified timeout to expire. The timeout is in jiffies. It is not
4006 * interruptible. 4060 * interruptible.
4007 */ 4061 */
4008 unsigned long __sched 4062 unsigned long __sched
4009 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4063 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4010 { 4064 {
4011 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); 4065 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4012 } 4066 }
4013 EXPORT_SYMBOL(wait_for_completion_timeout); 4067 EXPORT_SYMBOL(wait_for_completion_timeout);
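/*
 * Editorial example (not part of this diff): handling the return value of
 * wait_for_completion_timeout(). The helper and its 500ms budget are
 * hypothetical; a return of 0 means the timeout expired, a non-zero return
 * is the number of jiffies that were still left when complete() ran.
 */
#include <linux/completion.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

static int wait_up_to_500ms(struct completion *done)
{
        unsigned long left;

        left = wait_for_completion_timeout(done, msecs_to_jiffies(500));
        if (!left)
                return -ETIMEDOUT;      /* timer expired before complete() */

        return 0;                       /* signalled with 'left' jiffies to spare */
}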
4014 4068
4015 /** 4069 /**
4016 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 4070 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4017 * @x: holds the state of this particular completion 4071 * @x: holds the state of this particular completion
4018 * 4072 *
4019 * This waits for completion of a specific task to be signaled. It is 4073 * This waits for completion of a specific task to be signaled. It is
4020 * interruptible. 4074 * interruptible.
4021 */ 4075 */
4022 int __sched wait_for_completion_interruptible(struct completion *x) 4076 int __sched wait_for_completion_interruptible(struct completion *x)
4023 { 4077 {
4024 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 4078 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4025 if (t == -ERESTARTSYS) 4079 if (t == -ERESTARTSYS)
4026 return t; 4080 return t;
4027 return 0; 4081 return 0;
4028 } 4082 }
4029 EXPORT_SYMBOL(wait_for_completion_interruptible); 4083 EXPORT_SYMBOL(wait_for_completion_interruptible);
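/*
 * Editorial example (not part of this diff): a hypothetical helper showing
 * the -ERESTARTSYS check callers of the interruptible variant need.
 */
#include <linux/completion.h>
#include <linux/errno.h>

static int wait_or_signal(struct completion *done)
{
        int ret = wait_for_completion_interruptible(done);

        if (ret == -ERESTARTSYS)
                return ret;     /* a signal interrupted the wait */

        return 0;               /* the completion was signalled */
}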
4030 4084
4031 /** 4085 /**
4032 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) 4086 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4033 * @x: holds the state of this particular completion 4087 * @x: holds the state of this particular completion
4034 * @timeout: timeout value in jiffies 4088 * @timeout: timeout value in jiffies
4035 * 4089 *
4036 * This waits for either a completion of a specific task to be signaled or for a 4090 * This waits for either a completion of a specific task to be signaled or for a
4037 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4091 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4038 */ 4092 */
4039 unsigned long __sched 4093 unsigned long __sched
4040 wait_for_completion_interruptible_timeout(struct completion *x, 4094 wait_for_completion_interruptible_timeout(struct completion *x,
4041 unsigned long timeout) 4095 unsigned long timeout)
4042 { 4096 {
4043 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); 4097 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4044 } 4098 }
4045 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4099 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4046 4100
4047 /** 4101 /**
4048 * wait_for_completion_killable: - waits for completion of a task (killable) 4102 * wait_for_completion_killable: - waits for completion of a task (killable)
4049 * @x: holds the state of this particular completion 4103 * @x: holds the state of this particular completion
4050 * 4104 *
4051 * This waits to be signaled for completion of a specific task. It can be 4105 * This waits to be signaled for completion of a specific task. It can be
4052 * interrupted by a kill signal. 4106 * interrupted by a kill signal.
4053 */ 4107 */
4054 int __sched wait_for_completion_killable(struct completion *x) 4108 int __sched wait_for_completion_killable(struct completion *x)
4055 { 4109 {
4056 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 4110 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4057 if (t == -ERESTARTSYS) 4111 if (t == -ERESTARTSYS)
4058 return t; 4112 return t;
4059 return 0; 4113 return 0;
4060 } 4114 }
4061 EXPORT_SYMBOL(wait_for_completion_killable); 4115 EXPORT_SYMBOL(wait_for_completion_killable);
4062 4116
4063 /** 4117 /**
4064 * try_wait_for_completion - try to decrement a completion without blocking 4118 * try_wait_for_completion - try to decrement a completion without blocking
4065 * @x: completion structure 4119 * @x: completion structure
4066 * 4120 *
4067 * Returns: 0 if a decrement cannot be done without blocking 4121 * Returns: 0 if a decrement cannot be done without blocking
4068 * 1 if a decrement succeeded. 4122 * 1 if a decrement succeeded.
4069 * 4123 *
4070 * If a completion is being used as a counting completion, 4124 * If a completion is being used as a counting completion,
4071 * attempt to decrement the counter without blocking. This 4125 * attempt to decrement the counter without blocking. This
4072 * enables us to avoid waiting if the resource the completion 4126 * enables us to avoid waiting if the resource the completion
4073 * is protecting is not available. 4127 * is protecting is not available.
4074 */ 4128 */
4075 bool try_wait_for_completion(struct completion *x) 4129 bool try_wait_for_completion(struct completion *x)
4076 { 4130 {
4077 unsigned long flags; 4131 unsigned long flags;
4078 int ret = 1; 4132 int ret = 1;
4079 4133
4080 spin_lock_irqsave(&x->wait.lock, flags); 4134 spin_lock_irqsave(&x->wait.lock, flags);
4081 if (!x->done) 4135 if (!x->done)
4082 ret = 0; 4136 ret = 0;
4083 else 4137 else
4084 x->done--; 4138 x->done--;
4085 spin_unlock_irqrestore(&x->wait.lock, flags); 4139 spin_unlock_irqrestore(&x->wait.lock, flags);
4086 return ret; 4140 return ret;
4087 } 4141 }
4088 EXPORT_SYMBOL(try_wait_for_completion); 4142 EXPORT_SYMBOL(try_wait_for_completion);
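/*
 * Editorial example (not part of this diff): a hypothetical fast path that
 * consumes one unit of a counting completion only if that cannot block.
 */
#include <linux/completion.h>

static bool fast_path_acquire(struct completion *resource_ready)
{
        if (try_wait_for_completion(resource_ready))
                return true;    /* done counter was > 0, one unit consumed */

        return false;           /* would have blocked; take the slow path */
}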
4089 4143
4090 /** 4144 /**
4091 * completion_done - Test to see if a completion has any waiters 4145 * completion_done - Test to see if a completion has any waiters
4092 * @x: completion structure 4146 * @x: completion structure
4093 * 4147 *
4094 * Returns: 0 if there are waiters (wait_for_completion() in progress) 4148 * Returns: 0 if there are waiters (wait_for_completion() in progress)
4095 * 1 if there are no waiters. 4149 * 1 if there are no waiters.
4096 * 4150 *
4097 */ 4151 */
4098 bool completion_done(struct completion *x) 4152 bool completion_done(struct completion *x)
4099 { 4153 {
4100 unsigned long flags; 4154 unsigned long flags;
4101 int ret = 1; 4155 int ret = 1;
4102 4156
4103 spin_lock_irqsave(&x->wait.lock, flags); 4157 spin_lock_irqsave(&x->wait.lock, flags);
4104 if (!x->done) 4158 if (!x->done)
4105 ret = 0; 4159 ret = 0;
4106 spin_unlock_irqrestore(&x->wait.lock, flags); 4160 spin_unlock_irqrestore(&x->wait.lock, flags);
4107 return ret; 4161 return ret;
4108 } 4162 }
4109 EXPORT_SYMBOL(completion_done); 4163 EXPORT_SYMBOL(completion_done);
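/*
 * Editorial example (not part of this diff): a hypothetical teardown check.
 * completion_done() only samples x->done under the wait-queue lock; it does
 * not keep new waiters from arriving afterwards.
 */
#include <linux/completion.h>

static bool safe_to_teardown(struct completion *x)
{
        return completion_done(x);      /* true when no waiter is pending */
}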
4110 4164
4111 static long __sched 4165 static long __sched
4112 sleep_on_common(wait_queue_head_t *q, int state, long timeout) 4166 sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4113 { 4167 {
4114 unsigned long flags; 4168 unsigned long flags;
4115 wait_queue_t wait; 4169 wait_queue_t wait;
4116 4170
4117 init_waitqueue_entry(&wait, current); 4171 init_waitqueue_entry(&wait, current);
4118 4172
4119 __set_current_state(state); 4173 __set_current_state(state);
4120 4174
4121 spin_lock_irqsave(&q->lock, flags); 4175 spin_lock_irqsave(&q->lock, flags);
4122 __add_wait_queue(q, &wait); 4176 __add_wait_queue(q, &wait);
4123 spin_unlock(&q->lock); 4177 spin_unlock(&q->lock);
4124 timeout = schedule_timeout(timeout); 4178 timeout = schedule_timeout(timeout);
4125 spin_lock_irq(&q->lock); 4179 spin_lock_irq(&q->lock);
4126 __remove_wait_queue(q, &wait); 4180 __remove_wait_queue(q, &wait);
4127 spin_unlock_irqrestore(&q->lock, flags); 4181 spin_unlock_irqrestore(&q->lock, flags);
4128 4182
4129 return timeout; 4183 return timeout;
4130 } 4184 }
4131 4185
4132 void __sched interruptible_sleep_on(wait_queue_head_t *q) 4186 void __sched interruptible_sleep_on(wait_queue_head_t *q)
4133 { 4187 {
4134 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 4188 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4135 } 4189 }
4136 EXPORT_SYMBOL(interruptible_sleep_on); 4190 EXPORT_SYMBOL(interruptible_sleep_on);
4137 4191
4138 long __sched 4192 long __sched
4139 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 4193 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4140 { 4194 {
4141 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); 4195 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
4142 } 4196 }
4143 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 4197 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4144 4198
4145 void __sched sleep_on(wait_queue_head_t *q) 4199 void __sched sleep_on(wait_queue_head_t *q)
4146 { 4200 {
4147 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 4201 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4148 } 4202 }
4149 EXPORT_SYMBOL(sleep_on); 4203 EXPORT_SYMBOL(sleep_on);
4150 4204
4151 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 4205 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4152 { 4206 {
4153 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); 4207 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4154 } 4208 }
4155 EXPORT_SYMBOL(sleep_on_timeout); 4209 EXPORT_SYMBOL(sleep_on_timeout);
4156 4210
4157 #ifdef CONFIG_RT_MUTEXES 4211 #ifdef CONFIG_RT_MUTEXES
4158 4212
4159 /* 4213 /*
4160 * rt_mutex_setprio - set the current priority of a task 4214 * rt_mutex_setprio - set the current priority of a task
4161 * @p: task 4215 * @p: task
4162 * @prio: prio value (kernel-internal form) 4216 * @prio: prio value (kernel-internal form)
4163 * 4217 *
4164 * This function changes the 'effective' priority of a task. It does 4218 * This function changes the 'effective' priority of a task. It does
4165 * not touch ->normal_prio like __setscheduler(). 4219 * not touch ->normal_prio like __setscheduler().
4166 * 4220 *
4167 * Used by the rt_mutex code to implement priority inheritance logic. 4221 * Used by the rt_mutex code to implement priority inheritance logic.
4168 */ 4222 */
4169 void rt_mutex_setprio(struct task_struct *p, int prio) 4223 void rt_mutex_setprio(struct task_struct *p, int prio)
4170 { 4224 {
4171 unsigned long flags; 4225 unsigned long flags;
4172 int oldprio, on_rq, running; 4226 int oldprio, on_rq, running;
4173 struct rq *rq; 4227 struct rq *rq;
4174 const struct sched_class *prev_class; 4228 const struct sched_class *prev_class;
4175 4229
4176 BUG_ON(prio < 0 || prio > MAX_PRIO); 4230 BUG_ON(prio < 0 || prio > MAX_PRIO);
4177 4231
4178 rq = task_rq_lock(p, &flags); 4232 rq = task_rq_lock(p, &flags);
4179 4233
4180 oldprio = p->prio; 4234 oldprio = p->prio;
4181 prev_class = p->sched_class; 4235 prev_class = p->sched_class;
4182 on_rq = p->se.on_rq; 4236 on_rq = p->se.on_rq;
4183 running = task_current(rq, p); 4237 running = task_current(rq, p);
4184 if (on_rq) 4238 if (on_rq)
4185 dequeue_task(rq, p, 0); 4239 dequeue_task(rq, p, 0);
4186 if (running) 4240 if (running)
4187 p->sched_class->put_prev_task(rq, p); 4241 p->sched_class->put_prev_task(rq, p);
4188 4242
4189 if (rt_prio(prio)) 4243 if (rt_prio(prio))
4190 p->sched_class = &rt_sched_class; 4244 p->sched_class = &rt_sched_class;
4191 else 4245 else
4192 p->sched_class = &fair_sched_class; 4246 p->sched_class = &fair_sched_class;
4193 4247
4194 p->prio = prio; 4248 p->prio = prio;
4195 4249
4196 if (running) 4250 if (running)
4197 p->sched_class->set_curr_task(rq); 4251 p->sched_class->set_curr_task(rq);
4198 if (on_rq) { 4252 if (on_rq) {
4199 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4253 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4200 4254
4201 check_class_changed(rq, p, prev_class, oldprio, running); 4255 check_class_changed(rq, p, prev_class, oldprio, running);
4202 } 4256 }
4203 task_rq_unlock(rq, &flags); 4257 task_rq_unlock(rq, &flags);
4204 } 4258 }
4205 4259
4206 #endif 4260 #endif
4207 4261
4208 void set_user_nice(struct task_struct *p, long nice) 4262 void set_user_nice(struct task_struct *p, long nice)
4209 { 4263 {
4210 int old_prio, delta, on_rq; 4264 int old_prio, delta, on_rq;
4211 unsigned long flags; 4265 unsigned long flags;
4212 struct rq *rq; 4266 struct rq *rq;
4213 4267
4214 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 4268 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4215 return; 4269 return;
4216 /* 4270 /*
4217 * We have to be careful, if called from sys_setpriority(), 4271 * We have to be careful, if called from sys_setpriority(),
4218 * the task might be in the middle of scheduling on another CPU. 4272 * the task might be in the middle of scheduling on another CPU.
4219 */ 4273 */
4220 rq = task_rq_lock(p, &flags); 4274 rq = task_rq_lock(p, &flags);
4221 /* 4275 /*
4222 * The RT priorities are set via sched_setscheduler(), but we still 4276 * The RT priorities are set via sched_setscheduler(), but we still
4223 * allow the 'normal' nice value to be set - but as expected 4277 * allow the 'normal' nice value to be set - but as expected
4224 * it won't have any effect on scheduling until the task is 4278 * it won't have any effect on scheduling until the task is
4225 * SCHED_FIFO/SCHED_RR: 4279 * SCHED_FIFO/SCHED_RR:
4226 */ 4280 */
4227 if (task_has_rt_policy(p)) { 4281 if (task_has_rt_policy(p)) {
4228 p->static_prio = NICE_TO_PRIO(nice); 4282 p->static_prio = NICE_TO_PRIO(nice);
4229 goto out_unlock; 4283 goto out_unlock;
4230 } 4284 }
4231 on_rq = p->se.on_rq; 4285 on_rq = p->se.on_rq;
4232 if (on_rq) 4286 if (on_rq)
4233 dequeue_task(rq, p, 0); 4287 dequeue_task(rq, p, 0);
4234 4288
4235 p->static_prio = NICE_TO_PRIO(nice); 4289 p->static_prio = NICE_TO_PRIO(nice);
4236 set_load_weight(p); 4290 set_load_weight(p);
4237 old_prio = p->prio; 4291 old_prio = p->prio;
4238 p->prio = effective_prio(p); 4292 p->prio = effective_prio(p);
4239 delta = p->prio - old_prio; 4293 delta = p->prio - old_prio;
4240 4294
4241 if (on_rq) { 4295 if (on_rq) {
4242 enqueue_task(rq, p, 0); 4296 enqueue_task(rq, p, 0);
4243 /* 4297 /*
4244 * If the task increased its priority or is running and 4298 * If the task increased its priority or is running and
4245 * lowered its priority, then reschedule its CPU: 4299 * lowered its priority, then reschedule its CPU:
4246 */ 4300 */
4247 if (delta < 0 || (delta > 0 && task_running(rq, p))) 4301 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4248 resched_task(rq->curr); 4302 resched_task(rq->curr);
4249 } 4303 }
4250 out_unlock: 4304 out_unlock:
4251 task_rq_unlock(rq, &flags); 4305 task_rq_unlock(rq, &flags);
4252 } 4306 }
4253 EXPORT_SYMBOL(set_user_nice); 4307 EXPORT_SYMBOL(set_user_nice);
4254 4308
4255 /* 4309 /*
4256 * can_nice - check if a task can reduce its nice value 4310 * can_nice - check if a task can reduce its nice value
4257 * @p: task 4311 * @p: task
4258 * @nice: nice value 4312 * @nice: nice value
4259 */ 4313 */
4260 int can_nice(const struct task_struct *p, const int nice) 4314 int can_nice(const struct task_struct *p, const int nice)
4261 { 4315 {
4262 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4316 /* convert nice value [19,-20] to rlimit style value [1,40] */
4263 int nice_rlim = 20 - nice; 4317 int nice_rlim = 20 - nice;
4264 4318
4265 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 4319 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
4266 capable(CAP_SYS_NICE)); 4320 capable(CAP_SYS_NICE));
4267 } 4321 }
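/*
 * Editorial worked example (not part of this diff): the mapping used in
 * can_nice() is rlim = 20 - nice, so nice 19 needs RLIMIT_NICE >= 1,
 * nice 0 needs >= 20 and nice -20 needs >= 40 (or CAP_SYS_NICE). The helper
 * below is hypothetical and only restates that arithmetic.
 */
static inline int nice_to_rlimit_example(int nice)
{
        return 20 - nice;       /* e.g. nice -10 requires RLIMIT_NICE >= 30 */
}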
4268 4322
4269 #ifdef __ARCH_WANT_SYS_NICE 4323 #ifdef __ARCH_WANT_SYS_NICE
4270 4324
4271 /* 4325 /*
4272 * sys_nice - change the priority of the current process. 4326 * sys_nice - change the priority of the current process.
4273 * @increment: priority increment 4327 * @increment: priority increment
4274 * 4328 *
4275 * sys_setpriority is a more generic, but much slower function that 4329 * sys_setpriority is a more generic, but much slower function that
4276 * does similar things. 4330 * does similar things.
4277 */ 4331 */
4278 SYSCALL_DEFINE1(nice, int, increment) 4332 SYSCALL_DEFINE1(nice, int, increment)
4279 { 4333 {
4280 long nice, retval; 4334 long nice, retval;
4281 4335
4282 /* 4336 /*
4283 * Setpriority might change our priority at the same moment. 4337 * Setpriority might change our priority at the same moment.
4284 * We don't have to worry. Conceptually one call occurs first 4338 * We don't have to worry. Conceptually one call occurs first
4285 * and we have a single winner. 4339 * and we have a single winner.
4286 */ 4340 */
4287 if (increment < -40) 4341 if (increment < -40)
4288 increment = -40; 4342 increment = -40;
4289 if (increment > 40) 4343 if (increment > 40)
4290 increment = 40; 4344 increment = 40;
4291 4345
4292 nice = TASK_NICE(current) + increment; 4346 nice = TASK_NICE(current) + increment;
4293 if (nice < -20) 4347 if (nice < -20)
4294 nice = -20; 4348 nice = -20;
4295 if (nice > 19) 4349 if (nice > 19)
4296 nice = 19; 4350 nice = 19;
4297 4351
4298 if (increment < 0 && !can_nice(current, nice)) 4352 if (increment < 0 && !can_nice(current, nice))
4299 return -EPERM; 4353 return -EPERM;
4300 4354
4301 retval = security_task_setnice(current, nice); 4355 retval = security_task_setnice(current, nice);
4302 if (retval) 4356 if (retval)
4303 return retval; 4357 return retval;
4304 4358
4305 set_user_nice(current, nice); 4359 set_user_nice(current, nice);
4306 return 0; 4360 return 0;
4307 } 4361 }
4308 4362
4309 #endif 4363 #endif
4310 4364
4311 /** 4365 /**
4312 * task_prio - return the priority value of a given task. 4366 * task_prio - return the priority value of a given task.
4313 * @p: the task in question. 4367 * @p: the task in question.
4314 * 4368 *
4315 * This is the priority value as seen by users in /proc. 4369 * This is the priority value as seen by users in /proc.
4316 * RT tasks are offset by -200. Normal tasks are centered 4370 * RT tasks are offset by -200. Normal tasks are centered
4317 * around 0, value goes from -16 to +15. 4371 * around 0, value goes from -16 to +15.
4318 */ 4372 */
4319 int task_prio(const struct task_struct *p) 4373 int task_prio(const struct task_struct *p)
4320 { 4374 {
4321 return p->prio - MAX_RT_PRIO; 4375 return p->prio - MAX_RT_PRIO;
4322 } 4376 }
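/*
 * Editorial worked example (not part of this diff), following the code above
 * rather than the older comment text: with MAX_RT_PRIO == 100 and
 * NICE_TO_PRIO(n) == 120 + n,
 *
 *   SCHED_NORMAL, nice   0    ->  p->prio 120  ->  task_prio() ==  20
 *   SCHED_NORMAL, nice -20    ->  p->prio 100  ->  task_prio() ==   0
 *   SCHED_FIFO, rt_priority 1 ->  p->prio  98  ->  task_prio() ==  -2
 */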
4323 4377
4324 /** 4378 /**
4325 * task_nice - return the nice value of a given task. 4379 * task_nice - return the nice value of a given task.
4326 * @p: the task in question. 4380 * @p: the task in question.
4327 */ 4381 */
4328 int task_nice(const struct task_struct *p) 4382 int task_nice(const struct task_struct *p)
4329 { 4383 {
4330 return TASK_NICE(p); 4384 return TASK_NICE(p);
4331 } 4385 }
4332 EXPORT_SYMBOL(task_nice); 4386 EXPORT_SYMBOL(task_nice);
4333 4387
4334 /** 4388 /**
4335 * idle_cpu - is a given cpu idle currently? 4389 * idle_cpu - is a given cpu idle currently?
4336 * @cpu: the processor in question. 4390 * @cpu: the processor in question.
4337 */ 4391 */
4338 int idle_cpu(int cpu) 4392 int idle_cpu(int cpu)
4339 { 4393 {
4340 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 4394 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4341 } 4395 }
4342 4396
4343 /** 4397 /**
4344 * idle_task - return the idle task for a given cpu. 4398 * idle_task - return the idle task for a given cpu.
4345 * @cpu: the processor in question. 4399 * @cpu: the processor in question.
4346 */ 4400 */
4347 struct task_struct *idle_task(int cpu) 4401 struct task_struct *idle_task(int cpu)
4348 { 4402 {
4349 return cpu_rq(cpu)->idle; 4403 return cpu_rq(cpu)->idle;
4350 } 4404 }
4351 4405
4352 /** 4406 /**
4353 * find_process_by_pid - find a process with a matching PID value. 4407 * find_process_by_pid - find a process with a matching PID value.
4354 * @pid: the pid in question. 4408 * @pid: the pid in question.
4355 */ 4409 */
4356 static struct task_struct *find_process_by_pid(pid_t pid) 4410 static struct task_struct *find_process_by_pid(pid_t pid)
4357 { 4411 {
4358 return pid ? find_task_by_vpid(pid) : current; 4412 return pid ? find_task_by_vpid(pid) : current;
4359 } 4413 }
4360 4414
4361 /* Actually do priority change: must hold rq lock. */ 4415 /* Actually do priority change: must hold rq lock. */
4362 static void 4416 static void
4363 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 4417 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4364 { 4418 {
4365 BUG_ON(p->se.on_rq); 4419 BUG_ON(p->se.on_rq);
4366 4420
4367 p->policy = policy; 4421 p->policy = policy;
4368 p->rt_priority = prio; 4422 p->rt_priority = prio;
4369 p->normal_prio = normal_prio(p); 4423 p->normal_prio = normal_prio(p);
4370 /* we are holding p->pi_lock already */ 4424 /* we are holding p->pi_lock already */
4371 p->prio = rt_mutex_getprio(p); 4425 p->prio = rt_mutex_getprio(p);
4372 if (rt_prio(p->prio)) 4426 if (rt_prio(p->prio))
4373 p->sched_class = &rt_sched_class; 4427 p->sched_class = &rt_sched_class;
4374 else 4428 else
4375 p->sched_class = &fair_sched_class; 4429 p->sched_class = &fair_sched_class;
4376 set_load_weight(p); 4430 set_load_weight(p);
4377 } 4431 }
4378 4432
4379 /* 4433 /*
4380 * check the target process has a UID that matches the current process's 4434 * check the target process has a UID that matches the current process's
4381 */ 4435 */
4382 static bool check_same_owner(struct task_struct *p) 4436 static bool check_same_owner(struct task_struct *p)
4383 { 4437 {
4384 const struct cred *cred = current_cred(), *pcred; 4438 const struct cred *cred = current_cred(), *pcred;
4385 bool match; 4439 bool match;
4386 4440
4387 rcu_read_lock(); 4441 rcu_read_lock();
4388 pcred = __task_cred(p); 4442 pcred = __task_cred(p);
4389 match = (cred->euid == pcred->euid || 4443 match = (cred->euid == pcred->euid ||
4390 cred->euid == pcred->uid); 4444 cred->euid == pcred->uid);
4391 rcu_read_unlock(); 4445 rcu_read_unlock();
4392 return match; 4446 return match;
4393 } 4447 }
4394 4448
4395 static int __sched_setscheduler(struct task_struct *p, int policy, 4449 static int __sched_setscheduler(struct task_struct *p, int policy,
4396 struct sched_param *param, bool user) 4450 struct sched_param *param, bool user)
4397 { 4451 {
4398 int retval, oldprio, oldpolicy = -1, on_rq, running; 4452 int retval, oldprio, oldpolicy = -1, on_rq, running;
4399 unsigned long flags; 4453 unsigned long flags;
4400 const struct sched_class *prev_class; 4454 const struct sched_class *prev_class;
4401 struct rq *rq; 4455 struct rq *rq;
4402 int reset_on_fork; 4456 int reset_on_fork;
4403 4457
4404 /* may grab non-irq protected spin_locks */ 4458 /* may grab non-irq protected spin_locks */
4405 BUG_ON(in_interrupt()); 4459 BUG_ON(in_interrupt());
4406 recheck: 4460 recheck:
4407 /* double check policy once rq lock held */ 4461 /* double check policy once rq lock held */
4408 if (policy < 0) { 4462 if (policy < 0) {
4409 reset_on_fork = p->sched_reset_on_fork; 4463 reset_on_fork = p->sched_reset_on_fork;
4410 policy = oldpolicy = p->policy; 4464 policy = oldpolicy = p->policy;
4411 } else { 4465 } else {
4412 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 4466 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
4413 policy &= ~SCHED_RESET_ON_FORK; 4467 policy &= ~SCHED_RESET_ON_FORK;
4414 4468
4415 if (policy != SCHED_FIFO && policy != SCHED_RR && 4469 if (policy != SCHED_FIFO && policy != SCHED_RR &&
4416 policy != SCHED_NORMAL && policy != SCHED_BATCH && 4470 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4417 policy != SCHED_IDLE) 4471 policy != SCHED_IDLE)
4418 return -EINVAL; 4472 return -EINVAL;
4419 } 4473 }
4420 4474
4421 /* 4475 /*
4422 * Valid priorities for SCHED_FIFO and SCHED_RR are 4476 * Valid priorities for SCHED_FIFO and SCHED_RR are
4423 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 4477 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4424 * SCHED_BATCH and SCHED_IDLE is 0. 4478 * SCHED_BATCH and SCHED_IDLE is 0.
4425 */ 4479 */
4426 if (param->sched_priority < 0 || 4480 if (param->sched_priority < 0 ||
4427 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4481 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4428 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4482 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4429 return -EINVAL; 4483 return -EINVAL;
4430 if (rt_policy(policy) != (param->sched_priority != 0)) 4484 if (rt_policy(policy) != (param->sched_priority != 0))
4431 return -EINVAL; 4485 return -EINVAL;
4432 4486
4433 /* 4487 /*
4434 * Allow unprivileged RT tasks to decrease priority: 4488 * Allow unprivileged RT tasks to decrease priority:
4435 */ 4489 */
4436 if (user && !capable(CAP_SYS_NICE)) { 4490 if (user && !capable(CAP_SYS_NICE)) {
4437 if (rt_policy(policy)) { 4491 if (rt_policy(policy)) {
4438 unsigned long rlim_rtprio; 4492 unsigned long rlim_rtprio;
4439 4493
4440 if (!lock_task_sighand(p, &flags)) 4494 if (!lock_task_sighand(p, &flags))
4441 return -ESRCH; 4495 return -ESRCH;
4442 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); 4496 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
4443 unlock_task_sighand(p, &flags); 4497 unlock_task_sighand(p, &flags);
4444 4498
4445 /* can't set/change the rt policy */ 4499 /* can't set/change the rt policy */
4446 if (policy != p->policy && !rlim_rtprio) 4500 if (policy != p->policy && !rlim_rtprio)
4447 return -EPERM; 4501 return -EPERM;
4448 4502
4449 /* can't increase priority */ 4503 /* can't increase priority */
4450 if (param->sched_priority > p->rt_priority && 4504 if (param->sched_priority > p->rt_priority &&
4451 param->sched_priority > rlim_rtprio) 4505 param->sched_priority > rlim_rtprio)
4452 return -EPERM; 4506 return -EPERM;
4453 } 4507 }
4454 /* 4508 /*
4455 * Like positive nice levels, don't allow tasks to 4509 * Like positive nice levels, don't allow tasks to
4456 * move out of SCHED_IDLE either: 4510 * move out of SCHED_IDLE either:
4457 */ 4511 */
4458 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 4512 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4459 return -EPERM; 4513 return -EPERM;
4460 4514
4461 /* can't change other user's priorities */ 4515 /* can't change other user's priorities */
4462 if (!check_same_owner(p)) 4516 if (!check_same_owner(p))
4463 return -EPERM; 4517 return -EPERM;
4464 4518
4465 /* Normal users shall not reset the sched_reset_on_fork flag */ 4519 /* Normal users shall not reset the sched_reset_on_fork flag */
4466 if (p->sched_reset_on_fork && !reset_on_fork) 4520 if (p->sched_reset_on_fork && !reset_on_fork)
4467 return -EPERM; 4521 return -EPERM;
4468 } 4522 }
4469 4523
4470 if (user) { 4524 if (user) {
4471 #ifdef CONFIG_RT_GROUP_SCHED 4525 #ifdef CONFIG_RT_GROUP_SCHED
4472 /* 4526 /*
4473 * Do not allow realtime tasks into groups that have no runtime 4527 * Do not allow realtime tasks into groups that have no runtime
4474 * assigned. 4528 * assigned.
4475 */ 4529 */
4476 if (rt_bandwidth_enabled() && rt_policy(policy) && 4530 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4477 task_group(p)->rt_bandwidth.rt_runtime == 0) 4531 task_group(p)->rt_bandwidth.rt_runtime == 0)
4478 return -EPERM; 4532 return -EPERM;
4479 #endif 4533 #endif
4480 4534
4481 retval = security_task_setscheduler(p, policy, param); 4535 retval = security_task_setscheduler(p, policy, param);
4482 if (retval) 4536 if (retval)
4483 return retval; 4537 return retval;
4484 } 4538 }
4485 4539
4486 /* 4540 /*
4487 * make sure no PI-waiters arrive (or leave) while we are 4541 * make sure no PI-waiters arrive (or leave) while we are
4488 * changing the priority of the task: 4542 * changing the priority of the task:
4489 */ 4543 */
4490 raw_spin_lock_irqsave(&p->pi_lock, flags); 4544 raw_spin_lock_irqsave(&p->pi_lock, flags);
4491 /* 4545 /*
4492 * To be able to change p->policy safely, the appropriate 4546 * To be able to change p->policy safely, the appropriate
4493 * runqueue lock must be held. 4547 * runqueue lock must be held.
4494 */ 4548 */
4495 rq = __task_rq_lock(p); 4549 rq = __task_rq_lock(p);
4496 /* recheck policy now with rq lock held */ 4550 /* recheck policy now with rq lock held */
4497 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4551 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4498 policy = oldpolicy = -1; 4552 policy = oldpolicy = -1;
4499 __task_rq_unlock(rq); 4553 __task_rq_unlock(rq);
4500 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4554 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4501 goto recheck; 4555 goto recheck;
4502 } 4556 }
4503 on_rq = p->se.on_rq; 4557 on_rq = p->se.on_rq;
4504 running = task_current(rq, p); 4558 running = task_current(rq, p);
4505 if (on_rq) 4559 if (on_rq)
4506 deactivate_task(rq, p, 0); 4560 deactivate_task(rq, p, 0);
4507 if (running) 4561 if (running)
4508 p->sched_class->put_prev_task(rq, p); 4562 p->sched_class->put_prev_task(rq, p);
4509 4563
4510 p->sched_reset_on_fork = reset_on_fork; 4564 p->sched_reset_on_fork = reset_on_fork;
4511 4565
4512 oldprio = p->prio; 4566 oldprio = p->prio;
4513 prev_class = p->sched_class; 4567 prev_class = p->sched_class;
4514 __setscheduler(rq, p, policy, param->sched_priority); 4568 __setscheduler(rq, p, policy, param->sched_priority);
4515 4569
4516 if (running) 4570 if (running)
4517 p->sched_class->set_curr_task(rq); 4571 p->sched_class->set_curr_task(rq);
4518 if (on_rq) { 4572 if (on_rq) {
4519 activate_task(rq, p, 0); 4573 activate_task(rq, p, 0);
4520 4574
4521 check_class_changed(rq, p, prev_class, oldprio, running); 4575 check_class_changed(rq, p, prev_class, oldprio, running);
4522 } 4576 }
4523 __task_rq_unlock(rq); 4577 __task_rq_unlock(rq);
4524 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4578 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4525 4579
4526 rt_mutex_adjust_pi(p); 4580 rt_mutex_adjust_pi(p);
4527 4581
4528 return 0; 4582 return 0;
4529 } 4583 }
4530 4584
4531 /** 4585 /**
4532 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 4586 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4533 * @p: the task in question. 4587 * @p: the task in question.
4534 * @policy: new policy. 4588 * @policy: new policy.
4535 * @param: structure containing the new RT priority. 4589 * @param: structure containing the new RT priority.
4536 * 4590 *
4537 * NOTE that the task may be already dead. 4591 * NOTE that the task may be already dead.
4538 */ 4592 */
4539 int sched_setscheduler(struct task_struct *p, int policy, 4593 int sched_setscheduler(struct task_struct *p, int policy,
4540 struct sched_param *param) 4594 struct sched_param *param)
4541 { 4595 {
4542 return __sched_setscheduler(p, policy, param, true); 4596 return __sched_setscheduler(p, policy, param, true);
4543 } 4597 }
4544 EXPORT_SYMBOL_GPL(sched_setscheduler); 4598 EXPORT_SYMBOL_GPL(sched_setscheduler);
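/*
 * Editorial example (not part of this diff): the common in-kernel pattern for
 * promoting a kernel thread to SCHED_FIFO. The priority value 50 and the
 * helper name are arbitrary choices for illustration.
 */
#include <linux/sched.h>

static int make_thread_rt(struct task_struct *tsk)
{
        struct sched_param param = { .sched_priority = 50 };

        return sched_setscheduler(tsk, SCHED_FIFO, &param);
}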
4545 4599
4546 /** 4600 /**
4547 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 4601 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
4548 * @p: the task in question. 4602 * @p: the task in question.
4549 * @policy: new policy. 4603 * @policy: new policy.
4550 * @param: structure containing the new RT priority. 4604 * @param: structure containing the new RT priority.
4551 * 4605 *
4552 * Just like sched_setscheduler, only don't bother checking if the 4606 * Just like sched_setscheduler, only don't bother checking if the
4553 * current context has permission. For example, this is needed in 4607 * current context has permission. For example, this is needed in
4554 * stop_machine(): we create temporary high priority worker threads, 4608 * stop_machine(): we create temporary high priority worker threads,
4555 * but our caller might not have that capability. 4609 * but our caller might not have that capability.
4556 */ 4610 */
4557 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4611 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4558 struct sched_param *param) 4612 struct sched_param *param)
4559 { 4613 {
4560 return __sched_setscheduler(p, policy, param, false); 4614 return __sched_setscheduler(p, policy, param, false);
4561 } 4615 }
4562 4616
4563 static int 4617 static int
4564 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 4618 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4565 { 4619 {
4566 struct sched_param lparam; 4620 struct sched_param lparam;
4567 struct task_struct *p; 4621 struct task_struct *p;
4568 int retval; 4622 int retval;
4569 4623
4570 if (!param || pid < 0) 4624 if (!param || pid < 0)
4571 return -EINVAL; 4625 return -EINVAL;
4572 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 4626 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4573 return -EFAULT; 4627 return -EFAULT;
4574 4628
4575 rcu_read_lock(); 4629 rcu_read_lock();
4576 retval = -ESRCH; 4630 retval = -ESRCH;
4577 p = find_process_by_pid(pid); 4631 p = find_process_by_pid(pid);
4578 if (p != NULL) 4632 if (p != NULL)
4579 retval = sched_setscheduler(p, policy, &lparam); 4633 retval = sched_setscheduler(p, policy, &lparam);
4580 rcu_read_unlock(); 4634 rcu_read_unlock();
4581 4635
4582 return retval; 4636 return retval;
4583 } 4637 }
4584 4638
4585 /** 4639 /**
4586 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 4640 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4587 * @pid: the pid in question. 4641 * @pid: the pid in question.
4588 * @policy: new policy. 4642 * @policy: new policy.
4589 * @param: structure containing the new RT priority. 4643 * @param: structure containing the new RT priority.
4590 */ 4644 */
4591 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 4645 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4592 struct sched_param __user *, param) 4646 struct sched_param __user *, param)
4593 { 4647 {
4594 /* negative values for policy are not valid */ 4648 /* negative values for policy are not valid */
4595 if (policy < 0) 4649 if (policy < 0)
4596 return -EINVAL; 4650 return -EINVAL;
4597 4651
4598 return do_sched_setscheduler(pid, policy, param); 4652 return do_sched_setscheduler(pid, policy, param);
4599 } 4653 }
4600 4654
4601 /** 4655 /**
4602 * sys_sched_setparam - set/change the RT priority of a thread 4656 * sys_sched_setparam - set/change the RT priority of a thread
4603 * @pid: the pid in question. 4657 * @pid: the pid in question.
4604 * @param: structure containing the new RT priority. 4658 * @param: structure containing the new RT priority.
4605 */ 4659 */
4606 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 4660 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4607 { 4661 {
4608 return do_sched_setscheduler(pid, -1, param); 4662 return do_sched_setscheduler(pid, -1, param);
4609 } 4663 }
4610 4664
4611 /** 4665 /**
4612 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 4666 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4613 * @pid: the pid in question. 4667 * @pid: the pid in question.
4614 */ 4668 */
4615 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 4669 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4616 { 4670 {
4617 struct task_struct *p; 4671 struct task_struct *p;
4618 int retval; 4672 int retval;
4619 4673
4620 if (pid < 0) 4674 if (pid < 0)
4621 return -EINVAL; 4675 return -EINVAL;
4622 4676
4623 retval = -ESRCH; 4677 retval = -ESRCH;
4624 rcu_read_lock(); 4678 rcu_read_lock();
4625 p = find_process_by_pid(pid); 4679 p = find_process_by_pid(pid);
4626 if (p) { 4680 if (p) {
4627 retval = security_task_getscheduler(p); 4681 retval = security_task_getscheduler(p);
4628 if (!retval) 4682 if (!retval)
4629 retval = p->policy 4683 retval = p->policy
4630 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 4684 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4631 } 4685 }
4632 rcu_read_unlock(); 4686 rcu_read_unlock();
4633 return retval; 4687 return retval;
4634 } 4688 }
4635 4689
4636 /** 4690 /**
4637 * sys_sched_getparam - get the RT priority of a thread 4691 * sys_sched_getparam - get the RT priority of a thread
4638 * @pid: the pid in question. 4692 * @pid: the pid in question.
4639 * @param: structure containing the RT priority. 4693 * @param: structure containing the RT priority.
4640 */ 4694 */
4641 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 4695 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4642 { 4696 {
4643 struct sched_param lp; 4697 struct sched_param lp;
4644 struct task_struct *p; 4698 struct task_struct *p;
4645 int retval; 4699 int retval;
4646 4700
4647 if (!param || pid < 0) 4701 if (!param || pid < 0)
4648 return -EINVAL; 4702 return -EINVAL;
4649 4703
4650 rcu_read_lock(); 4704 rcu_read_lock();
4651 p = find_process_by_pid(pid); 4705 p = find_process_by_pid(pid);
4652 retval = -ESRCH; 4706 retval = -ESRCH;
4653 if (!p) 4707 if (!p)
4654 goto out_unlock; 4708 goto out_unlock;
4655 4709
4656 retval = security_task_getscheduler(p); 4710 retval = security_task_getscheduler(p);
4657 if (retval) 4711 if (retval)
4658 goto out_unlock; 4712 goto out_unlock;
4659 4713
4660 lp.sched_priority = p->rt_priority; 4714 lp.sched_priority = p->rt_priority;
4661 rcu_read_unlock(); 4715 rcu_read_unlock();
4662 4716
4663 /* 4717 /*
4664 * This one might sleep, we cannot do it with a spinlock held ... 4718 * This one might sleep, we cannot do it with a spinlock held ...
4665 */ 4719 */
4666 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 4720 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4667 4721
4668 return retval; 4722 return retval;
4669 4723
4670 out_unlock: 4724 out_unlock:
4671 rcu_read_unlock(); 4725 rcu_read_unlock();
4672 return retval; 4726 return retval;
4673 } 4727 }
4674 4728
4675 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 4729 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4676 { 4730 {
4677 cpumask_var_t cpus_allowed, new_mask; 4731 cpumask_var_t cpus_allowed, new_mask;
4678 struct task_struct *p; 4732 struct task_struct *p;
4679 int retval; 4733 int retval;
4680 4734
4681 get_online_cpus(); 4735 get_online_cpus();
4682 rcu_read_lock(); 4736 rcu_read_lock();
4683 4737
4684 p = find_process_by_pid(pid); 4738 p = find_process_by_pid(pid);
4685 if (!p) { 4739 if (!p) {
4686 rcu_read_unlock(); 4740 rcu_read_unlock();
4687 put_online_cpus(); 4741 put_online_cpus();
4688 return -ESRCH; 4742 return -ESRCH;
4689 } 4743 }
4690 4744
4691 /* Prevent p going away */ 4745 /* Prevent p going away */
4692 get_task_struct(p); 4746 get_task_struct(p);
4693 rcu_read_unlock(); 4747 rcu_read_unlock();
4694 4748
4695 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4749 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4696 retval = -ENOMEM; 4750 retval = -ENOMEM;
4697 goto out_put_task; 4751 goto out_put_task;
4698 } 4752 }
4699 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 4753 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4700 retval = -ENOMEM; 4754 retval = -ENOMEM;
4701 goto out_free_cpus_allowed; 4755 goto out_free_cpus_allowed;
4702 } 4756 }
4703 retval = -EPERM; 4757 retval = -EPERM;
4704 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 4758 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
4705 goto out_unlock; 4759 goto out_unlock;
4706 4760
4707 retval = security_task_setscheduler(p, 0, NULL); 4761 retval = security_task_setscheduler(p, 0, NULL);
4708 if (retval) 4762 if (retval)
4709 goto out_unlock; 4763 goto out_unlock;
4710 4764
4711 cpuset_cpus_allowed(p, cpus_allowed); 4765 cpuset_cpus_allowed(p, cpus_allowed);
4712 cpumask_and(new_mask, in_mask, cpus_allowed); 4766 cpumask_and(new_mask, in_mask, cpus_allowed);
4713 again: 4767 again:
4714 retval = set_cpus_allowed_ptr(p, new_mask); 4768 retval = set_cpus_allowed_ptr(p, new_mask);
4715 4769
4716 if (!retval) { 4770 if (!retval) {
4717 cpuset_cpus_allowed(p, cpus_allowed); 4771 cpuset_cpus_allowed(p, cpus_allowed);
4718 if (!cpumask_subset(new_mask, cpus_allowed)) { 4772 if (!cpumask_subset(new_mask, cpus_allowed)) {
4719 /* 4773 /*
4720 * We must have raced with a concurrent cpuset 4774 * We must have raced with a concurrent cpuset
4721 * update. Just reset the cpus_allowed to the 4775 * update. Just reset the cpus_allowed to the
4722 * cpuset's cpus_allowed 4776 * cpuset's cpus_allowed
4723 */ 4777 */
4724 cpumask_copy(new_mask, cpus_allowed); 4778 cpumask_copy(new_mask, cpus_allowed);
4725 goto again; 4779 goto again;
4726 } 4780 }
4727 } 4781 }
4728 out_unlock: 4782 out_unlock:
4729 free_cpumask_var(new_mask); 4783 free_cpumask_var(new_mask);
4730 out_free_cpus_allowed: 4784 out_free_cpus_allowed:
4731 free_cpumask_var(cpus_allowed); 4785 free_cpumask_var(cpus_allowed);
4732 out_put_task: 4786 out_put_task:
4733 put_task_struct(p); 4787 put_task_struct(p);
4734 put_online_cpus(); 4788 put_online_cpus();
4735 return retval; 4789 return retval;
4736 } 4790 }
4737 4791
4738 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 4792 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4739 struct cpumask *new_mask) 4793 struct cpumask *new_mask)
4740 { 4794 {
4741 if (len < cpumask_size()) 4795 if (len < cpumask_size())
4742 cpumask_clear(new_mask); 4796 cpumask_clear(new_mask);
4743 else if (len > cpumask_size()) 4797 else if (len > cpumask_size())
4744 len = cpumask_size(); 4798 len = cpumask_size();
4745 4799
4746 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 4800 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4747 } 4801 }
4748 4802
4749 /** 4803 /**
4750 * sys_sched_setaffinity - set the cpu affinity of a process 4804 * sys_sched_setaffinity - set the cpu affinity of a process
4751 * @pid: pid of the process 4805 * @pid: pid of the process
4752 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4806 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4753 * @user_mask_ptr: user-space pointer to the new cpu mask 4807 * @user_mask_ptr: user-space pointer to the new cpu mask
4754 */ 4808 */
4755 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 4809 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4756 unsigned long __user *, user_mask_ptr) 4810 unsigned long __user *, user_mask_ptr)
4757 { 4811 {
4758 cpumask_var_t new_mask; 4812 cpumask_var_t new_mask;
4759 int retval; 4813 int retval;
4760 4814
4761 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 4815 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4762 return -ENOMEM; 4816 return -ENOMEM;
4763 4817
4764 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 4818 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4765 if (retval == 0) 4819 if (retval == 0)
4766 retval = sched_setaffinity(pid, new_mask); 4820 retval = sched_setaffinity(pid, new_mask);
4767 free_cpumask_var(new_mask); 4821 free_cpumask_var(new_mask);
4768 return retval; 4822 return retval;
4769 } 4823 }
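/*
 * Editorial example (not part of this diff): pinning the calling process to
 * CPU 0 from userspace via the glibc wrapper around this syscall.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);

        if (sched_setaffinity(0, sizeof(set), &set) == -1) {
                perror("sched_setaffinity");
                return 1;
        }
        return 0;
}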
4770 4824
4771 long sched_getaffinity(pid_t pid, struct cpumask *mask) 4825 long sched_getaffinity(pid_t pid, struct cpumask *mask)
4772 { 4826 {
4773 struct task_struct *p; 4827 struct task_struct *p;
4774 unsigned long flags; 4828 unsigned long flags;
4775 struct rq *rq; 4829 struct rq *rq;
4776 int retval; 4830 int retval;
4777 4831
4778 get_online_cpus(); 4832 get_online_cpus();
4779 rcu_read_lock(); 4833 rcu_read_lock();
4780 4834
4781 retval = -ESRCH; 4835 retval = -ESRCH;
4782 p = find_process_by_pid(pid); 4836 p = find_process_by_pid(pid);
4783 if (!p) 4837 if (!p)
4784 goto out_unlock; 4838 goto out_unlock;
4785 4839
4786 retval = security_task_getscheduler(p); 4840 retval = security_task_getscheduler(p);
4787 if (retval) 4841 if (retval)
4788 goto out_unlock; 4842 goto out_unlock;
4789 4843
4790 rq = task_rq_lock(p, &flags); 4844 rq = task_rq_lock(p, &flags);
4791 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 4845 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
4792 task_rq_unlock(rq, &flags); 4846 task_rq_unlock(rq, &flags);
4793 4847
4794 out_unlock: 4848 out_unlock:
4795 rcu_read_unlock(); 4849 rcu_read_unlock();
4796 put_online_cpus(); 4850 put_online_cpus();
4797 4851
4798 return retval; 4852 return retval;
4799 } 4853 }
4800 4854
4801 /** 4855 /**
4802 * sys_sched_getaffinity - get the cpu affinity of a process 4856 * sys_sched_getaffinity - get the cpu affinity of a process
4803 * @pid: pid of the process 4857 * @pid: pid of the process
4804 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4858 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4805 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4859 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4806 */ 4860 */
4807 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 4861 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4808 unsigned long __user *, user_mask_ptr) 4862 unsigned long __user *, user_mask_ptr)
4809 { 4863 {
4810 int ret; 4864 int ret;
4811 cpumask_var_t mask; 4865 cpumask_var_t mask;
4812 4866
4813 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 4867 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4814 return -EINVAL; 4868 return -EINVAL;
4815 if (len & (sizeof(unsigned long)-1)) 4869 if (len & (sizeof(unsigned long)-1))
4816 return -EINVAL; 4870 return -EINVAL;
4817 4871
4818 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4872 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4819 return -ENOMEM; 4873 return -ENOMEM;
4820 4874
4821 ret = sched_getaffinity(pid, mask); 4875 ret = sched_getaffinity(pid, mask);
4822 if (ret == 0) { 4876 if (ret == 0) {
4823 size_t retlen = min_t(size_t, len, cpumask_size()); 4877 size_t retlen = min_t(size_t, len, cpumask_size());
4824 4878
4825 if (copy_to_user(user_mask_ptr, mask, retlen)) 4879 if (copy_to_user(user_mask_ptr, mask, retlen))
4826 ret = -EFAULT; 4880 ret = -EFAULT;
4827 else 4881 else
4828 ret = retlen; 4882 ret = retlen;
4829 } 4883 }
4830 free_cpumask_var(mask); 4884 free_cpumask_var(mask);
4831 4885
4832 return ret; 4886 return ret;
4833 } 4887 }
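/*
 * Editorial example (not part of this diff): reading the affinity mask back
 * from userspace. The glibc wrapper hides the retlen value returned by the
 * raw syscall and simply reports success or failure.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;
        int cpu;

        if (sched_getaffinity(0, sizeof(set), &set) == -1) {
                perror("sched_getaffinity");
                return 1;
        }
        for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
                if (CPU_ISSET(cpu, &set))
                        printf("allowed: cpu %d\n", cpu);
        return 0;
}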
4834 4888
4835 /** 4889 /**
4836 * sys_sched_yield - yield the current processor to other threads. 4890 * sys_sched_yield - yield the current processor to other threads.
4837 * 4891 *
4838 * This function yields the current CPU to other tasks. If there are no 4892 * This function yields the current CPU to other tasks. If there are no
4839 * other threads running on this CPU then this function will return. 4893 * other threads running on this CPU then this function will return.
4840 */ 4894 */
4841 SYSCALL_DEFINE0(sched_yield) 4895 SYSCALL_DEFINE0(sched_yield)
4842 { 4896 {
4843 struct rq *rq = this_rq_lock(); 4897 struct rq *rq = this_rq_lock();
4844 4898
4845 schedstat_inc(rq, yld_count); 4899 schedstat_inc(rq, yld_count);
4846 current->sched_class->yield_task(rq); 4900 current->sched_class->yield_task(rq);
4847 4901
4848 /* 4902 /*
4849 * Since we are going to call schedule() anyway, there's 4903 * Since we are going to call schedule() anyway, there's
4850 * no need to preempt or enable interrupts: 4904 * no need to preempt or enable interrupts:
4851 */ 4905 */
4852 __release(rq->lock); 4906 __release(rq->lock);
4853 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4907 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4854 do_raw_spin_unlock(&rq->lock); 4908 do_raw_spin_unlock(&rq->lock);
4855 preempt_enable_no_resched(); 4909 preempt_enable_no_resched();
4856 4910
4857 schedule(); 4911 schedule();
4858 4912
4859 return 0; 4913 return 0;
4860 } 4914 }
4861 4915
4862 static inline int should_resched(void) 4916 static inline int should_resched(void)
4863 { 4917 {
4864 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); 4918 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
4865 } 4919 }
4866 4920
4867 static void __cond_resched(void) 4921 static void __cond_resched(void)
4868 { 4922 {
4869 add_preempt_count(PREEMPT_ACTIVE); 4923 add_preempt_count(PREEMPT_ACTIVE);
4870 schedule(); 4924 schedule();
4871 sub_preempt_count(PREEMPT_ACTIVE); 4925 sub_preempt_count(PREEMPT_ACTIVE);
4872 } 4926 }
4873 4927
4874 int __sched _cond_resched(void) 4928 int __sched _cond_resched(void)
4875 { 4929 {
4876 if (should_resched()) { 4930 if (should_resched()) {
4877 __cond_resched(); 4931 __cond_resched();
4878 return 1; 4932 return 1;
4879 } 4933 }
4880 return 0; 4934 return 0;
4881 } 4935 }
4882 EXPORT_SYMBOL(_cond_resched); 4936 EXPORT_SYMBOL(_cond_resched);
4883 4937
4884 /* 4938 /*
4885 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 4939 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
4886 * call schedule, and on return reacquire the lock. 4940 * call schedule, and on return reacquire the lock.
4887 * 4941 *
4888 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4942 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4889 * operations here to prevent schedule() from being called twice (once via 4943 * operations here to prevent schedule() from being called twice (once via
4890 * spin_unlock(), once by hand). 4944 * spin_unlock(), once by hand).
4891 */ 4945 */
4892 int __cond_resched_lock(spinlock_t *lock) 4946 int __cond_resched_lock(spinlock_t *lock)
4893 { 4947 {
4894 int resched = should_resched(); 4948 int resched = should_resched();
4895 int ret = 0; 4949 int ret = 0;
4896 4950
4897 lockdep_assert_held(lock); 4951 lockdep_assert_held(lock);
4898 4952
4899 if (spin_needbreak(lock) || resched) { 4953 if (spin_needbreak(lock) || resched) {
4900 spin_unlock(lock); 4954 spin_unlock(lock);
4901 if (resched) 4955 if (resched)
4902 __cond_resched(); 4956 __cond_resched();
4903 else 4957 else
4904 cpu_relax(); 4958 cpu_relax();
4905 ret = 1; 4959 ret = 1;
4906 spin_lock(lock); 4960 spin_lock(lock);
4907 } 4961 }
4908 return ret; 4962 return ret;
4909 } 4963 }
4910 EXPORT_SYMBOL(__cond_resched_lock); 4964 EXPORT_SYMBOL(__cond_resched_lock);
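/*
 * Editorial example (not part of this diff): the typical pattern for a long
 * scan under a spinlock, using the public cond_resched_lock() wrapper. The
 * table being scanned is hypothetical and assumed stable across the brief
 * unlock window.
 */
#include <linux/sched.h>
#include <linux/spinlock.h>

static void scan_table(spinlock_t *lock, int nr_entries)
{
        int i;

        spin_lock(lock);
        for (i = 0; i < nr_entries; i++) {
                /* ... examine entry i ... */

                /* Drop the lock and reschedule if needed or contended. */
                cond_resched_lock(lock);
        }
        spin_unlock(lock);
}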
4911 4965
4912 int __sched __cond_resched_softirq(void) 4966 int __sched __cond_resched_softirq(void)
4913 { 4967 {
4914 BUG_ON(!in_softirq()); 4968 BUG_ON(!in_softirq());
4915 4969
4916 if (should_resched()) { 4970 if (should_resched()) {
4917 local_bh_enable(); 4971 local_bh_enable();
4918 __cond_resched(); 4972 __cond_resched();
4919 local_bh_disable(); 4973 local_bh_disable();
4920 return 1; 4974 return 1;
4921 } 4975 }
4922 return 0; 4976 return 0;
4923 } 4977 }
4924 EXPORT_SYMBOL(__cond_resched_softirq); 4978 EXPORT_SYMBOL(__cond_resched_softirq);
4925 4979
4926 /** 4980 /**
4927 * yield - yield the current processor to other threads. 4981 * yield - yield the current processor to other threads.
4928 * 4982 *
4929 * This is a shortcut for kernel-space yielding - it marks the 4983 * This is a shortcut for kernel-space yielding - it marks the
4930 * thread runnable and calls sys_sched_yield(). 4984 * thread runnable and calls sys_sched_yield().
4931 */ 4985 */
4932 void __sched yield(void) 4986 void __sched yield(void)
4933 { 4987 {
4934 set_current_state(TASK_RUNNING); 4988 set_current_state(TASK_RUNNING);
4935 sys_sched_yield(); 4989 sys_sched_yield();
4936 } 4990 }
4937 EXPORT_SYMBOL(yield); 4991 EXPORT_SYMBOL(yield);
4938 4992
4939 /* 4993 /*
4940 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4994 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4941 * that process accounting knows that this is a task in IO wait state. 4995 * that process accounting knows that this is a task in IO wait state.
4942 */ 4996 */
4943 void __sched io_schedule(void) 4997 void __sched io_schedule(void)
4944 { 4998 {
4945 struct rq *rq = raw_rq(); 4999 struct rq *rq = raw_rq();
4946 5000
4947 delayacct_blkio_start(); 5001 delayacct_blkio_start();
4948 atomic_inc(&rq->nr_iowait); 5002 atomic_inc(&rq->nr_iowait);
4949 current->in_iowait = 1; 5003 current->in_iowait = 1;
4950 schedule(); 5004 schedule();
4951 current->in_iowait = 0; 5005 current->in_iowait = 0;
4952 atomic_dec(&rq->nr_iowait); 5006 atomic_dec(&rq->nr_iowait);
4953 delayacct_blkio_end(); 5007 delayacct_blkio_end();
4954 } 5008 }
4955 EXPORT_SYMBOL(io_schedule); 5009 EXPORT_SYMBOL(io_schedule);
4956 5010
4957 long __sched io_schedule_timeout(long timeout) 5011 long __sched io_schedule_timeout(long timeout)
4958 { 5012 {
4959 struct rq *rq = raw_rq(); 5013 struct rq *rq = raw_rq();
4960 long ret; 5014 long ret;
4961 5015
4962 delayacct_blkio_start(); 5016 delayacct_blkio_start();
4963 atomic_inc(&rq->nr_iowait); 5017 atomic_inc(&rq->nr_iowait);
4964 current->in_iowait = 1; 5018 current->in_iowait = 1;
4965 ret = schedule_timeout(timeout); 5019 ret = schedule_timeout(timeout);
4966 current->in_iowait = 0; 5020 current->in_iowait = 0;
4967 atomic_dec(&rq->nr_iowait); 5021 atomic_dec(&rq->nr_iowait);
4968 delayacct_blkio_end(); 5022 delayacct_blkio_end();
4969 return ret; 5023 return ret;
4970 } 5024 }
4971 5025
4972 /** 5026 /**
4973 * sys_sched_get_priority_max - return maximum RT priority. 5027 * sys_sched_get_priority_max - return maximum RT priority.
4974 * @policy: scheduling class. 5028 * @policy: scheduling class.
4975 * 5029 *
4976 * this syscall returns the maximum rt_priority that can be used 5030 * this syscall returns the maximum rt_priority that can be used
4977 * by a given scheduling class. 5031 * by a given scheduling class.
4978 */ 5032 */
4979 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 5033 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4980 { 5034 {
4981 int ret = -EINVAL; 5035 int ret = -EINVAL;
4982 5036
4983 switch (policy) { 5037 switch (policy) {
4984 case SCHED_FIFO: 5038 case SCHED_FIFO:
4985 case SCHED_RR: 5039 case SCHED_RR:
4986 ret = MAX_USER_RT_PRIO-1; 5040 ret = MAX_USER_RT_PRIO-1;
4987 break; 5041 break;
4988 case SCHED_NORMAL: 5042 case SCHED_NORMAL:
4989 case SCHED_BATCH: 5043 case SCHED_BATCH:
4990 case SCHED_IDLE: 5044 case SCHED_IDLE:
4991 ret = 0; 5045 ret = 0;
4992 break; 5046 break;
4993 } 5047 }
4994 return ret; 5048 return ret;
4995 } 5049 }
4996 5050
4997 /** 5051 /**
4998 * sys_sched_get_priority_min - return minimum RT priority. 5052 * sys_sched_get_priority_min - return minimum RT priority.
4999 * @policy: scheduling class. 5053 * @policy: scheduling class.
5000 * 5054 *
5001 * this syscall returns the minimum rt_priority that can be used 5055 * this syscall returns the minimum rt_priority that can be used
5002 * by a given scheduling class. 5056 * by a given scheduling class.
5003 */ 5057 */
5004 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 5058 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5005 { 5059 {
5006 int ret = -EINVAL; 5060 int ret = -EINVAL;
5007 5061
5008 switch (policy) { 5062 switch (policy) {
5009 case SCHED_FIFO: 5063 case SCHED_FIFO:
5010 case SCHED_RR: 5064 case SCHED_RR:
5011 ret = 1; 5065 ret = 1;
5012 break; 5066 break;
5013 case SCHED_NORMAL: 5067 case SCHED_NORMAL:
5014 case SCHED_BATCH: 5068 case SCHED_BATCH:
5015 case SCHED_IDLE: 5069 case SCHED_IDLE:
5016 ret = 0; 5070 ret = 0;
5017 } 5071 }
5018 return ret; 5072 return ret;
5019 } 5073 }
5020 5074
5021 /** 5075 /**
5022 * sys_sched_rr_get_interval - return the default timeslice of a process. 5076 * sys_sched_rr_get_interval - return the default timeslice of a process.
5023 * @pid: pid of the process. 5077 * @pid: pid of the process.
5024 * @interval: userspace pointer to the timeslice value. 5078 * @interval: userspace pointer to the timeslice value.
5025 * 5079 *
5026 * this syscall writes the default timeslice value of a given process 5080 * this syscall writes the default timeslice value of a given process
5027 * into the user-space timespec buffer. A value of '0' means infinity. 5081 * into the user-space timespec buffer. A value of '0' means infinity.
5028 */ 5082 */
5029 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 5083 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5030 struct timespec __user *, interval) 5084 struct timespec __user *, interval)
5031 { 5085 {
5032 struct task_struct *p; 5086 struct task_struct *p;
5033 unsigned int time_slice; 5087 unsigned int time_slice;
5034 unsigned long flags; 5088 unsigned long flags;
5035 struct rq *rq; 5089 struct rq *rq;
5036 int retval; 5090 int retval;
5037 struct timespec t; 5091 struct timespec t;
5038 5092
5039 if (pid < 0) 5093 if (pid < 0)
5040 return -EINVAL; 5094 return -EINVAL;
5041 5095
5042 retval = -ESRCH; 5096 retval = -ESRCH;
5043 rcu_read_lock(); 5097 rcu_read_lock();
5044 p = find_process_by_pid(pid); 5098 p = find_process_by_pid(pid);
5045 if (!p) 5099 if (!p)
5046 goto out_unlock; 5100 goto out_unlock;
5047 5101
5048 retval = security_task_getscheduler(p); 5102 retval = security_task_getscheduler(p);
5049 if (retval) 5103 if (retval)
5050 goto out_unlock; 5104 goto out_unlock;
5051 5105
5052 rq = task_rq_lock(p, &flags); 5106 rq = task_rq_lock(p, &flags);
5053 time_slice = p->sched_class->get_rr_interval(rq, p); 5107 time_slice = p->sched_class->get_rr_interval(rq, p);
5054 task_rq_unlock(rq, &flags); 5108 task_rq_unlock(rq, &flags);
5055 5109
5056 rcu_read_unlock(); 5110 rcu_read_unlock();
5057 jiffies_to_timespec(time_slice, &t); 5111 jiffies_to_timespec(time_slice, &t);
5058 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 5112 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5059 return retval; 5113 return retval;
5060 5114
5061 out_unlock: 5115 out_unlock:
5062 rcu_read_unlock(); 5116 rcu_read_unlock();
5063 return retval; 5117 return retval;
5064 } 5118 }
5065 5119
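A matching userspace sketch for sys_sched_rr_get_interval(); pid 0 means the calling task, and per the comment above a 0/0 result is to be read as "infinity" (illustrative only, not from this patch):

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* pid 0 queries the calling task */
	if (sched_rr_get_interval(0, &ts) != 0) {
		perror("sched_rr_get_interval");
		return 1;
	}
	printf("reported timeslice: %ld.%09ld s\n",
	       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}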
5066 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 5120 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5067 5121
5068 void sched_show_task(struct task_struct *p) 5122 void sched_show_task(struct task_struct *p)
5069 { 5123 {
5070 unsigned long free = 0; 5124 unsigned long free = 0;
5071 unsigned state; 5125 unsigned state;
5072 5126
5073 state = p->state ? __ffs(p->state) + 1 : 0; 5127 state = p->state ? __ffs(p->state) + 1 : 0;
5074 printk(KERN_INFO "%-13.13s %c", p->comm, 5128 printk(KERN_INFO "%-13.13s %c", p->comm,
5075 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5129 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5076 #if BITS_PER_LONG == 32 5130 #if BITS_PER_LONG == 32
5077 if (state == TASK_RUNNING) 5131 if (state == TASK_RUNNING)
5078 printk(KERN_CONT " running "); 5132 printk(KERN_CONT " running ");
5079 else 5133 else
5080 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 5134 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5081 #else 5135 #else
5082 if (state == TASK_RUNNING) 5136 if (state == TASK_RUNNING)
5083 printk(KERN_CONT " running task "); 5137 printk(KERN_CONT " running task ");
5084 else 5138 else
5085 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 5139 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5086 #endif 5140 #endif
5087 #ifdef CONFIG_DEBUG_STACK_USAGE 5141 #ifdef CONFIG_DEBUG_STACK_USAGE
5088 free = stack_not_used(p); 5142 free = stack_not_used(p);
5089 #endif 5143 #endif
5090 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 5144 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5091 task_pid_nr(p), task_pid_nr(p->real_parent), 5145 task_pid_nr(p), task_pid_nr(p->real_parent),
5092 (unsigned long)task_thread_info(p)->flags); 5146 (unsigned long)task_thread_info(p)->flags);
5093 5147
5094 show_stack(p, NULL); 5148 show_stack(p, NULL);
5095 } 5149 }
5096 5150
5097 void show_state_filter(unsigned long state_filter) 5151 void show_state_filter(unsigned long state_filter)
5098 { 5152 {
5099 struct task_struct *g, *p; 5153 struct task_struct *g, *p;
5100 5154
5101 #if BITS_PER_LONG == 32 5155 #if BITS_PER_LONG == 32
5102 printk(KERN_INFO 5156 printk(KERN_INFO
5103 " task PC stack pid father\n"); 5157 " task PC stack pid father\n");
5104 #else 5158 #else
5105 printk(KERN_INFO 5159 printk(KERN_INFO
5106 " task PC stack pid father\n"); 5160 " task PC stack pid father\n");
5107 #endif 5161 #endif
5108 read_lock(&tasklist_lock); 5162 read_lock(&tasklist_lock);
5109 do_each_thread(g, p) { 5163 do_each_thread(g, p) {
5110 /* 5164 /*
5111 * reset the NMI-timeout, listing all tasks on a slow 5165 * reset the NMI-timeout, listing all tasks on a slow
5112 * console might take a lot of time: 5166 * console might take a lot of time:
5113 */ 5167 */
5114 touch_nmi_watchdog(); 5168 touch_nmi_watchdog();
5115 if (!state_filter || (p->state & state_filter)) 5169 if (!state_filter || (p->state & state_filter))
5116 sched_show_task(p); 5170 sched_show_task(p);
5117 } while_each_thread(g, p); 5171 } while_each_thread(g, p);
5118 5172
5119 touch_all_softlockup_watchdogs(); 5173 touch_all_softlockup_watchdogs();
5120 5174
5121 #ifdef CONFIG_SCHED_DEBUG 5175 #ifdef CONFIG_SCHED_DEBUG
5122 sysrq_sched_debug_show(); 5176 sysrq_sched_debug_show();
5123 #endif 5177 #endif
5124 read_unlock(&tasklist_lock); 5178 read_unlock(&tasklist_lock);
5125 /* 5179 /*
5126 * Only show locks if all tasks are dumped: 5180 * Only show locks if all tasks are dumped:
5127 */ 5181 */
5128 if (!state_filter) 5182 if (!state_filter)
5129 debug_show_all_locks(); 5183 debug_show_all_locks();
5130 } 5184 }
5131 5185
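show_state_filter() is the workhorse behind the sysrq task dumps: show_state() in sched.h passes a 0 filter (dump everything, as sysrq-t does) and the sysrq-w handler passes TASK_UNINTERRUPTIBLE. A hedged sketch of such a built-in debug call site (the symbol is not expected to be exported to modules):

#include <linux/sched.h>

/*
 * Dump only tasks blocked in uninterruptible (D) state, the same thing
 * the sysrq-w handler does.  Illustrative built-in debug code only; the
 * output goes to the console via the printk()s in sched_show_task().
 */
static void dump_blocked_tasks(void)
{
	show_state_filter(TASK_UNINTERRUPTIBLE);
}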
5132 void __cpuinit init_idle_bootup_task(struct task_struct *idle) 5186 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5133 { 5187 {
5134 idle->sched_class = &idle_sched_class; 5188 idle->sched_class = &idle_sched_class;
5135 } 5189 }
5136 5190
5137 /** 5191 /**
5138 * init_idle - set up an idle thread for a given CPU 5192 * init_idle - set up an idle thread for a given CPU
5139 * @idle: task in question 5193 * @idle: task in question
5140 * @cpu: cpu the idle task belongs to 5194 * @cpu: cpu the idle task belongs to
5141 * 5195 *
5142 * NOTE: this function does not set the idle thread's NEED_RESCHED 5196 * NOTE: this function does not set the idle thread's NEED_RESCHED
5143 * flag, to make booting more robust. 5197 * flag, to make booting more robust.
5144 */ 5198 */
5145 void __cpuinit init_idle(struct task_struct *idle, int cpu) 5199 void __cpuinit init_idle(struct task_struct *idle, int cpu)
5146 { 5200 {
5147 struct rq *rq = cpu_rq(cpu); 5201 struct rq *rq = cpu_rq(cpu);
5148 unsigned long flags; 5202 unsigned long flags;
5149 5203
5150 raw_spin_lock_irqsave(&rq->lock, flags); 5204 raw_spin_lock_irqsave(&rq->lock, flags);
5151 5205
5152 __sched_fork(idle); 5206 __sched_fork(idle);
5153 idle->state = TASK_RUNNING; 5207 idle->state = TASK_RUNNING;
5154 idle->se.exec_start = sched_clock(); 5208 idle->se.exec_start = sched_clock();
5155 5209
5156 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5210 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5157 __set_task_cpu(idle, cpu); 5211 __set_task_cpu(idle, cpu);
5158 5212
5159 rq->curr = rq->idle = idle; 5213 rq->curr = rq->idle = idle;
5160 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5214 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
5161 idle->oncpu = 1; 5215 idle->oncpu = 1;
5162 #endif 5216 #endif
5163 raw_spin_unlock_irqrestore(&rq->lock, flags); 5217 raw_spin_unlock_irqrestore(&rq->lock, flags);
5164 5218
5165 /* Set the preempt count _outside_ the spinlocks! */ 5219 /* Set the preempt count _outside_ the spinlocks! */
5166 #if defined(CONFIG_PREEMPT) 5220 #if defined(CONFIG_PREEMPT)
5167 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); 5221 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5168 #else 5222 #else
5169 task_thread_info(idle)->preempt_count = 0; 5223 task_thread_info(idle)->preempt_count = 0;
5170 #endif 5224 #endif
5171 /* 5225 /*
5172 * The idle tasks have their own, simple scheduling class: 5226 * The idle tasks have their own, simple scheduling class:
5173 */ 5227 */
5174 idle->sched_class = &idle_sched_class; 5228 idle->sched_class = &idle_sched_class;
5175 ftrace_graph_init_task(idle); 5229 ftrace_graph_init_task(idle);
5176 } 5230 }
5177 5231
5178 /* 5232 /*
5179 * In a system that switches off the HZ timer nohz_cpu_mask 5233 * In a system that switches off the HZ timer nohz_cpu_mask
5180 * indicates which cpus entered this state. This is used 5234 * indicates which cpus entered this state. This is used
5181 * in the rcu update to wait only for active cpus. For systems 5235 * in the rcu update to wait only for active cpus. For systems
5182 * which do not switch off the HZ timer nohz_cpu_mask should 5236 * which do not switch off the HZ timer nohz_cpu_mask should
5183 * always be CPU_BITS_NONE. 5237 * always be CPU_BITS_NONE.
5184 */ 5238 */
5185 cpumask_var_t nohz_cpu_mask; 5239 cpumask_var_t nohz_cpu_mask;
5186 5240
5187 /* 5241 /*
5188 * Increase the granularity value when there are more CPUs, 5242 * Increase the granularity value when there are more CPUs,
5189 * because with more CPUs the 'effective latency' as visible 5243 * because with more CPUs the 'effective latency' as visible
5190 * to users decreases. But the relationship is not linear, 5244 * to users decreases. But the relationship is not linear,
5191 * so pick a second-best guess by going with the log2 of the 5245 * so pick a second-best guess by going with the log2 of the
5192 * number of CPUs. 5246 * number of CPUs.
5193 * 5247 *
5194 * This idea comes from the SD scheduler of Con Kolivas: 5248 * This idea comes from the SD scheduler of Con Kolivas:
5195 */ 5249 */
5196 static int get_update_sysctl_factor(void) 5250 static int get_update_sysctl_factor(void)
5197 { 5251 {
5198 unsigned int cpus = min_t(int, num_online_cpus(), 8); 5252 unsigned int cpus = min_t(int, num_online_cpus(), 8);
5199 unsigned int factor; 5253 unsigned int factor;
5200 5254
5201 switch (sysctl_sched_tunable_scaling) { 5255 switch (sysctl_sched_tunable_scaling) {
5202 case SCHED_TUNABLESCALING_NONE: 5256 case SCHED_TUNABLESCALING_NONE:
5203 factor = 1; 5257 factor = 1;
5204 break; 5258 break;
5205 case SCHED_TUNABLESCALING_LINEAR: 5259 case SCHED_TUNABLESCALING_LINEAR:
5206 factor = cpus; 5260 factor = cpus;
5207 break; 5261 break;
5208 case SCHED_TUNABLESCALING_LOG: 5262 case SCHED_TUNABLESCALING_LOG:
5209 default: 5263 default:
5210 factor = 1 + ilog2(cpus); 5264 factor = 1 + ilog2(cpus);
5211 break; 5265 break;
5212 } 5266 }
5213 5267
5214 return factor; 5268 return factor;
5215 } 5269 }
5216 5270
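To make the log2 scaling concrete: with the default SCHED_TUNABLESCALING_LOG policy the factor is 1 + ilog2(min(nr_online_cpus, 8)), i.e. 1, 2, 3, 4 for 1, 2, 4 and 8+ CPUs, and each normalized tunable is multiplied by it. A userspace sketch of the same arithmetic; the 6 ms normalized latency is an assumed example value, not read from this file:

#include <stdio.h>

/* Mirrors get_update_sysctl_factor() for the LOG policy: cap at 8 CPUs, */
/* then use 1 + floor(log2(cpus)).  Standalone illustration, not kernel code. */
static unsigned int factor_log(unsigned int cpus)
{
	unsigned int ilog2 = 0;

	if (cpus > 8)
		cpus = 8;
	while (cpus >>= 1)
		ilog2++;
	return 1 + ilog2;
}

int main(void)
{
	const unsigned long normalized_latency_ns = 6000000; /* assumed 6 ms */
	unsigned int cpus;

	for (cpus = 1; cpus <= 16; cpus *= 2)
		printf("%2u cpus -> factor %u -> latency %lu ns\n", cpus,
		       factor_log(cpus), factor_log(cpus) * normalized_latency_ns);
	return 0;
}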
5217 static void update_sysctl(void) 5271 static void update_sysctl(void)
5218 { 5272 {
5219 unsigned int factor = get_update_sysctl_factor(); 5273 unsigned int factor = get_update_sysctl_factor();
5220 5274
5221 #define SET_SYSCTL(name) \ 5275 #define SET_SYSCTL(name) \
5222 (sysctl_##name = (factor) * normalized_sysctl_##name) 5276 (sysctl_##name = (factor) * normalized_sysctl_##name)
5223 SET_SYSCTL(sched_min_granularity); 5277 SET_SYSCTL(sched_min_granularity);
5224 SET_SYSCTL(sched_latency); 5278 SET_SYSCTL(sched_latency);
5225 SET_SYSCTL(sched_wakeup_granularity); 5279 SET_SYSCTL(sched_wakeup_granularity);
5226 SET_SYSCTL(sched_shares_ratelimit); 5280 SET_SYSCTL(sched_shares_ratelimit);
5227 #undef SET_SYSCTL 5281 #undef SET_SYSCTL
5228 } 5282 }
5229 5283
5230 static inline void sched_init_granularity(void) 5284 static inline void sched_init_granularity(void)
5231 { 5285 {
5232 update_sysctl(); 5286 update_sysctl();
5233 } 5287 }
5234 5288
5235 #ifdef CONFIG_SMP 5289 #ifdef CONFIG_SMP
5236 /* 5290 /*
5237 * This is how migration works: 5291 * This is how migration works:
5238 * 5292 *
5239 * 1) we queue a struct migration_req structure in the source CPU's 5293 * 1) we queue a struct migration_req structure in the source CPU's
5240 * runqueue and wake up that CPU's migration thread. 5294 * runqueue and wake up that CPU's migration thread.
5241 * 2) we down() the locked semaphore => thread blocks. 5295 * 2) we down() the locked semaphore => thread blocks.
5242 * 3) migration thread wakes up (implicitly it forces the migrated 5296 * 3) migration thread wakes up (implicitly it forces the migrated
5243 * thread off the CPU) 5297 * thread off the CPU)
5244 * 4) it gets the migration request and checks whether the migrated 5298 * 4) it gets the migration request and checks whether the migrated
5245 * task is still in the wrong runqueue. 5299 * task is still in the wrong runqueue.
5246 * 5) if it's in the wrong runqueue then the migration thread removes 5300 * 5) if it's in the wrong runqueue then the migration thread removes
5247 * it and puts it into the right queue. 5301 * it and puts it into the right queue.
5248 * 6) migration thread up()s the semaphore. 5302 * 6) migration thread up()s the semaphore.
5249 * 7) we wake up and the migration is done. 5303 * 7) we wake up and the migration is done.
5250 */ 5304 */
5251 5305
5252 /* 5306 /*
5253 * Change a given task's CPU affinity. Migrate the thread to a 5307 * Change a given task's CPU affinity. Migrate the thread to a
5254 * proper CPU and schedule it away if the CPU it's executing on 5308 * proper CPU and schedule it away if the CPU it's executing on
5255 * is removed from the allowed bitmask. 5309 * is removed from the allowed bitmask.
5256 * 5310 *
5257 * NOTE: the caller must have a valid reference to the task, the 5311 * NOTE: the caller must have a valid reference to the task, the
5258 * task must not exit() & deallocate itself prematurely. The 5312 * task must not exit() & deallocate itself prematurely. The
5259 * call is not atomic; no spinlocks may be held. 5313 * call is not atomic; no spinlocks may be held.
5260 */ 5314 */
5261 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5315 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5262 { 5316 {
5263 struct migration_req req; 5317 struct migration_req req;
5264 unsigned long flags; 5318 unsigned long flags;
5265 struct rq *rq; 5319 struct rq *rq;
5266 int ret = 0; 5320 int ret = 0;
5267 5321
5268 /* 5322 /*
5269 * Serialize against TASK_WAKING so that ttwu() and wake_up_new_task() can 5323 * Serialize against TASK_WAKING so that ttwu() and wake_up_new_task() can
5270 * drop the rq->lock and still rely on ->cpus_allowed. 5324 * drop the rq->lock and still rely on ->cpus_allowed.
5271 */ 5325 */
5272 again: 5326 again:
5273 while (task_is_waking(p)) 5327 while (task_is_waking(p))
5274 cpu_relax(); 5328 cpu_relax();
5275 rq = task_rq_lock(p, &flags); 5329 rq = task_rq_lock(p, &flags);
5276 if (task_is_waking(p)) { 5330 if (task_is_waking(p)) {
5277 task_rq_unlock(rq, &flags); 5331 task_rq_unlock(rq, &flags);
5278 goto again; 5332 goto again;
5279 } 5333 }
5280 5334
5281 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5335 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5282 ret = -EINVAL; 5336 ret = -EINVAL;
5283 goto out; 5337 goto out;
5284 } 5338 }
5285 5339
5286 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 5340 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5287 !cpumask_equal(&p->cpus_allowed, new_mask))) { 5341 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5288 ret = -EINVAL; 5342 ret = -EINVAL;
5289 goto out; 5343 goto out;
5290 } 5344 }
5291 5345
5292 if (p->sched_class->set_cpus_allowed) 5346 if (p->sched_class->set_cpus_allowed)
5293 p->sched_class->set_cpus_allowed(p, new_mask); 5347 p->sched_class->set_cpus_allowed(p, new_mask);
5294 else { 5348 else {
5295 cpumask_copy(&p->cpus_allowed, new_mask); 5349 cpumask_copy(&p->cpus_allowed, new_mask);
5296 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 5350 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5297 } 5351 }
5298 5352
5299 /* Can the task run on the task's current CPU? If so, we're done */ 5353 /* Can the task run on the task's current CPU? If so, we're done */
5300 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5354 if (cpumask_test_cpu(task_cpu(p), new_mask))
5301 goto out; 5355 goto out;
5302 5356
5303 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { 5357 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
5304 /* Need help from migration thread: drop lock and wait. */ 5358 /* Need help from migration thread: drop lock and wait. */
5305 struct task_struct *mt = rq->migration_thread; 5359 struct task_struct *mt = rq->migration_thread;
5306 5360
5307 get_task_struct(mt); 5361 get_task_struct(mt);
5308 task_rq_unlock(rq, &flags); 5362 task_rq_unlock(rq, &flags);
5309 wake_up_process(mt); 5363 wake_up_process(mt);
5310 put_task_struct(mt); 5364 put_task_struct(mt);
5311 wait_for_completion(&req.done); 5365 wait_for_completion(&req.done);
5312 tlb_migrate_finish(p->mm); 5366 tlb_migrate_finish(p->mm);
5313 return 0; 5367 return 0;
5314 } 5368 }
5315 out: 5369 out:
5316 task_rq_unlock(rq, &flags); 5370 task_rq_unlock(rq, &flags);
5317 5371
5318 return ret; 5372 return ret;
5319 } 5373 }
5320 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 5374 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
5321 5375
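set_cpus_allowed_ptr() is the exported entry point other kernel code uses to pin a task; a hedged sketch of a typical caller (for binding a kthread before it runs, kthread_bind() as used later in this file is the preferred helper, so this is illustration only):

#include <linux/cpumask.h>
#include <linux/sched.h>

/*
 * Illustrative only: restrict @tsk to CPU 0.  Fails with -EINVAL if CPU 0
 * is not in cpu_active_mask, exactly as the checks above show.
 */
static int pin_task_to_cpu0(struct task_struct *tsk)
{
	return set_cpus_allowed_ptr(tsk, cpumask_of(0));
}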
5322 /* 5376 /*
5323 * Move (not current) task off this cpu, onto dest cpu. We're doing 5377 * Move (not current) task off this cpu, onto dest cpu. We're doing
5324 * this because either it can't run here any more (set_cpus_allowed() 5378 * this because either it can't run here any more (set_cpus_allowed()
5325 * away from this CPU, or CPU going down), or because we're 5379 * away from this CPU, or CPU going down), or because we're
5326 * attempting to rebalance this task on exec (sched_exec). 5380 * attempting to rebalance this task on exec (sched_exec).
5327 * 5381 *
5328 * So we race with normal scheduler movements, but that's OK, as long 5382 * So we race with normal scheduler movements, but that's OK, as long
5329 * as the task is no longer on this CPU. 5383 * as the task is no longer on this CPU.
5330 * 5384 *
5331 * Returns non-zero if task was successfully migrated. 5385 * Returns non-zero if task was successfully migrated.
5332 */ 5386 */
5333 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 5387 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5334 { 5388 {
5335 struct rq *rq_dest, *rq_src; 5389 struct rq *rq_dest, *rq_src;
5336 int ret = 0; 5390 int ret = 0;
5337 5391
5338 if (unlikely(!cpu_active(dest_cpu))) 5392 if (unlikely(!cpu_active(dest_cpu)))
5339 return ret; 5393 return ret;
5340 5394
5341 rq_src = cpu_rq(src_cpu); 5395 rq_src = cpu_rq(src_cpu);
5342 rq_dest = cpu_rq(dest_cpu); 5396 rq_dest = cpu_rq(dest_cpu);
5343 5397
5344 double_rq_lock(rq_src, rq_dest); 5398 double_rq_lock(rq_src, rq_dest);
5345 /* Already moved. */ 5399 /* Already moved. */
5346 if (task_cpu(p) != src_cpu) 5400 if (task_cpu(p) != src_cpu)
5347 goto done; 5401 goto done;
5348 /* Affinity changed (again). */ 5402 /* Affinity changed (again). */
5349 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 5403 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
5350 goto fail; 5404 goto fail;
5351 5405
5352 /* 5406 /*
5353 * If we're not on a rq, the next wake-up will ensure we're 5407 * If we're not on a rq, the next wake-up will ensure we're
5354 * placed properly. 5408 * placed properly.
5355 */ 5409 */
5356 if (p->se.on_rq) { 5410 if (p->se.on_rq) {
5357 deactivate_task(rq_src, p, 0); 5411 deactivate_task(rq_src, p, 0);
5358 set_task_cpu(p, dest_cpu); 5412 set_task_cpu(p, dest_cpu);
5359 activate_task(rq_dest, p, 0); 5413 activate_task(rq_dest, p, 0);
5360 check_preempt_curr(rq_dest, p, 0); 5414 check_preempt_curr(rq_dest, p, 0);
5361 } 5415 }
5362 done: 5416 done:
5363 ret = 1; 5417 ret = 1;
5364 fail: 5418 fail:
5365 double_rq_unlock(rq_src, rq_dest); 5419 double_rq_unlock(rq_src, rq_dest);
5366 return ret; 5420 return ret;
5367 } 5421 }
5368 5422
5369 #define RCU_MIGRATION_IDLE 0 5423 #define RCU_MIGRATION_IDLE 0
5370 #define RCU_MIGRATION_NEED_QS 1 5424 #define RCU_MIGRATION_NEED_QS 1
5371 #define RCU_MIGRATION_GOT_QS 2 5425 #define RCU_MIGRATION_GOT_QS 2
5372 #define RCU_MIGRATION_MUST_SYNC 3 5426 #define RCU_MIGRATION_MUST_SYNC 3
5373 5427
5374 /* 5428 /*
5375 * migration_thread - this is a highprio system thread that performs 5429 * migration_thread - this is a highprio system thread that performs
5376 * thread migration by bumping thread off CPU then 'pushing' onto 5430 * thread migration by bumping thread off CPU then 'pushing' onto
5377 * another runqueue. 5431 * another runqueue.
5378 */ 5432 */
5379 static int migration_thread(void *data) 5433 static int migration_thread(void *data)
5380 { 5434 {
5381 int badcpu; 5435 int badcpu;
5382 int cpu = (long)data; 5436 int cpu = (long)data;
5383 struct rq *rq; 5437 struct rq *rq;
5384 5438
5385 rq = cpu_rq(cpu); 5439 rq = cpu_rq(cpu);
5386 BUG_ON(rq->migration_thread != current); 5440 BUG_ON(rq->migration_thread != current);
5387 5441
5388 set_current_state(TASK_INTERRUPTIBLE); 5442 set_current_state(TASK_INTERRUPTIBLE);
5389 while (!kthread_should_stop()) { 5443 while (!kthread_should_stop()) {
5390 struct migration_req *req; 5444 struct migration_req *req;
5391 struct list_head *head; 5445 struct list_head *head;
5392 5446
5393 raw_spin_lock_irq(&rq->lock); 5447 raw_spin_lock_irq(&rq->lock);
5394 5448
5395 if (cpu_is_offline(cpu)) { 5449 if (cpu_is_offline(cpu)) {
5396 raw_spin_unlock_irq(&rq->lock); 5450 raw_spin_unlock_irq(&rq->lock);
5397 break; 5451 break;
5398 } 5452 }
5399 5453
5400 if (rq->active_balance) { 5454 if (rq->active_balance) {
5401 active_load_balance(rq, cpu); 5455 active_load_balance(rq, cpu);
5402 rq->active_balance = 0; 5456 rq->active_balance = 0;
5403 } 5457 }
5404 5458
5405 head = &rq->migration_queue; 5459 head = &rq->migration_queue;
5406 5460
5407 if (list_empty(head)) { 5461 if (list_empty(head)) {
5408 raw_spin_unlock_irq(&rq->lock); 5462 raw_spin_unlock_irq(&rq->lock);
5409 schedule(); 5463 schedule();
5410 set_current_state(TASK_INTERRUPTIBLE); 5464 set_current_state(TASK_INTERRUPTIBLE);
5411 continue; 5465 continue;
5412 } 5466 }
5413 req = list_entry(head->next, struct migration_req, list); 5467 req = list_entry(head->next, struct migration_req, list);
5414 list_del_init(head->next); 5468 list_del_init(head->next);
5415 5469
5416 if (req->task != NULL) { 5470 if (req->task != NULL) {
5417 raw_spin_unlock(&rq->lock); 5471 raw_spin_unlock(&rq->lock);
5418 __migrate_task(req->task, cpu, req->dest_cpu); 5472 __migrate_task(req->task, cpu, req->dest_cpu);
5419 } else if (likely(cpu == (badcpu = smp_processor_id()))) { 5473 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
5420 req->dest_cpu = RCU_MIGRATION_GOT_QS; 5474 req->dest_cpu = RCU_MIGRATION_GOT_QS;
5421 raw_spin_unlock(&rq->lock); 5475 raw_spin_unlock(&rq->lock);
5422 } else { 5476 } else {
5423 req->dest_cpu = RCU_MIGRATION_MUST_SYNC; 5477 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
5424 raw_spin_unlock(&rq->lock); 5478 raw_spin_unlock(&rq->lock);
5425 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); 5479 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
5426 } 5480 }
5427 local_irq_enable(); 5481 local_irq_enable();
5428 5482
5429 complete(&req->done); 5483 complete(&req->done);
5430 } 5484 }
5431 __set_current_state(TASK_RUNNING); 5485 __set_current_state(TASK_RUNNING);
5432 5486
5433 return 0; 5487 return 0;
5434 } 5488 }
5435 5489
5436 #ifdef CONFIG_HOTPLUG_CPU 5490 #ifdef CONFIG_HOTPLUG_CPU
5437 /* 5491 /*
5438 * Figure out where task on dead CPU should go, use force if necessary. 5492 * Figure out where task on dead CPU should go, use force if necessary.
5439 */ 5493 */
5440 void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5494 void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5441 { 5495 {
5442 struct rq *rq = cpu_rq(dead_cpu); 5496 struct rq *rq = cpu_rq(dead_cpu);
5443 int needs_cpu, uninitialized_var(dest_cpu); 5497 int needs_cpu, uninitialized_var(dest_cpu);
5444 unsigned long flags; 5498 unsigned long flags;
5445 5499
5446 local_irq_save(flags); 5500 local_irq_save(flags);
5447 5501
5448 raw_spin_lock(&rq->lock); 5502 raw_spin_lock(&rq->lock);
5449 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 5503 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
5450 if (needs_cpu) 5504 if (needs_cpu)
5451 dest_cpu = select_fallback_rq(dead_cpu, p); 5505 dest_cpu = select_fallback_rq(dead_cpu, p);
5452 raw_spin_unlock(&rq->lock); 5506 raw_spin_unlock(&rq->lock);
5453 /* 5507 /*
5454 * It can only fail if we race with set_cpus_allowed(), 5508 * It can only fail if we race with set_cpus_allowed(),
5455 * in which case the racer should migrate the task anyway. 5509 * in which case the racer should migrate the task anyway.
5456 */ 5510 */
5457 if (needs_cpu) 5511 if (needs_cpu)
5458 __migrate_task(p, dead_cpu, dest_cpu); 5512 __migrate_task(p, dead_cpu, dest_cpu);
5459 local_irq_restore(flags); 5513 local_irq_restore(flags);
5460 } 5514 }
5461 5515
5462 /* 5516 /*
5463 * While a dead CPU has no uninterruptible tasks queued at this point, 5517 * While a dead CPU has no uninterruptible tasks queued at this point,
5464 * it might still have a nonzero ->nr_uninterruptible counter, because 5518 * it might still have a nonzero ->nr_uninterruptible counter, because
5465 * for performance reasons the counter is not strictly tracking tasks to 5519 * for performance reasons the counter is not strictly tracking tasks to
5466 * their home CPUs. So we just add the counter to another CPU's counter, 5520 * their home CPUs. So we just add the counter to another CPU's counter,
5467 * to keep the global sum constant after CPU-down: 5521 * to keep the global sum constant after CPU-down:
5468 */ 5522 */
5469 static void migrate_nr_uninterruptible(struct rq *rq_src) 5523 static void migrate_nr_uninterruptible(struct rq *rq_src)
5470 { 5524 {
5471 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 5525 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5472 unsigned long flags; 5526 unsigned long flags;
5473 5527
5474 local_irq_save(flags); 5528 local_irq_save(flags);
5475 double_rq_lock(rq_src, rq_dest); 5529 double_rq_lock(rq_src, rq_dest);
5476 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5530 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5477 rq_src->nr_uninterruptible = 0; 5531 rq_src->nr_uninterruptible = 0;
5478 double_rq_unlock(rq_src, rq_dest); 5532 double_rq_unlock(rq_src, rq_dest);
5479 local_irq_restore(flags); 5533 local_irq_restore(flags);
5480 } 5534 }
5481 5535
5482 /* Run through task list and migrate tasks from the dead cpu. */ 5536 /* Run through task list and migrate tasks from the dead cpu. */
5483 static void migrate_live_tasks(int src_cpu) 5537 static void migrate_live_tasks(int src_cpu)
5484 { 5538 {
5485 struct task_struct *p, *t; 5539 struct task_struct *p, *t;
5486 5540
5487 read_lock(&tasklist_lock); 5541 read_lock(&tasklist_lock);
5488 5542
5489 do_each_thread(t, p) { 5543 do_each_thread(t, p) {
5490 if (p == current) 5544 if (p == current)
5491 continue; 5545 continue;
5492 5546
5493 if (task_cpu(p) == src_cpu) 5547 if (task_cpu(p) == src_cpu)
5494 move_task_off_dead_cpu(src_cpu, p); 5548 move_task_off_dead_cpu(src_cpu, p);
5495 } while_each_thread(t, p); 5549 } while_each_thread(t, p);
5496 5550
5497 read_unlock(&tasklist_lock); 5551 read_unlock(&tasklist_lock);
5498 } 5552 }
5499 5553
5500 /* 5554 /*
5501 * Schedules idle task to be the next runnable task on current CPU. 5555 * Schedules idle task to be the next runnable task on current CPU.
5502 * It does so by boosting its priority to the highest possible value. 5556 * It does so by boosting its priority to the highest possible value.
5503 * Used by CPU offline code. 5557 * Used by CPU offline code.
5504 */ 5558 */
5505 void sched_idle_next(void) 5559 void sched_idle_next(void)
5506 { 5560 {
5507 int this_cpu = smp_processor_id(); 5561 int this_cpu = smp_processor_id();
5508 struct rq *rq = cpu_rq(this_cpu); 5562 struct rq *rq = cpu_rq(this_cpu);
5509 struct task_struct *p = rq->idle; 5563 struct task_struct *p = rq->idle;
5510 unsigned long flags; 5564 unsigned long flags;
5511 5565
5512 /* cpu has to be offline */ 5566 /* cpu has to be offline */
5513 BUG_ON(cpu_online(this_cpu)); 5567 BUG_ON(cpu_online(this_cpu));
5514 5568
5515 /* 5569 /*
5516 * Strictly not necessary since rest of the CPUs are stopped by now 5570 * Strictly not necessary since rest of the CPUs are stopped by now
5517 * and interrupts disabled on the current cpu. 5571 * and interrupts disabled on the current cpu.
5518 */ 5572 */
5519 raw_spin_lock_irqsave(&rq->lock, flags); 5573 raw_spin_lock_irqsave(&rq->lock, flags);
5520 5574
5521 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5575 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5522 5576
5523 activate_task(rq, p, 0); 5577 activate_task(rq, p, 0);
5524 5578
5525 raw_spin_unlock_irqrestore(&rq->lock, flags); 5579 raw_spin_unlock_irqrestore(&rq->lock, flags);
5526 } 5580 }
5527 5581
5528 /* 5582 /*
5529 * Ensures that the idle task is using init_mm right before its cpu goes 5583 * Ensures that the idle task is using init_mm right before its cpu goes
5530 * offline. 5584 * offline.
5531 */ 5585 */
5532 void idle_task_exit(void) 5586 void idle_task_exit(void)
5533 { 5587 {
5534 struct mm_struct *mm = current->active_mm; 5588 struct mm_struct *mm = current->active_mm;
5535 5589
5536 BUG_ON(cpu_online(smp_processor_id())); 5590 BUG_ON(cpu_online(smp_processor_id()));
5537 5591
5538 if (mm != &init_mm) 5592 if (mm != &init_mm)
5539 switch_mm(mm, &init_mm, current); 5593 switch_mm(mm, &init_mm, current);
5540 mmdrop(mm); 5594 mmdrop(mm);
5541 } 5595 }
5542 5596
5543 /* called under rq->lock with disabled interrupts */ 5597 /* called under rq->lock with disabled interrupts */
5544 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) 5598 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5545 { 5599 {
5546 struct rq *rq = cpu_rq(dead_cpu); 5600 struct rq *rq = cpu_rq(dead_cpu);
5547 5601
5548 /* Must be exiting, otherwise would be on tasklist. */ 5602 /* Must be exiting, otherwise would be on tasklist. */
5549 BUG_ON(!p->exit_state); 5603 BUG_ON(!p->exit_state);
5550 5604
5551 /* Cannot have done final schedule yet: would have vanished. */ 5605 /* Cannot have done final schedule yet: would have vanished. */
5552 BUG_ON(p->state == TASK_DEAD); 5606 BUG_ON(p->state == TASK_DEAD);
5553 5607
5554 get_task_struct(p); 5608 get_task_struct(p);
5555 5609
5556 /* 5610 /*
5557 * Drop lock around migration; if someone else moves it, 5611 * Drop lock around migration; if someone else moves it,
5558 * that's OK. No task can be added to this CPU, so iteration is 5612 * that's OK. No task can be added to this CPU, so iteration is
5559 * fine. 5613 * fine.
5560 */ 5614 */
5561 raw_spin_unlock_irq(&rq->lock); 5615 raw_spin_unlock_irq(&rq->lock);
5562 move_task_off_dead_cpu(dead_cpu, p); 5616 move_task_off_dead_cpu(dead_cpu, p);
5563 raw_spin_lock_irq(&rq->lock); 5617 raw_spin_lock_irq(&rq->lock);
5564 5618
5565 put_task_struct(p); 5619 put_task_struct(p);
5566 } 5620 }
5567 5621
5568 /* release_task() removes task from tasklist, so we won't find dead tasks. */ 5622 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5569 static void migrate_dead_tasks(unsigned int dead_cpu) 5623 static void migrate_dead_tasks(unsigned int dead_cpu)
5570 { 5624 {
5571 struct rq *rq = cpu_rq(dead_cpu); 5625 struct rq *rq = cpu_rq(dead_cpu);
5572 struct task_struct *next; 5626 struct task_struct *next;
5573 5627
5574 for ( ; ; ) { 5628 for ( ; ; ) {
5575 if (!rq->nr_running) 5629 if (!rq->nr_running)
5576 break; 5630 break;
5577 next = pick_next_task(rq); 5631 next = pick_next_task(rq);
5578 if (!next) 5632 if (!next)
5579 break; 5633 break;
5580 next->sched_class->put_prev_task(rq, next); 5634 next->sched_class->put_prev_task(rq, next);
5581 migrate_dead(dead_cpu, next); 5635 migrate_dead(dead_cpu, next);
5582 5636
5583 } 5637 }
5584 } 5638 }
5585 5639
5586 /* 5640 /*
5587 * remove the tasks which were accounted by rq from calc_load_tasks. 5641 * remove the tasks which were accounted by rq from calc_load_tasks.
5588 */ 5642 */
5589 static void calc_global_load_remove(struct rq *rq) 5643 static void calc_global_load_remove(struct rq *rq)
5590 { 5644 {
5591 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 5645 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5592 rq->calc_load_active = 0; 5646 rq->calc_load_active = 0;
5593 } 5647 }
5594 #endif /* CONFIG_HOTPLUG_CPU */ 5648 #endif /* CONFIG_HOTPLUG_CPU */
5595 5649
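calc_global_load_remove() keeps calc_load_tasks consistent over CPU-down; that counter is what the global load average samples every LOAD_FREQ interval, and skewed samples of it are exactly what this commit cures. A hedged userspace re-creation of the fixed-point averaging step, using the long-standing FSHIFT/EXP_1 constants; the active-task count of 3 is an arbitrary example:

#include <stdio.h>

#define FSHIFT	11			/* bits of fractional precision */
#define FIXED_1	(1 << FSHIFT)		/* 1.0 in fixed point */
#define EXP_1	1884			/* 1/exp(5s/1min) in fixed point */

/* Same shape as the kernel's calc_load(): new = old*e + active*(1 - e). */
static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun = 0;		/* 1-minute average, fixed point */
	unsigned long active = 3 * FIXED_1;	/* assume 3 runnable tasks */
	int tick;

	for (tick = 1; tick <= 12; tick++) {	/* 12 x 5s = one minute */
		avenrun = calc_load(avenrun, EXP_1, active);
		printf("sample %2d: load %lu.%02lu\n", tick,
		       avenrun >> FSHIFT,
		       ((avenrun & (FIXED_1 - 1)) * 100) >> FSHIFT);
	}
	return 0;
}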
5596 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5650 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5597 5651
5598 static struct ctl_table sd_ctl_dir[] = { 5652 static struct ctl_table sd_ctl_dir[] = {
5599 { 5653 {
5600 .procname = "sched_domain", 5654 .procname = "sched_domain",
5601 .mode = 0555, 5655 .mode = 0555,
5602 }, 5656 },
5603 {} 5657 {}
5604 }; 5658 };
5605 5659
5606 static struct ctl_table sd_ctl_root[] = { 5660 static struct ctl_table sd_ctl_root[] = {
5607 { 5661 {
5608 .procname = "kernel", 5662 .procname = "kernel",
5609 .mode = 0555, 5663 .mode = 0555,
5610 .child = sd_ctl_dir, 5664 .child = sd_ctl_dir,
5611 }, 5665 },
5612 {} 5666 {}
5613 }; 5667 };
5614 5668
5615 static struct ctl_table *sd_alloc_ctl_entry(int n) 5669 static struct ctl_table *sd_alloc_ctl_entry(int n)
5616 { 5670 {
5617 struct ctl_table *entry = 5671 struct ctl_table *entry =
5618 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 5672 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5619 5673
5620 return entry; 5674 return entry;
5621 } 5675 }
5622 5676
5623 static void sd_free_ctl_entry(struct ctl_table **tablep) 5677 static void sd_free_ctl_entry(struct ctl_table **tablep)
5624 { 5678 {
5625 struct ctl_table *entry; 5679 struct ctl_table *entry;
5626 5680
5627 /* 5681 /*
5628 * In the intermediate directories, both the child directory and 5682 * In the intermediate directories, both the child directory and
5629 * procname are dynamically allocated and could fail but the mode 5683 * procname are dynamically allocated and could fail but the mode
5630 * will always be set. In the lowest directory the names are 5684 * will always be set. In the lowest directory the names are
5631 * static strings and all have proc handlers. 5685 * static strings and all have proc handlers.
5632 */ 5686 */
5633 for (entry = *tablep; entry->mode; entry++) { 5687 for (entry = *tablep; entry->mode; entry++) {
5634 if (entry->child) 5688 if (entry->child)
5635 sd_free_ctl_entry(&entry->child); 5689 sd_free_ctl_entry(&entry->child);
5636 if (entry->proc_handler == NULL) 5690 if (entry->proc_handler == NULL)
5637 kfree(entry->procname); 5691 kfree(entry->procname);
5638 } 5692 }
5639 5693
5640 kfree(*tablep); 5694 kfree(*tablep);
5641 *tablep = NULL; 5695 *tablep = NULL;
5642 } 5696 }
5643 5697
5644 static void 5698 static void
5645 set_table_entry(struct ctl_table *entry, 5699 set_table_entry(struct ctl_table *entry,
5646 const char *procname, void *data, int maxlen, 5700 const char *procname, void *data, int maxlen,
5647 mode_t mode, proc_handler *proc_handler) 5701 mode_t mode, proc_handler *proc_handler)
5648 { 5702 {
5649 entry->procname = procname; 5703 entry->procname = procname;
5650 entry->data = data; 5704 entry->data = data;
5651 entry->maxlen = maxlen; 5705 entry->maxlen = maxlen;
5652 entry->mode = mode; 5706 entry->mode = mode;
5653 entry->proc_handler = proc_handler; 5707 entry->proc_handler = proc_handler;
5654 } 5708 }
5655 5709
5656 static struct ctl_table * 5710 static struct ctl_table *
5657 sd_alloc_ctl_domain_table(struct sched_domain *sd) 5711 sd_alloc_ctl_domain_table(struct sched_domain *sd)
5658 { 5712 {
5659 struct ctl_table *table = sd_alloc_ctl_entry(13); 5713 struct ctl_table *table = sd_alloc_ctl_entry(13);
5660 5714
5661 if (table == NULL) 5715 if (table == NULL)
5662 return NULL; 5716 return NULL;
5663 5717
5664 set_table_entry(&table[0], "min_interval", &sd->min_interval, 5718 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5665 sizeof(long), 0644, proc_doulongvec_minmax); 5719 sizeof(long), 0644, proc_doulongvec_minmax);
5666 set_table_entry(&table[1], "max_interval", &sd->max_interval, 5720 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5667 sizeof(long), 0644, proc_doulongvec_minmax); 5721 sizeof(long), 0644, proc_doulongvec_minmax);
5668 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 5722 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5669 sizeof(int), 0644, proc_dointvec_minmax); 5723 sizeof(int), 0644, proc_dointvec_minmax);
5670 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 5724 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5671 sizeof(int), 0644, proc_dointvec_minmax); 5725 sizeof(int), 0644, proc_dointvec_minmax);
5672 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 5726 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5673 sizeof(int), 0644, proc_dointvec_minmax); 5727 sizeof(int), 0644, proc_dointvec_minmax);
5674 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 5728 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5675 sizeof(int), 0644, proc_dointvec_minmax); 5729 sizeof(int), 0644, proc_dointvec_minmax);
5676 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 5730 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5677 sizeof(int), 0644, proc_dointvec_minmax); 5731 sizeof(int), 0644, proc_dointvec_minmax);
5678 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 5732 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5679 sizeof(int), 0644, proc_dointvec_minmax); 5733 sizeof(int), 0644, proc_dointvec_minmax);
5680 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 5734 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5681 sizeof(int), 0644, proc_dointvec_minmax); 5735 sizeof(int), 0644, proc_dointvec_minmax);
5682 set_table_entry(&table[9], "cache_nice_tries", 5736 set_table_entry(&table[9], "cache_nice_tries",
5683 &sd->cache_nice_tries, 5737 &sd->cache_nice_tries,
5684 sizeof(int), 0644, proc_dointvec_minmax); 5738 sizeof(int), 0644, proc_dointvec_minmax);
5685 set_table_entry(&table[10], "flags", &sd->flags, 5739 set_table_entry(&table[10], "flags", &sd->flags,
5686 sizeof(int), 0644, proc_dointvec_minmax); 5740 sizeof(int), 0644, proc_dointvec_minmax);
5687 set_table_entry(&table[11], "name", sd->name, 5741 set_table_entry(&table[11], "name", sd->name,
5688 CORENAME_MAX_SIZE, 0444, proc_dostring); 5742 CORENAME_MAX_SIZE, 0444, proc_dostring);
5689 /* &table[12] is terminator */ 5743 /* &table[12] is terminator */
5690 5744
5691 return table; 5745 return table;
5692 } 5746 }
5693 5747
5694 static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 5748 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5695 { 5749 {
5696 struct ctl_table *entry, *table; 5750 struct ctl_table *entry, *table;
5697 struct sched_domain *sd; 5751 struct sched_domain *sd;
5698 int domain_num = 0, i; 5752 int domain_num = 0, i;
5699 char buf[32]; 5753 char buf[32];
5700 5754
5701 for_each_domain(cpu, sd) 5755 for_each_domain(cpu, sd)
5702 domain_num++; 5756 domain_num++;
5703 entry = table = sd_alloc_ctl_entry(domain_num + 1); 5757 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5704 if (table == NULL) 5758 if (table == NULL)
5705 return NULL; 5759 return NULL;
5706 5760
5707 i = 0; 5761 i = 0;
5708 for_each_domain(cpu, sd) { 5762 for_each_domain(cpu, sd) {
5709 snprintf(buf, 32, "domain%d", i); 5763 snprintf(buf, 32, "domain%d", i);
5710 entry->procname = kstrdup(buf, GFP_KERNEL); 5764 entry->procname = kstrdup(buf, GFP_KERNEL);
5711 entry->mode = 0555; 5765 entry->mode = 0555;
5712 entry->child = sd_alloc_ctl_domain_table(sd); 5766 entry->child = sd_alloc_ctl_domain_table(sd);
5713 entry++; 5767 entry++;
5714 i++; 5768 i++;
5715 } 5769 }
5716 return table; 5770 return table;
5717 } 5771 }
5718 5772
5719 static struct ctl_table_header *sd_sysctl_header; 5773 static struct ctl_table_header *sd_sysctl_header;
5720 static void register_sched_domain_sysctl(void) 5774 static void register_sched_domain_sysctl(void)
5721 { 5775 {
5722 int i, cpu_num = num_possible_cpus(); 5776 int i, cpu_num = num_possible_cpus();
5723 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 5777 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5724 char buf[32]; 5778 char buf[32];
5725 5779
5726 WARN_ON(sd_ctl_dir[0].child); 5780 WARN_ON(sd_ctl_dir[0].child);
5727 sd_ctl_dir[0].child = entry; 5781 sd_ctl_dir[0].child = entry;
5728 5782
5729 if (entry == NULL) 5783 if (entry == NULL)
5730 return; 5784 return;
5731 5785
5732 for_each_possible_cpu(i) { 5786 for_each_possible_cpu(i) {
5733 snprintf(buf, 32, "cpu%d", i); 5787 snprintf(buf, 32, "cpu%d", i);
5734 entry->procname = kstrdup(buf, GFP_KERNEL); 5788 entry->procname = kstrdup(buf, GFP_KERNEL);
5735 entry->mode = 0555; 5789 entry->mode = 0555;
5736 entry->child = sd_alloc_ctl_cpu_table(i); 5790 entry->child = sd_alloc_ctl_cpu_table(i);
5737 entry++; 5791 entry++;
5738 } 5792 }
5739 5793
5740 WARN_ON(sd_sysctl_header); 5794 WARN_ON(sd_sysctl_header);
5741 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 5795 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5742 } 5796 }
5743 5797
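With CONFIG_SCHED_DEBUG and CONFIG_SYSCTL the tables registered above appear as /proc/sys/kernel/sched_domain/cpuN/domainM/{min_interval,...,flags,name}. A hedged userspace snippet that reads one entry (path assumed, present only on such configs):

#include <stdio.h>

int main(void)
{
	/* Assumed path; only exists once register_sched_domain_sysctl() ran. */
	const char *path = "/proc/sys/kernel/sched_domain/cpu0/domain0/name";
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("cpu0 domain0 name: %s", buf);
	fclose(f);
	return 0;
}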
5744 /* may be called multiple times per register */ 5798 /* may be called multiple times per register */
5745 static void unregister_sched_domain_sysctl(void) 5799 static void unregister_sched_domain_sysctl(void)
5746 { 5800 {
5747 if (sd_sysctl_header) 5801 if (sd_sysctl_header)
5748 unregister_sysctl_table(sd_sysctl_header); 5802 unregister_sysctl_table(sd_sysctl_header);
5749 sd_sysctl_header = NULL; 5803 sd_sysctl_header = NULL;
5750 if (sd_ctl_dir[0].child) 5804 if (sd_ctl_dir[0].child)
5751 sd_free_ctl_entry(&sd_ctl_dir[0].child); 5805 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5752 } 5806 }
5753 #else 5807 #else
5754 static void register_sched_domain_sysctl(void) 5808 static void register_sched_domain_sysctl(void)
5755 { 5809 {
5756 } 5810 }
5757 static void unregister_sched_domain_sysctl(void) 5811 static void unregister_sched_domain_sysctl(void)
5758 { 5812 {
5759 } 5813 }
5760 #endif 5814 #endif
5761 5815
5762 static void set_rq_online(struct rq *rq) 5816 static void set_rq_online(struct rq *rq)
5763 { 5817 {
5764 if (!rq->online) { 5818 if (!rq->online) {
5765 const struct sched_class *class; 5819 const struct sched_class *class;
5766 5820
5767 cpumask_set_cpu(rq->cpu, rq->rd->online); 5821 cpumask_set_cpu(rq->cpu, rq->rd->online);
5768 rq->online = 1; 5822 rq->online = 1;
5769 5823
5770 for_each_class(class) { 5824 for_each_class(class) {
5771 if (class->rq_online) 5825 if (class->rq_online)
5772 class->rq_online(rq); 5826 class->rq_online(rq);
5773 } 5827 }
5774 } 5828 }
5775 } 5829 }
5776 5830
5777 static void set_rq_offline(struct rq *rq) 5831 static void set_rq_offline(struct rq *rq)
5778 { 5832 {
5779 if (rq->online) { 5833 if (rq->online) {
5780 const struct sched_class *class; 5834 const struct sched_class *class;
5781 5835
5782 for_each_class(class) { 5836 for_each_class(class) {
5783 if (class->rq_offline) 5837 if (class->rq_offline)
5784 class->rq_offline(rq); 5838 class->rq_offline(rq);
5785 } 5839 }
5786 5840
5787 cpumask_clear_cpu(rq->cpu, rq->rd->online); 5841 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5788 rq->online = 0; 5842 rq->online = 0;
5789 } 5843 }
5790 } 5844 }
5791 5845
5792 /* 5846 /*
5793 * migration_call - callback that gets triggered when a CPU is added. 5847 * migration_call - callback that gets triggered when a CPU is added.
5794 * Here we can start up the necessary migration thread for the new CPU. 5848 * Here we can start up the necessary migration thread for the new CPU.
5795 */ 5849 */
5796 static int __cpuinit 5850 static int __cpuinit
5797 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5851 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5798 { 5852 {
5799 struct task_struct *p; 5853 struct task_struct *p;
5800 int cpu = (long)hcpu; 5854 int cpu = (long)hcpu;
5801 unsigned long flags; 5855 unsigned long flags;
5802 struct rq *rq; 5856 struct rq *rq;
5803 5857
5804 switch (action) { 5858 switch (action) {
5805 5859
5806 case CPU_UP_PREPARE: 5860 case CPU_UP_PREPARE:
5807 case CPU_UP_PREPARE_FROZEN: 5861 case CPU_UP_PREPARE_FROZEN:
5808 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); 5862 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5809 if (IS_ERR(p)) 5863 if (IS_ERR(p))
5810 return NOTIFY_BAD; 5864 return NOTIFY_BAD;
5811 kthread_bind(p, cpu); 5865 kthread_bind(p, cpu);
5812 /* Must be high prio: stop_machine expects to yield to it. */ 5866 /* Must be high prio: stop_machine expects to yield to it. */
5813 rq = task_rq_lock(p, &flags); 5867 rq = task_rq_lock(p, &flags);
5814 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5868 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5815 task_rq_unlock(rq, &flags); 5869 task_rq_unlock(rq, &flags);
5816 get_task_struct(p); 5870 get_task_struct(p);
5817 cpu_rq(cpu)->migration_thread = p; 5871 cpu_rq(cpu)->migration_thread = p;
5818 rq->calc_load_update = calc_load_update; 5872 rq->calc_load_update = calc_load_update;
5819 break; 5873 break;
5820 5874
5821 case CPU_ONLINE: 5875 case CPU_ONLINE:
5822 case CPU_ONLINE_FROZEN: 5876 case CPU_ONLINE_FROZEN:
5823 /* Strictly unnecessary, as first user will wake it. */ 5877 /* Strictly unnecessary, as first user will wake it. */
5824 wake_up_process(cpu_rq(cpu)->migration_thread); 5878 wake_up_process(cpu_rq(cpu)->migration_thread);
5825 5879
5826 /* Update our root-domain */ 5880 /* Update our root-domain */
5827 rq = cpu_rq(cpu); 5881 rq = cpu_rq(cpu);
5828 raw_spin_lock_irqsave(&rq->lock, flags); 5882 raw_spin_lock_irqsave(&rq->lock, flags);
5829 if (rq->rd) { 5883 if (rq->rd) {
5830 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5884 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5831 5885
5832 set_rq_online(rq); 5886 set_rq_online(rq);
5833 } 5887 }
5834 raw_spin_unlock_irqrestore(&rq->lock, flags); 5888 raw_spin_unlock_irqrestore(&rq->lock, flags);
5835 break; 5889 break;
5836 5890
5837 #ifdef CONFIG_HOTPLUG_CPU 5891 #ifdef CONFIG_HOTPLUG_CPU
5838 case CPU_UP_CANCELED: 5892 case CPU_UP_CANCELED:
5839 case CPU_UP_CANCELED_FROZEN: 5893 case CPU_UP_CANCELED_FROZEN:
5840 if (!cpu_rq(cpu)->migration_thread) 5894 if (!cpu_rq(cpu)->migration_thread)
5841 break; 5895 break;
5842 /* Unbind it from offline cpu so it can run. Fall thru. */ 5896 /* Unbind it from offline cpu so it can run. Fall thru. */
5843 kthread_bind(cpu_rq(cpu)->migration_thread, 5897 kthread_bind(cpu_rq(cpu)->migration_thread,
5844 cpumask_any(cpu_online_mask)); 5898 cpumask_any(cpu_online_mask));
5845 kthread_stop(cpu_rq(cpu)->migration_thread); 5899 kthread_stop(cpu_rq(cpu)->migration_thread);
5846 put_task_struct(cpu_rq(cpu)->migration_thread); 5900 put_task_struct(cpu_rq(cpu)->migration_thread);
5847 cpu_rq(cpu)->migration_thread = NULL; 5901 cpu_rq(cpu)->migration_thread = NULL;
5848 break; 5902 break;
5849 5903
5850 case CPU_DEAD: 5904 case CPU_DEAD:
5851 case CPU_DEAD_FROZEN: 5905 case CPU_DEAD_FROZEN:
5852 migrate_live_tasks(cpu); 5906 migrate_live_tasks(cpu);
5853 rq = cpu_rq(cpu); 5907 rq = cpu_rq(cpu);
5854 kthread_stop(rq->migration_thread); 5908 kthread_stop(rq->migration_thread);
5855 put_task_struct(rq->migration_thread); 5909 put_task_struct(rq->migration_thread);
5856 rq->migration_thread = NULL; 5910 rq->migration_thread = NULL;
5857 /* Idle task back to normal (off runqueue, low prio) */ 5911 /* Idle task back to normal (off runqueue, low prio) */
5858 raw_spin_lock_irq(&rq->lock); 5912 raw_spin_lock_irq(&rq->lock);
5859 deactivate_task(rq, rq->idle, 0); 5913 deactivate_task(rq, rq->idle, 0);
5860 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 5914 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5861 rq->idle->sched_class = &idle_sched_class; 5915 rq->idle->sched_class = &idle_sched_class;
5862 migrate_dead_tasks(cpu); 5916 migrate_dead_tasks(cpu);
5863 raw_spin_unlock_irq(&rq->lock); 5917 raw_spin_unlock_irq(&rq->lock);
5864 migrate_nr_uninterruptible(rq); 5918 migrate_nr_uninterruptible(rq);
5865 BUG_ON(rq->nr_running != 0); 5919 BUG_ON(rq->nr_running != 0);
5866 calc_global_load_remove(rq); 5920 calc_global_load_remove(rq);
5867 /* 5921 /*
5868 * No need to migrate the tasks: it was best-effort if 5922 * No need to migrate the tasks: it was best-effort if
5869 * they didn't take sched_hotcpu_mutex. Just wake up 5923 * they didn't take sched_hotcpu_mutex. Just wake up
5870 * the requestors. 5924 * the requestors.
5871 */ 5925 */
5872 raw_spin_lock_irq(&rq->lock); 5926 raw_spin_lock_irq(&rq->lock);
5873 while (!list_empty(&rq->migration_queue)) { 5927 while (!list_empty(&rq->migration_queue)) {
5874 struct migration_req *req; 5928 struct migration_req *req;
5875 5929
5876 req = list_entry(rq->migration_queue.next, 5930 req = list_entry(rq->migration_queue.next,
5877 struct migration_req, list); 5931 struct migration_req, list);
5878 list_del_init(&req->list); 5932 list_del_init(&req->list);
5879 raw_spin_unlock_irq(&rq->lock); 5933 raw_spin_unlock_irq(&rq->lock);
5880 complete(&req->done); 5934 complete(&req->done);
5881 raw_spin_lock_irq(&rq->lock); 5935 raw_spin_lock_irq(&rq->lock);
5882 } 5936 }
5883 raw_spin_unlock_irq(&rq->lock); 5937 raw_spin_unlock_irq(&rq->lock);
5884 break; 5938 break;
5885 5939
5886 case CPU_DYING: 5940 case CPU_DYING:
5887 case CPU_DYING_FROZEN: 5941 case CPU_DYING_FROZEN:
5888 /* Update our root-domain */ 5942 /* Update our root-domain */
5889 rq = cpu_rq(cpu); 5943 rq = cpu_rq(cpu);
5890 raw_spin_lock_irqsave(&rq->lock, flags); 5944 raw_spin_lock_irqsave(&rq->lock, flags);
5891 if (rq->rd) { 5945 if (rq->rd) {
5892 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5946 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5893 set_rq_offline(rq); 5947 set_rq_offline(rq);
5894 } 5948 }
5895 raw_spin_unlock_irqrestore(&rq->lock, flags); 5949 raw_spin_unlock_irqrestore(&rq->lock, flags);
5896 break; 5950 break;
5897 #endif 5951 #endif
5898 } 5952 }
5899 return NOTIFY_OK; 5953 return NOTIFY_OK;
5900 } 5954 }
5901 5955
5902 /* 5956 /*
5903 * Register at high priority so that task migration (migrate_all_tasks) 5957 * Register at high priority so that task migration (migrate_all_tasks)
5904 * happens before everything else. This has to be lower priority than 5958 * happens before everything else. This has to be lower priority than
5905 * the notifier in the perf_event subsystem, though. 5959 * the notifier in the perf_event subsystem, though.
5906 */ 5960 */
5907 static struct notifier_block __cpuinitdata migration_notifier = { 5961 static struct notifier_block __cpuinitdata migration_notifier = {
5908 .notifier_call = migration_call, 5962 .notifier_call = migration_call,
5909 .priority = 10 5963 .priority = 10
5910 }; 5964 };
5911 5965
5912 static int __init migration_init(void) 5966 static int __init migration_init(void)
5913 { 5967 {
5914 void *cpu = (void *)(long)smp_processor_id(); 5968 void *cpu = (void *)(long)smp_processor_id();
5915 int err; 5969 int err;
5916 5970
5917 /* Start one for the boot CPU: */ 5971 /* Start one for the boot CPU: */
5918 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5972 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5919 BUG_ON(err == NOTIFY_BAD); 5973 BUG_ON(err == NOTIFY_BAD);
5920 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5974 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5921 register_cpu_notifier(&migration_notifier); 5975 register_cpu_notifier(&migration_notifier);
5922 5976
5923 return 0; 5977 return 0;
5924 } 5978 }
5925 early_initcall(migration_init); 5979 early_initcall(migration_init);
5926 #endif 5980 #endif
5927 5981
5928 #ifdef CONFIG_SMP 5982 #ifdef CONFIG_SMP
5929 5983
5930 #ifdef CONFIG_SCHED_DEBUG 5984 #ifdef CONFIG_SCHED_DEBUG
5931 5985
5932 static __read_mostly int sched_domain_debug_enabled; 5986 static __read_mostly int sched_domain_debug_enabled;
5933 5987
5934 static int __init sched_domain_debug_setup(char *str) 5988 static int __init sched_domain_debug_setup(char *str)
5935 { 5989 {
5936 sched_domain_debug_enabled = 1; 5990 sched_domain_debug_enabled = 1;
5937 5991
5938 return 0; 5992 return 0;
5939 } 5993 }
5940 early_param("sched_debug", sched_domain_debug_setup); 5994 early_param("sched_debug", sched_domain_debug_setup);
5941 5995
5942 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5996 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5943 struct cpumask *groupmask) 5997 struct cpumask *groupmask)
5944 { 5998 {
5945 struct sched_group *group = sd->groups; 5999 struct sched_group *group = sd->groups;
5946 char str[256]; 6000 char str[256];
5947 6001
5948 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); 6002 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5949 cpumask_clear(groupmask); 6003 cpumask_clear(groupmask);
5950 6004
5951 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 6005 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5952 6006
5953 if (!(sd->flags & SD_LOAD_BALANCE)) { 6007 if (!(sd->flags & SD_LOAD_BALANCE)) {
5954 printk("does not load-balance\n"); 6008 printk("does not load-balance\n");
5955 if (sd->parent) 6009 if (sd->parent)
5956 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 6010 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5957 " has parent"); 6011 " has parent");
5958 return -1; 6012 return -1;
5959 } 6013 }
5960 6014
5961 printk(KERN_CONT "span %s level %s\n", str, sd->name); 6015 printk(KERN_CONT "span %s level %s\n", str, sd->name);
5962 6016
5963 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 6017 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5964 printk(KERN_ERR "ERROR: domain->span does not contain " 6018 printk(KERN_ERR "ERROR: domain->span does not contain "
5965 "CPU%d\n", cpu); 6019 "CPU%d\n", cpu);
5966 } 6020 }
5967 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 6021 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5968 printk(KERN_ERR "ERROR: domain->groups does not contain" 6022 printk(KERN_ERR "ERROR: domain->groups does not contain"
5969 " CPU%d\n", cpu); 6023 " CPU%d\n", cpu);
5970 } 6024 }
5971 6025
5972 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 6026 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5973 do { 6027 do {
5974 if (!group) { 6028 if (!group) {
5975 printk("\n"); 6029 printk("\n");
5976 printk(KERN_ERR "ERROR: group is NULL\n"); 6030 printk(KERN_ERR "ERROR: group is NULL\n");
5977 break; 6031 break;
5978 } 6032 }
5979 6033
5980 if (!group->cpu_power) { 6034 if (!group->cpu_power) {
5981 printk(KERN_CONT "\n"); 6035 printk(KERN_CONT "\n");
5982 printk(KERN_ERR "ERROR: domain->cpu_power not " 6036 printk(KERN_ERR "ERROR: domain->cpu_power not "
5983 "set\n"); 6037 "set\n");
5984 break; 6038 break;
5985 } 6039 }
5986 6040
5987 if (!cpumask_weight(sched_group_cpus(group))) { 6041 if (!cpumask_weight(sched_group_cpus(group))) {
5988 printk(KERN_CONT "\n"); 6042 printk(KERN_CONT "\n");
5989 printk(KERN_ERR "ERROR: empty group\n"); 6043 printk(KERN_ERR "ERROR: empty group\n");
5990 break; 6044 break;
5991 } 6045 }
5992 6046
5993 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 6047 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
5994 printk(KERN_CONT "\n"); 6048 printk(KERN_CONT "\n");
5995 printk(KERN_ERR "ERROR: repeated CPUs\n"); 6049 printk(KERN_ERR "ERROR: repeated CPUs\n");
5996 break; 6050 break;
5997 } 6051 }
5998 6052
5999 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 6053 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
6000 6054
6001 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6055 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6002 6056
6003 printk(KERN_CONT " %s", str); 6057 printk(KERN_CONT " %s", str);
6004 if (group->cpu_power != SCHED_LOAD_SCALE) { 6058 if (group->cpu_power != SCHED_LOAD_SCALE) {
6005 printk(KERN_CONT " (cpu_power = %d)", 6059 printk(KERN_CONT " (cpu_power = %d)",
6006 group->cpu_power); 6060 group->cpu_power);
6007 } 6061 }
6008 6062
6009 group = group->next; 6063 group = group->next;
6010 } while (group != sd->groups); 6064 } while (group != sd->groups);
6011 printk(KERN_CONT "\n"); 6065 printk(KERN_CONT "\n");
6012 6066
6013 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 6067 if (!cpumask_equal(sched_domain_span(sd), groupmask))
6014 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6068 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6015 6069
6016 if (sd->parent && 6070 if (sd->parent &&
6017 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 6071 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
6018 printk(KERN_ERR "ERROR: parent span is not a superset " 6072 printk(KERN_ERR "ERROR: parent span is not a superset "
6019 "of domain->span\n"); 6073 "of domain->span\n");
6020 return 0; 6074 return 0;
6021 } 6075 }
6022 6076
6023 static void sched_domain_debug(struct sched_domain *sd, int cpu) 6077 static void sched_domain_debug(struct sched_domain *sd, int cpu)
6024 { 6078 {
6025 cpumask_var_t groupmask; 6079 cpumask_var_t groupmask;
6026 int level = 0; 6080 int level = 0;
6027 6081
6028 if (!sched_domain_debug_enabled) 6082 if (!sched_domain_debug_enabled)
6029 return; 6083 return;
6030 6084
6031 if (!sd) { 6085 if (!sd) {
6032 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 6086 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
6033 return; 6087 return;
6034 } 6088 }
6035 6089
6036 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6090 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6037 6091
6038 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { 6092 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6039 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); 6093 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6040 return; 6094 return;
6041 } 6095 }
6042 6096
6043 for (;;) { 6097 for (;;) {
6044 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6098 if (sched_domain_debug_one(sd, cpu, level, groupmask))
6045 break; 6099 break;
6046 level++; 6100 level++;
6047 sd = sd->parent; 6101 sd = sd->parent;
6048 if (!sd) 6102 if (!sd)
6049 break; 6103 break;
6050 } 6104 }
6051 free_cpumask_var(groupmask); 6105 free_cpumask_var(groupmask);
6052 } 6106 }
6053 #else /* !CONFIG_SCHED_DEBUG */ 6107 #else /* !CONFIG_SCHED_DEBUG */
6054 # define sched_domain_debug(sd, cpu) do { } while (0) 6108 # define sched_domain_debug(sd, cpu) do { } while (0)
6055 #endif /* CONFIG_SCHED_DEBUG */ 6109 #endif /* CONFIG_SCHED_DEBUG */
6056 6110
6057 static int sd_degenerate(struct sched_domain *sd) 6111 static int sd_degenerate(struct sched_domain *sd)
6058 { 6112 {
6059 if (cpumask_weight(sched_domain_span(sd)) == 1) 6113 if (cpumask_weight(sched_domain_span(sd)) == 1)
6060 return 1; 6114 return 1;
6061 6115
6062 /* Following flags need at least 2 groups */ 6116 /* Following flags need at least 2 groups */
6063 if (sd->flags & (SD_LOAD_BALANCE | 6117 if (sd->flags & (SD_LOAD_BALANCE |
6064 SD_BALANCE_NEWIDLE | 6118 SD_BALANCE_NEWIDLE |
6065 SD_BALANCE_FORK | 6119 SD_BALANCE_FORK |
6066 SD_BALANCE_EXEC | 6120 SD_BALANCE_EXEC |
6067 SD_SHARE_CPUPOWER | 6121 SD_SHARE_CPUPOWER |
6068 SD_SHARE_PKG_RESOURCES)) { 6122 SD_SHARE_PKG_RESOURCES)) {
6069 if (sd->groups != sd->groups->next) 6123 if (sd->groups != sd->groups->next)
6070 return 0; 6124 return 0;
6071 } 6125 }
6072 6126
6073 /* Following flags don't use groups */ 6127 /* Following flags don't use groups */
6074 if (sd->flags & (SD_WAKE_AFFINE)) 6128 if (sd->flags & (SD_WAKE_AFFINE))
6075 return 0; 6129 return 0;
6076 6130
6077 return 1; 6131 return 1;
6078 } 6132 }
6079 6133
6080 static int 6134 static int
6081 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 6135 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6082 { 6136 {
6083 unsigned long cflags = sd->flags, pflags = parent->flags; 6137 unsigned long cflags = sd->flags, pflags = parent->flags;
6084 6138
6085 if (sd_degenerate(parent)) 6139 if (sd_degenerate(parent))
6086 return 1; 6140 return 1;
6087 6141
6088 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 6142 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
6089 return 0; 6143 return 0;
6090 6144
6091 /* Flags needing groups don't count if only 1 group in parent */ 6145 /* Flags needing groups don't count if only 1 group in parent */
6092 if (parent->groups == parent->groups->next) { 6146 if (parent->groups == parent->groups->next) {
6093 pflags &= ~(SD_LOAD_BALANCE | 6147 pflags &= ~(SD_LOAD_BALANCE |
6094 SD_BALANCE_NEWIDLE | 6148 SD_BALANCE_NEWIDLE |
6095 SD_BALANCE_FORK | 6149 SD_BALANCE_FORK |
6096 SD_BALANCE_EXEC | 6150 SD_BALANCE_EXEC |
6097 SD_SHARE_CPUPOWER | 6151 SD_SHARE_CPUPOWER |
6098 SD_SHARE_PKG_RESOURCES); 6152 SD_SHARE_PKG_RESOURCES);
6099 if (nr_node_ids == 1) 6153 if (nr_node_ids == 1)
6100 pflags &= ~SD_SERIALIZE; 6154 pflags &= ~SD_SERIALIZE;
6101 } 6155 }
6102 if (~cflags & pflags) 6156 if (~cflags & pflags)
6103 return 0; 6157 return 0;
6104 6158
6105 return 1; 6159 return 1;
6106 } 6160 }
6107 6161
6108 static void free_rootdomain(struct root_domain *rd) 6162 static void free_rootdomain(struct root_domain *rd)
6109 { 6163 {
6110 synchronize_sched(); 6164 synchronize_sched();
6111 6165
6112 cpupri_cleanup(&rd->cpupri); 6166 cpupri_cleanup(&rd->cpupri);
6113 6167
6114 free_cpumask_var(rd->rto_mask); 6168 free_cpumask_var(rd->rto_mask);
6115 free_cpumask_var(rd->online); 6169 free_cpumask_var(rd->online);
6116 free_cpumask_var(rd->span); 6170 free_cpumask_var(rd->span);
6117 kfree(rd); 6171 kfree(rd);
6118 } 6172 }
6119 6173
6120 static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6174 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6121 { 6175 {
6122 struct root_domain *old_rd = NULL; 6176 struct root_domain *old_rd = NULL;
6123 unsigned long flags; 6177 unsigned long flags;
6124 6178
6125 raw_spin_lock_irqsave(&rq->lock, flags); 6179 raw_spin_lock_irqsave(&rq->lock, flags);
6126 6180
6127 if (rq->rd) { 6181 if (rq->rd) {
6128 old_rd = rq->rd; 6182 old_rd = rq->rd;
6129 6183
6130 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 6184 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6131 set_rq_offline(rq); 6185 set_rq_offline(rq);
6132 6186
6133 cpumask_clear_cpu(rq->cpu, old_rd->span); 6187 cpumask_clear_cpu(rq->cpu, old_rd->span);
6134 6188
6135 /* 6189 /*
6136 * If we don't want to free the old_rd yet then 6190 * If we don't want to free the old_rd yet then
6137 * set old_rd to NULL to skip the freeing later 6191 * set old_rd to NULL to skip the freeing later
6138 * in this function: 6192 * in this function:
6139 */ 6193 */
6140 if (!atomic_dec_and_test(&old_rd->refcount)) 6194 if (!atomic_dec_and_test(&old_rd->refcount))
6141 old_rd = NULL; 6195 old_rd = NULL;
6142 } 6196 }
6143 6197
6144 atomic_inc(&rd->refcount); 6198 atomic_inc(&rd->refcount);
6145 rq->rd = rd; 6199 rq->rd = rd;
6146 6200
6147 cpumask_set_cpu(rq->cpu, rd->span); 6201 cpumask_set_cpu(rq->cpu, rd->span);
6148 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 6202 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
6149 set_rq_online(rq); 6203 set_rq_online(rq);
6150 6204
6151 raw_spin_unlock_irqrestore(&rq->lock, flags); 6205 raw_spin_unlock_irqrestore(&rq->lock, flags);
6152 6206
6153 if (old_rd) 6207 if (old_rd)
6154 free_rootdomain(old_rd); 6208 free_rootdomain(old_rd);
6155 } 6209 }
6156 6210
6157 static int init_rootdomain(struct root_domain *rd, bool bootmem) 6211 static int init_rootdomain(struct root_domain *rd, bool bootmem)
6158 { 6212 {
6159 gfp_t gfp = GFP_KERNEL; 6213 gfp_t gfp = GFP_KERNEL;
6160 6214
6161 memset(rd, 0, sizeof(*rd)); 6215 memset(rd, 0, sizeof(*rd));
6162 6216
6163 if (bootmem) 6217 if (bootmem)
6164 gfp = GFP_NOWAIT; 6218 gfp = GFP_NOWAIT;
6165 6219
6166 if (!alloc_cpumask_var(&rd->span, gfp)) 6220 if (!alloc_cpumask_var(&rd->span, gfp))
6167 goto out; 6221 goto out;
6168 if (!alloc_cpumask_var(&rd->online, gfp)) 6222 if (!alloc_cpumask_var(&rd->online, gfp))
6169 goto free_span; 6223 goto free_span;
6170 if (!alloc_cpumask_var(&rd->rto_mask, gfp)) 6224 if (!alloc_cpumask_var(&rd->rto_mask, gfp))
6171 goto free_online; 6225 goto free_online;
6172 6226
6173 if (cpupri_init(&rd->cpupri, bootmem) != 0) 6227 if (cpupri_init(&rd->cpupri, bootmem) != 0)
6174 goto free_rto_mask; 6228 goto free_rto_mask;
6175 return 0; 6229 return 0;
6176 6230
6177 free_rto_mask: 6231 free_rto_mask:
6178 free_cpumask_var(rd->rto_mask); 6232 free_cpumask_var(rd->rto_mask);
6179 free_online: 6233 free_online:
6180 free_cpumask_var(rd->online); 6234 free_cpumask_var(rd->online);
6181 free_span: 6235 free_span:
6182 free_cpumask_var(rd->span); 6236 free_cpumask_var(rd->span);
6183 out: 6237 out:
6184 return -ENOMEM; 6238 return -ENOMEM;
6185 } 6239 }
6186 6240
6187 static void init_defrootdomain(void) 6241 static void init_defrootdomain(void)
6188 { 6242 {
6189 init_rootdomain(&def_root_domain, true); 6243 init_rootdomain(&def_root_domain, true);
6190 6244
6191 atomic_set(&def_root_domain.refcount, 1); 6245 atomic_set(&def_root_domain.refcount, 1);
6192 } 6246 }
6193 6247
6194 static struct root_domain *alloc_rootdomain(void) 6248 static struct root_domain *alloc_rootdomain(void)
6195 { 6249 {
6196 struct root_domain *rd; 6250 struct root_domain *rd;
6197 6251
6198 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 6252 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6199 if (!rd) 6253 if (!rd)
6200 return NULL; 6254 return NULL;
6201 6255
6202 if (init_rootdomain(rd, false) != 0) { 6256 if (init_rootdomain(rd, false) != 0) {
6203 kfree(rd); 6257 kfree(rd);
6204 return NULL; 6258 return NULL;
6205 } 6259 }
6206 6260
6207 return rd; 6261 return rd;
6208 } 6262 }
6209 6263
6210 /* 6264 /*
6211 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6265 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6212 * hold the hotplug lock. 6266 * hold the hotplug lock.
6213 */ 6267 */
6214 static void 6268 static void
6215 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 6269 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6216 { 6270 {
6217 struct rq *rq = cpu_rq(cpu); 6271 struct rq *rq = cpu_rq(cpu);
6218 struct sched_domain *tmp; 6272 struct sched_domain *tmp;
6219 6273
6220 /* Remove the sched domains which do not contribute to scheduling. */ 6274 /* Remove the sched domains which do not contribute to scheduling. */
6221 for (tmp = sd; tmp; ) { 6275 for (tmp = sd; tmp; ) {
6222 struct sched_domain *parent = tmp->parent; 6276 struct sched_domain *parent = tmp->parent;
6223 if (!parent) 6277 if (!parent)
6224 break; 6278 break;
6225 6279
6226 if (sd_parent_degenerate(tmp, parent)) { 6280 if (sd_parent_degenerate(tmp, parent)) {
6227 tmp->parent = parent->parent; 6281 tmp->parent = parent->parent;
6228 if (parent->parent) 6282 if (parent->parent)
6229 parent->parent->child = tmp; 6283 parent->parent->child = tmp;
6230 } else 6284 } else
6231 tmp = tmp->parent; 6285 tmp = tmp->parent;
6232 } 6286 }
6233 6287
6234 if (sd && sd_degenerate(sd)) { 6288 if (sd && sd_degenerate(sd)) {
6235 sd = sd->parent; 6289 sd = sd->parent;
6236 if (sd) 6290 if (sd)
6237 sd->child = NULL; 6291 sd->child = NULL;
6238 } 6292 }
6239 6293
6240 sched_domain_debug(sd, cpu); 6294 sched_domain_debug(sd, cpu);
6241 6295
6242 rq_attach_root(rq, rd); 6296 rq_attach_root(rq, rd);
6243 rcu_assign_pointer(rq->sd, sd); 6297 rcu_assign_pointer(rq->sd, sd);
6244 } 6298 }
6245 6299
6246 /* cpus with isolated domains */ 6300 /* cpus with isolated domains */
6247 static cpumask_var_t cpu_isolated_map; 6301 static cpumask_var_t cpu_isolated_map;
6248 6302
6249 /* Set up the mask of cpus configured for isolated domains */ 6303 /* Set up the mask of cpus configured for isolated domains */
6250 static int __init isolated_cpu_setup(char *str) 6304 static int __init isolated_cpu_setup(char *str)
6251 { 6305 {
6252 alloc_bootmem_cpumask_var(&cpu_isolated_map); 6306 alloc_bootmem_cpumask_var(&cpu_isolated_map);
6253 cpulist_parse(str, cpu_isolated_map); 6307 cpulist_parse(str, cpu_isolated_map);
6254 return 1; 6308 return 1;
6255 } 6309 }
6256 6310
6257 __setup("isolcpus=", isolated_cpu_setup); 6311 __setup("isolcpus=", isolated_cpu_setup);
6258 6312
6259 /* 6313 /*
6260 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 6314 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6261 * to a function which identifies what group (along with sched group) a CPU 6315 * to a function which identifies what group (along with sched group) a CPU
6262 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids 6316 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
6263 * (due to the fact that we keep track of groups covered with a struct cpumask). 6317 * (due to the fact that we keep track of groups covered with a struct cpumask).
6264 * 6318 *
6265 * init_sched_build_groups will build a circular linked list of the groups 6319 * init_sched_build_groups will build a circular linked list of the groups
6266 * covered by the given span, and will set each group's ->cpumask correctly, 6320 * covered by the given span, and will set each group's ->cpumask correctly,
6267 * and ->cpu_power to 0. 6321 * and ->cpu_power to 0.
6268 */ 6322 */
6269 static void 6323 static void
6270 init_sched_build_groups(const struct cpumask *span, 6324 init_sched_build_groups(const struct cpumask *span,
6271 const struct cpumask *cpu_map, 6325 const struct cpumask *cpu_map,
6272 int (*group_fn)(int cpu, const struct cpumask *cpu_map, 6326 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6273 struct sched_group **sg, 6327 struct sched_group **sg,
6274 struct cpumask *tmpmask), 6328 struct cpumask *tmpmask),
6275 struct cpumask *covered, struct cpumask *tmpmask) 6329 struct cpumask *covered, struct cpumask *tmpmask)
6276 { 6330 {
6277 struct sched_group *first = NULL, *last = NULL; 6331 struct sched_group *first = NULL, *last = NULL;
6278 int i; 6332 int i;
6279 6333
6280 cpumask_clear(covered); 6334 cpumask_clear(covered);
6281 6335
6282 for_each_cpu(i, span) { 6336 for_each_cpu(i, span) {
6283 struct sched_group *sg; 6337 struct sched_group *sg;
6284 int group = group_fn(i, cpu_map, &sg, tmpmask); 6338 int group = group_fn(i, cpu_map, &sg, tmpmask);
6285 int j; 6339 int j;
6286 6340
6287 if (cpumask_test_cpu(i, covered)) 6341 if (cpumask_test_cpu(i, covered))
6288 continue; 6342 continue;
6289 6343
6290 cpumask_clear(sched_group_cpus(sg)); 6344 cpumask_clear(sched_group_cpus(sg));
6291 sg->cpu_power = 0; 6345 sg->cpu_power = 0;
6292 6346
6293 for_each_cpu(j, span) { 6347 for_each_cpu(j, span) {
6294 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 6348 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6295 continue; 6349 continue;
6296 6350
6297 cpumask_set_cpu(j, covered); 6351 cpumask_set_cpu(j, covered);
6298 cpumask_set_cpu(j, sched_group_cpus(sg)); 6352 cpumask_set_cpu(j, sched_group_cpus(sg));
6299 } 6353 }
6300 if (!first) 6354 if (!first)
6301 first = sg; 6355 first = sg;
6302 if (last) 6356 if (last)
6303 last->next = sg; 6357 last->next = sg;
6304 last = sg; 6358 last = sg;
6305 } 6359 }
6306 last->next = first; 6360 last->next = first;
6307 } 6361 }
6308 6362
6309 #define SD_NODES_PER_DOMAIN 16 6363 #define SD_NODES_PER_DOMAIN 16
6310 6364
6311 #ifdef CONFIG_NUMA 6365 #ifdef CONFIG_NUMA
6312 6366
6313 /** 6367 /**
6314 * find_next_best_node - find the next node to include in a sched_domain 6368 * find_next_best_node - find the next node to include in a sched_domain
6315 * @node: node whose sched_domain we're building 6369 * @node: node whose sched_domain we're building
6316 * @used_nodes: nodes already in the sched_domain 6370 * @used_nodes: nodes already in the sched_domain
6317 * 6371 *
6318 * Find the next node to include in a given scheduling domain. Simply 6372 * Find the next node to include in a given scheduling domain. Simply
6319 * finds the closest node not already in the @used_nodes map. 6373 * finds the closest node not already in the @used_nodes map.
6320 * 6374 *
6321 * Should use nodemask_t. 6375 * Should use nodemask_t.
6322 */ 6376 */
6323 static int find_next_best_node(int node, nodemask_t *used_nodes) 6377 static int find_next_best_node(int node, nodemask_t *used_nodes)
6324 { 6378 {
6325 int i, n, val, min_val, best_node = 0; 6379 int i, n, val, min_val, best_node = 0;
6326 6380
6327 min_val = INT_MAX; 6381 min_val = INT_MAX;
6328 6382
6329 for (i = 0; i < nr_node_ids; i++) { 6383 for (i = 0; i < nr_node_ids; i++) {
6330 /* Start at @node */ 6384 /* Start at @node */
6331 n = (node + i) % nr_node_ids; 6385 n = (node + i) % nr_node_ids;
6332 6386
6333 if (!nr_cpus_node(n)) 6387 if (!nr_cpus_node(n))
6334 continue; 6388 continue;
6335 6389
6336 /* Skip already used nodes */ 6390 /* Skip already used nodes */
6337 if (node_isset(n, *used_nodes)) 6391 if (node_isset(n, *used_nodes))
6338 continue; 6392 continue;
6339 6393
6340 /* Simple min distance search */ 6394 /* Simple min distance search */
6341 val = node_distance(node, n); 6395 val = node_distance(node, n);
6342 6396
6343 if (val < min_val) { 6397 if (val < min_val) {
6344 min_val = val; 6398 min_val = val;
6345 best_node = n; 6399 best_node = n;
6346 } 6400 }
6347 } 6401 }
6348 6402
6349 node_set(best_node, *used_nodes); 6403 node_set(best_node, *used_nodes);
6350 return best_node; 6404 return best_node;
6351 } 6405 }
6352 6406
6353 /** 6407 /**
6354 * sched_domain_node_span - get a cpumask for a node's sched_domain 6408 * sched_domain_node_span - get a cpumask for a node's sched_domain
6355 * @node: node whose cpumask we're constructing 6409 * @node: node whose cpumask we're constructing
6356 * @span: resulting cpumask 6410 * @span: resulting cpumask
6357 * 6411 *
6358 * Given a node, construct a good cpumask for its sched_domain to span. It 6412 * Given a node, construct a good cpumask for its sched_domain to span. It
6359 * should be one that prevents unnecessary balancing, but also spreads tasks 6413 * should be one that prevents unnecessary balancing, but also spreads tasks
6360 * out optimally. 6414 * out optimally.
6361 */ 6415 */
6362 static void sched_domain_node_span(int node, struct cpumask *span) 6416 static void sched_domain_node_span(int node, struct cpumask *span)
6363 { 6417 {
6364 nodemask_t used_nodes; 6418 nodemask_t used_nodes;
6365 int i; 6419 int i;
6366 6420
6367 cpumask_clear(span); 6421 cpumask_clear(span);
6368 nodes_clear(used_nodes); 6422 nodes_clear(used_nodes);
6369 6423
6370 cpumask_or(span, span, cpumask_of_node(node)); 6424 cpumask_or(span, span, cpumask_of_node(node));
6371 node_set(node, used_nodes); 6425 node_set(node, used_nodes);
6372 6426
6373 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6427 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6374 int next_node = find_next_best_node(node, &used_nodes); 6428 int next_node = find_next_best_node(node, &used_nodes);
6375 6429
6376 cpumask_or(span, span, cpumask_of_node(next_node)); 6430 cpumask_or(span, span, cpumask_of_node(next_node));
6377 } 6431 }
6378 } 6432 }
6379 #endif /* CONFIG_NUMA */ 6433 #endif /* CONFIG_NUMA */
6380 6434
6381 int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6435 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6382 6436
6383 /* 6437 /*
6384 * The cpus mask in sched_group and sched_domain hangs off the end. 6438 * The cpus mask in sched_group and sched_domain hangs off the end.
6385 * 6439 *
6386 * ( See the comments in include/linux/sched.h:struct sched_group 6440 * ( See the comments in include/linux/sched.h:struct sched_group
6387 * and struct sched_domain. ) 6441 * and struct sched_domain. )
6388 */ 6442 */
6389 struct static_sched_group { 6443 struct static_sched_group {
6390 struct sched_group sg; 6444 struct sched_group sg;
6391 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); 6445 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6392 }; 6446 };
6393 6447
6394 struct static_sched_domain { 6448 struct static_sched_domain {
6395 struct sched_domain sd; 6449 struct sched_domain sd;
6396 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 6450 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
6397 }; 6451 };
6398 6452
6399 struct s_data { 6453 struct s_data {
6400 #ifdef CONFIG_NUMA 6454 #ifdef CONFIG_NUMA
6401 int sd_allnodes; 6455 int sd_allnodes;
6402 cpumask_var_t domainspan; 6456 cpumask_var_t domainspan;
6403 cpumask_var_t covered; 6457 cpumask_var_t covered;
6404 cpumask_var_t notcovered; 6458 cpumask_var_t notcovered;
6405 #endif 6459 #endif
6406 cpumask_var_t nodemask; 6460 cpumask_var_t nodemask;
6407 cpumask_var_t this_sibling_map; 6461 cpumask_var_t this_sibling_map;
6408 cpumask_var_t this_core_map; 6462 cpumask_var_t this_core_map;
6409 cpumask_var_t send_covered; 6463 cpumask_var_t send_covered;
6410 cpumask_var_t tmpmask; 6464 cpumask_var_t tmpmask;
6411 struct sched_group **sched_group_nodes; 6465 struct sched_group **sched_group_nodes;
6412 struct root_domain *rd; 6466 struct root_domain *rd;
6413 }; 6467 };
6414 6468
6415 enum s_alloc { 6469 enum s_alloc {
6416 sa_sched_groups = 0, 6470 sa_sched_groups = 0,
6417 sa_rootdomain, 6471 sa_rootdomain,
6418 sa_tmpmask, 6472 sa_tmpmask,
6419 sa_send_covered, 6473 sa_send_covered,
6420 sa_this_core_map, 6474 sa_this_core_map,
6421 sa_this_sibling_map, 6475 sa_this_sibling_map,
6422 sa_nodemask, 6476 sa_nodemask,
6423 sa_sched_group_nodes, 6477 sa_sched_group_nodes,
6424 #ifdef CONFIG_NUMA 6478 #ifdef CONFIG_NUMA
6425 sa_notcovered, 6479 sa_notcovered,
6426 sa_covered, 6480 sa_covered,
6427 sa_domainspan, 6481 sa_domainspan,
6428 #endif 6482 #endif
6429 sa_none, 6483 sa_none,
6430 }; 6484 };
6431 6485
6432 /* 6486 /*
6433 * SMT sched-domains: 6487 * SMT sched-domains:
6434 */ 6488 */
6435 #ifdef CONFIG_SCHED_SMT 6489 #ifdef CONFIG_SCHED_SMT
6436 static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); 6490 static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6437 static DEFINE_PER_CPU(struct static_sched_group, sched_groups); 6491 static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6438 6492
6439 static int 6493 static int
6440 cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 6494 cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
6441 struct sched_group **sg, struct cpumask *unused) 6495 struct sched_group **sg, struct cpumask *unused)
6442 { 6496 {
6443 if (sg) 6497 if (sg)
6444 *sg = &per_cpu(sched_groups, cpu).sg; 6498 *sg = &per_cpu(sched_groups, cpu).sg;
6445 return cpu; 6499 return cpu;
6446 } 6500 }
6447 #endif /* CONFIG_SCHED_SMT */ 6501 #endif /* CONFIG_SCHED_SMT */
6448 6502
6449 /* 6503 /*
6450 * multi-core sched-domains: 6504 * multi-core sched-domains:
6451 */ 6505 */
6452 #ifdef CONFIG_SCHED_MC 6506 #ifdef CONFIG_SCHED_MC
6453 static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6507 static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6454 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 6508 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6455 #endif /* CONFIG_SCHED_MC */ 6509 #endif /* CONFIG_SCHED_MC */
6456 6510
6457 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6511 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6458 static int 6512 static int
6459 cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6513 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6460 struct sched_group **sg, struct cpumask *mask) 6514 struct sched_group **sg, struct cpumask *mask)
6461 { 6515 {
6462 int group; 6516 int group;
6463 6517
6464 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 6518 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6465 group = cpumask_first(mask); 6519 group = cpumask_first(mask);
6466 if (sg) 6520 if (sg)
6467 *sg = &per_cpu(sched_group_core, group).sg; 6521 *sg = &per_cpu(sched_group_core, group).sg;
6468 return group; 6522 return group;
6469 } 6523 }
6470 #elif defined(CONFIG_SCHED_MC) 6524 #elif defined(CONFIG_SCHED_MC)
6471 static int 6525 static int
6472 cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6526 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6473 struct sched_group **sg, struct cpumask *unused) 6527 struct sched_group **sg, struct cpumask *unused)
6474 { 6528 {
6475 if (sg) 6529 if (sg)
6476 *sg = &per_cpu(sched_group_core, cpu).sg; 6530 *sg = &per_cpu(sched_group_core, cpu).sg;
6477 return cpu; 6531 return cpu;
6478 } 6532 }
6479 #endif 6533 #endif
6480 6534
6481 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6535 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
6482 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6536 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
6483 6537
6484 static int 6538 static int
6485 cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, 6539 cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6486 struct sched_group **sg, struct cpumask *mask) 6540 struct sched_group **sg, struct cpumask *mask)
6487 { 6541 {
6488 int group; 6542 int group;
6489 #ifdef CONFIG_SCHED_MC 6543 #ifdef CONFIG_SCHED_MC
6490 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 6544 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6491 group = cpumask_first(mask); 6545 group = cpumask_first(mask);
6492 #elif defined(CONFIG_SCHED_SMT) 6546 #elif defined(CONFIG_SCHED_SMT)
6493 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 6547 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6494 group = cpumask_first(mask); 6548 group = cpumask_first(mask);
6495 #else 6549 #else
6496 group = cpu; 6550 group = cpu;
6497 #endif 6551 #endif
6498 if (sg) 6552 if (sg)
6499 *sg = &per_cpu(sched_group_phys, group).sg; 6553 *sg = &per_cpu(sched_group_phys, group).sg;
6500 return group; 6554 return group;
6501 } 6555 }
6502 6556
6503 #ifdef CONFIG_NUMA 6557 #ifdef CONFIG_NUMA
6504 /* 6558 /*
6505 * The init_sched_build_groups can't handle what we want to do with node 6559 * The init_sched_build_groups can't handle what we want to do with node
6506 * groups, so roll our own. Now each node has its own list of groups which 6560 * groups, so roll our own. Now each node has its own list of groups which
6507 * gets dynamically allocated. 6561 * gets dynamically allocated.
6508 */ 6562 */
6509 static DEFINE_PER_CPU(struct static_sched_domain, node_domains); 6563 static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
6510 static struct sched_group ***sched_group_nodes_bycpu; 6564 static struct sched_group ***sched_group_nodes_bycpu;
6511 6565
6512 static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); 6566 static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
6513 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); 6567 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
6514 6568
6515 static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, 6569 static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
6516 struct sched_group **sg, 6570 struct sched_group **sg,
6517 struct cpumask *nodemask) 6571 struct cpumask *nodemask)
6518 { 6572 {
6519 int group; 6573 int group;
6520 6574
6521 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); 6575 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
6522 group = cpumask_first(nodemask); 6576 group = cpumask_first(nodemask);
6523 6577
6524 if (sg) 6578 if (sg)
6525 *sg = &per_cpu(sched_group_allnodes, group).sg; 6579 *sg = &per_cpu(sched_group_allnodes, group).sg;
6526 return group; 6580 return group;
6527 } 6581 }
6528 6582
6529 static void init_numa_sched_groups_power(struct sched_group *group_head) 6583 static void init_numa_sched_groups_power(struct sched_group *group_head)
6530 { 6584 {
6531 struct sched_group *sg = group_head; 6585 struct sched_group *sg = group_head;
6532 int j; 6586 int j;
6533 6587
6534 if (!sg) 6588 if (!sg)
6535 return; 6589 return;
6536 do { 6590 do {
6537 for_each_cpu(j, sched_group_cpus(sg)) { 6591 for_each_cpu(j, sched_group_cpus(sg)) {
6538 struct sched_domain *sd; 6592 struct sched_domain *sd;
6539 6593
6540 sd = &per_cpu(phys_domains, j).sd; 6594 sd = &per_cpu(phys_domains, j).sd;
6541 if (j != group_first_cpu(sd->groups)) { 6595 if (j != group_first_cpu(sd->groups)) {
6542 /* 6596 /*
6543 * Only add "power" once for each 6597 * Only add "power" once for each
6544 * physical package. 6598 * physical package.
6545 */ 6599 */
6546 continue; 6600 continue;
6547 } 6601 }
6548 6602
6549 sg->cpu_power += sd->groups->cpu_power; 6603 sg->cpu_power += sd->groups->cpu_power;
6550 } 6604 }
6551 sg = sg->next; 6605 sg = sg->next;
6552 } while (sg != group_head); 6606 } while (sg != group_head);
6553 } 6607 }
6554 6608
6555 static int build_numa_sched_groups(struct s_data *d, 6609 static int build_numa_sched_groups(struct s_data *d,
6556 const struct cpumask *cpu_map, int num) 6610 const struct cpumask *cpu_map, int num)
6557 { 6611 {
6558 struct sched_domain *sd; 6612 struct sched_domain *sd;
6559 struct sched_group *sg, *prev; 6613 struct sched_group *sg, *prev;
6560 int n, j; 6614 int n, j;
6561 6615
6562 cpumask_clear(d->covered); 6616 cpumask_clear(d->covered);
6563 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); 6617 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
6564 if (cpumask_empty(d->nodemask)) { 6618 if (cpumask_empty(d->nodemask)) {
6565 d->sched_group_nodes[num] = NULL; 6619 d->sched_group_nodes[num] = NULL;
6566 goto out; 6620 goto out;
6567 } 6621 }
6568 6622
6569 sched_domain_node_span(num, d->domainspan); 6623 sched_domain_node_span(num, d->domainspan);
6570 cpumask_and(d->domainspan, d->domainspan, cpu_map); 6624 cpumask_and(d->domainspan, d->domainspan, cpu_map);
6571 6625
6572 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 6626 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
6573 GFP_KERNEL, num); 6627 GFP_KERNEL, num);
6574 if (!sg) { 6628 if (!sg) {
6575 printk(KERN_WARNING "Can not alloc domain group for node %d\n", 6629 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
6576 num); 6630 num);
6577 return -ENOMEM; 6631 return -ENOMEM;
6578 } 6632 }
6579 d->sched_group_nodes[num] = sg; 6633 d->sched_group_nodes[num] = sg;
6580 6634
6581 for_each_cpu(j, d->nodemask) { 6635 for_each_cpu(j, d->nodemask) {
6582 sd = &per_cpu(node_domains, j).sd; 6636 sd = &per_cpu(node_domains, j).sd;
6583 sd->groups = sg; 6637 sd->groups = sg;
6584 } 6638 }
6585 6639
6586 sg->cpu_power = 0; 6640 sg->cpu_power = 0;
6587 cpumask_copy(sched_group_cpus(sg), d->nodemask); 6641 cpumask_copy(sched_group_cpus(sg), d->nodemask);
6588 sg->next = sg; 6642 sg->next = sg;
6589 cpumask_or(d->covered, d->covered, d->nodemask); 6643 cpumask_or(d->covered, d->covered, d->nodemask);
6590 6644
6591 prev = sg; 6645 prev = sg;
6592 for (j = 0; j < nr_node_ids; j++) { 6646 for (j = 0; j < nr_node_ids; j++) {
6593 n = (num + j) % nr_node_ids; 6647 n = (num + j) % nr_node_ids;
6594 cpumask_complement(d->notcovered, d->covered); 6648 cpumask_complement(d->notcovered, d->covered);
6595 cpumask_and(d->tmpmask, d->notcovered, cpu_map); 6649 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
6596 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); 6650 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
6597 if (cpumask_empty(d->tmpmask)) 6651 if (cpumask_empty(d->tmpmask))
6598 break; 6652 break;
6599 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); 6653 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
6600 if (cpumask_empty(d->tmpmask)) 6654 if (cpumask_empty(d->tmpmask))
6601 continue; 6655 continue;
6602 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 6656 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
6603 GFP_KERNEL, num); 6657 GFP_KERNEL, num);
6604 if (!sg) { 6658 if (!sg) {
6605 printk(KERN_WARNING 6659 printk(KERN_WARNING
6606 "Can not alloc domain group for node %d\n", j); 6660 "Can not alloc domain group for node %d\n", j);
6607 return -ENOMEM; 6661 return -ENOMEM;
6608 } 6662 }
6609 sg->cpu_power = 0; 6663 sg->cpu_power = 0;
6610 cpumask_copy(sched_group_cpus(sg), d->tmpmask); 6664 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
6611 sg->next = prev->next; 6665 sg->next = prev->next;
6612 cpumask_or(d->covered, d->covered, d->tmpmask); 6666 cpumask_or(d->covered, d->covered, d->tmpmask);
6613 prev->next = sg; 6667 prev->next = sg;
6614 prev = sg; 6668 prev = sg;
6615 } 6669 }
6616 out: 6670 out:
6617 return 0; 6671 return 0;
6618 } 6672 }
6619 #endif /* CONFIG_NUMA */ 6673 #endif /* CONFIG_NUMA */
6620 6674
6621 #ifdef CONFIG_NUMA 6675 #ifdef CONFIG_NUMA
6622 /* Free memory allocated for various sched_group structures */ 6676 /* Free memory allocated for various sched_group structures */
6623 static void free_sched_groups(const struct cpumask *cpu_map, 6677 static void free_sched_groups(const struct cpumask *cpu_map,
6624 struct cpumask *nodemask) 6678 struct cpumask *nodemask)
6625 { 6679 {
6626 int cpu, i; 6680 int cpu, i;
6627 6681
6628 for_each_cpu(cpu, cpu_map) { 6682 for_each_cpu(cpu, cpu_map) {
6629 struct sched_group **sched_group_nodes 6683 struct sched_group **sched_group_nodes
6630 = sched_group_nodes_bycpu[cpu]; 6684 = sched_group_nodes_bycpu[cpu];
6631 6685
6632 if (!sched_group_nodes) 6686 if (!sched_group_nodes)
6633 continue; 6687 continue;
6634 6688
6635 for (i = 0; i < nr_node_ids; i++) { 6689 for (i = 0; i < nr_node_ids; i++) {
6636 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 6690 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6637 6691
6638 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 6692 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
6639 if (cpumask_empty(nodemask)) 6693 if (cpumask_empty(nodemask))
6640 continue; 6694 continue;
6641 6695
6642 if (sg == NULL) 6696 if (sg == NULL)
6643 continue; 6697 continue;
6644 sg = sg->next; 6698 sg = sg->next;
6645 next_sg: 6699 next_sg:
6646 oldsg = sg; 6700 oldsg = sg;
6647 sg = sg->next; 6701 sg = sg->next;
6648 kfree(oldsg); 6702 kfree(oldsg);
6649 if (oldsg != sched_group_nodes[i]) 6703 if (oldsg != sched_group_nodes[i])
6650 goto next_sg; 6704 goto next_sg;
6651 } 6705 }
6652 kfree(sched_group_nodes); 6706 kfree(sched_group_nodes);
6653 sched_group_nodes_bycpu[cpu] = NULL; 6707 sched_group_nodes_bycpu[cpu] = NULL;
6654 } 6708 }
6655 } 6709 }
6656 #else /* !CONFIG_NUMA */ 6710 #else /* !CONFIG_NUMA */
6657 static void free_sched_groups(const struct cpumask *cpu_map, 6711 static void free_sched_groups(const struct cpumask *cpu_map,
6658 struct cpumask *nodemask) 6712 struct cpumask *nodemask)
6659 { 6713 {
6660 } 6714 }
6661 #endif /* CONFIG_NUMA */ 6715 #endif /* CONFIG_NUMA */
6662 6716
6663 /* 6717 /*
6664 * Initialize sched groups cpu_power. 6718 * Initialize sched groups cpu_power.
6665 * 6719 *
6666 * cpu_power indicates the capacity of a sched group, which is used while 6720 * cpu_power indicates the capacity of a sched group, which is used while
6667 * distributing the load between different sched groups in a sched domain. 6721 * distributing the load between different sched groups in a sched domain.
6668 * Typically cpu_power for all the groups in a sched domain will be the same unless 6722 * Typically cpu_power for all the groups in a sched domain will be the same unless
6669 * there are asymmetries in the topology. If there are asymmetries, the group 6723 * there are asymmetries in the topology. If there are asymmetries, the group
6670 * having more cpu_power will pick up more load compared to the group having 6724 * having more cpu_power will pick up more load compared to the group having
6671 * less cpu_power. 6725 * less cpu_power.
6672 */ 6726 */
6673 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 6727 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6674 { 6728 {
6675 struct sched_domain *child; 6729 struct sched_domain *child;
6676 struct sched_group *group; 6730 struct sched_group *group;
6677 long power; 6731 long power;
6678 int weight; 6732 int weight;
6679 6733
6680 WARN_ON(!sd || !sd->groups); 6734 WARN_ON(!sd || !sd->groups);
6681 6735
6682 if (cpu != group_first_cpu(sd->groups)) 6736 if (cpu != group_first_cpu(sd->groups))
6683 return; 6737 return;
6684 6738
6685 child = sd->child; 6739 child = sd->child;
6686 6740
6687 sd->groups->cpu_power = 0; 6741 sd->groups->cpu_power = 0;
6688 6742
6689 if (!child) { 6743 if (!child) {
6690 power = SCHED_LOAD_SCALE; 6744 power = SCHED_LOAD_SCALE;
6691 weight = cpumask_weight(sched_domain_span(sd)); 6745 weight = cpumask_weight(sched_domain_span(sd));
6692 /* 6746 /*
6693 * SMT siblings share the power of a single core. 6747 * SMT siblings share the power of a single core.
6694 * Usually multiple threads get a better yield out of 6748 * Usually multiple threads get a better yield out of
6695 * that one core than a single thread would have, 6749 * that one core than a single thread would have,
6696 * reflect that in sd->smt_gain. 6750 * reflect that in sd->smt_gain.
6697 */ 6751 */
6698 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 6752 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
6699 power *= sd->smt_gain; 6753 power *= sd->smt_gain;
6700 power /= weight; 6754 power /= weight;
6701 power >>= SCHED_LOAD_SHIFT; 6755 power >>= SCHED_LOAD_SHIFT;
6702 } 6756 }
6703 sd->groups->cpu_power += power; 6757 sd->groups->cpu_power += power;
6704 return; 6758 return;
6705 } 6759 }
6706 6760
6707 /* 6761 /*
6708 * Add cpu_power of each child group to this groups cpu_power. 6762 * Add cpu_power of each child group to this groups cpu_power.
6709 */ 6763 */
6710 group = child->groups; 6764 group = child->groups;
6711 do { 6765 do {
6712 sd->groups->cpu_power += group->cpu_power; 6766 sd->groups->cpu_power += group->cpu_power;
6713 group = group->next; 6767 group = group->next;
6714 } while (group != child->groups); 6768 } while (group != child->groups);
6715 } 6769 }
6716 6770
6717 /* 6771 /*
6718 * Initializers for schedule domains 6772 * Initializers for schedule domains
6719 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 6773 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
6720 */ 6774 */
6721 6775
6722 #ifdef CONFIG_SCHED_DEBUG 6776 #ifdef CONFIG_SCHED_DEBUG
6723 # define SD_INIT_NAME(sd, type) sd->name = #type 6777 # define SD_INIT_NAME(sd, type) sd->name = #type
6724 #else 6778 #else
6725 # define SD_INIT_NAME(sd, type) do { } while (0) 6779 # define SD_INIT_NAME(sd, type) do { } while (0)
6726 #endif 6780 #endif
6727 6781
6728 #define SD_INIT(sd, type) sd_init_##type(sd) 6782 #define SD_INIT(sd, type) sd_init_##type(sd)
6729 6783
6730 #define SD_INIT_FUNC(type) \ 6784 #define SD_INIT_FUNC(type) \
6731 static noinline void sd_init_##type(struct sched_domain *sd) \ 6785 static noinline void sd_init_##type(struct sched_domain *sd) \
6732 { \ 6786 { \
6733 memset(sd, 0, sizeof(*sd)); \ 6787 memset(sd, 0, sizeof(*sd)); \
6734 *sd = SD_##type##_INIT; \ 6788 *sd = SD_##type##_INIT; \
6735 sd->level = SD_LV_##type; \ 6789 sd->level = SD_LV_##type; \
6736 SD_INIT_NAME(sd, type); \ 6790 SD_INIT_NAME(sd, type); \
6737 } 6791 }
6738 6792
6739 SD_INIT_FUNC(CPU) 6793 SD_INIT_FUNC(CPU)
6740 #ifdef CONFIG_NUMA 6794 #ifdef CONFIG_NUMA
6741 SD_INIT_FUNC(ALLNODES) 6795 SD_INIT_FUNC(ALLNODES)
6742 SD_INIT_FUNC(NODE) 6796 SD_INIT_FUNC(NODE)
6743 #endif 6797 #endif
6744 #ifdef CONFIG_SCHED_SMT 6798 #ifdef CONFIG_SCHED_SMT
6745 SD_INIT_FUNC(SIBLING) 6799 SD_INIT_FUNC(SIBLING)
6746 #endif 6800 #endif
6747 #ifdef CONFIG_SCHED_MC 6801 #ifdef CONFIG_SCHED_MC
6748 SD_INIT_FUNC(MC) 6802 SD_INIT_FUNC(MC)
6749 #endif 6803 #endif
6750 6804
6751 static int default_relax_domain_level = -1; 6805 static int default_relax_domain_level = -1;
6752 6806
6753 static int __init setup_relax_domain_level(char *str) 6807 static int __init setup_relax_domain_level(char *str)
6754 { 6808 {
6755 unsigned long val; 6809 unsigned long val;
6756 6810
6757 val = simple_strtoul(str, NULL, 0); 6811 val = simple_strtoul(str, NULL, 0);
6758 if (val < SD_LV_MAX) 6812 if (val < SD_LV_MAX)
6759 default_relax_domain_level = val; 6813 default_relax_domain_level = val;
6760 6814
6761 return 1; 6815 return 1;
6762 } 6816 }
6763 __setup("relax_domain_level=", setup_relax_domain_level); 6817 __setup("relax_domain_level=", setup_relax_domain_level);
6764 6818
6765 static void set_domain_attribute(struct sched_domain *sd, 6819 static void set_domain_attribute(struct sched_domain *sd,
6766 struct sched_domain_attr *attr) 6820 struct sched_domain_attr *attr)
6767 { 6821 {
6768 int request; 6822 int request;
6769 6823
6770 if (!attr || attr->relax_domain_level < 0) { 6824 if (!attr || attr->relax_domain_level < 0) {
6771 if (default_relax_domain_level < 0) 6825 if (default_relax_domain_level < 0)
6772 return; 6826 return;
6773 else 6827 else
6774 request = default_relax_domain_level; 6828 request = default_relax_domain_level;
6775 } else 6829 } else
6776 request = attr->relax_domain_level; 6830 request = attr->relax_domain_level;
6777 if (request < sd->level) { 6831 if (request < sd->level) {
6778 /* turn off idle balance on this domain */ 6832 /* turn off idle balance on this domain */
6779 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 6833 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6780 } else { 6834 } else {
6781 /* turn on idle balance on this domain */ 6835 /* turn on idle balance on this domain */
6782 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 6836 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6783 } 6837 }
6784 } 6838 }
6785 6839
6786 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 6840 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6787 const struct cpumask *cpu_map) 6841 const struct cpumask *cpu_map)
6788 { 6842 {
6789 switch (what) { 6843 switch (what) {
6790 case sa_sched_groups: 6844 case sa_sched_groups:
6791 free_sched_groups(cpu_map, d->tmpmask); /* fall through */ 6845 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
6792 d->sched_group_nodes = NULL; 6846 d->sched_group_nodes = NULL;
6793 case sa_rootdomain: 6847 case sa_rootdomain:
6794 free_rootdomain(d->rd); /* fall through */ 6848 free_rootdomain(d->rd); /* fall through */
6795 case sa_tmpmask: 6849 case sa_tmpmask:
6796 free_cpumask_var(d->tmpmask); /* fall through */ 6850 free_cpumask_var(d->tmpmask); /* fall through */
6797 case sa_send_covered: 6851 case sa_send_covered:
6798 free_cpumask_var(d->send_covered); /* fall through */ 6852 free_cpumask_var(d->send_covered); /* fall through */
6799 case sa_this_core_map: 6853 case sa_this_core_map:
6800 free_cpumask_var(d->this_core_map); /* fall through */ 6854 free_cpumask_var(d->this_core_map); /* fall through */
6801 case sa_this_sibling_map: 6855 case sa_this_sibling_map:
6802 free_cpumask_var(d->this_sibling_map); /* fall through */ 6856 free_cpumask_var(d->this_sibling_map); /* fall through */
6803 case sa_nodemask: 6857 case sa_nodemask:
6804 free_cpumask_var(d->nodemask); /* fall through */ 6858 free_cpumask_var(d->nodemask); /* fall through */
6805 case sa_sched_group_nodes: 6859 case sa_sched_group_nodes:
6806 #ifdef CONFIG_NUMA 6860 #ifdef CONFIG_NUMA
6807 kfree(d->sched_group_nodes); /* fall through */ 6861 kfree(d->sched_group_nodes); /* fall through */
6808 case sa_notcovered: 6862 case sa_notcovered:
6809 free_cpumask_var(d->notcovered); /* fall through */ 6863 free_cpumask_var(d->notcovered); /* fall through */
6810 case sa_covered: 6864 case sa_covered:
6811 free_cpumask_var(d->covered); /* fall through */ 6865 free_cpumask_var(d->covered); /* fall through */
6812 case sa_domainspan: 6866 case sa_domainspan:
6813 free_cpumask_var(d->domainspan); /* fall through */ 6867 free_cpumask_var(d->domainspan); /* fall through */
6814 #endif 6868 #endif
6815 case sa_none: 6869 case sa_none:
6816 break; 6870 break;
6817 } 6871 }
6818 } 6872 }
6819 6873
6820 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 6874 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6821 const struct cpumask *cpu_map) 6875 const struct cpumask *cpu_map)
6822 { 6876 {
6823 #ifdef CONFIG_NUMA 6877 #ifdef CONFIG_NUMA
6824 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 6878 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
6825 return sa_none; 6879 return sa_none;
6826 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 6880 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
6827 return sa_domainspan; 6881 return sa_domainspan;
6828 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 6882 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
6829 return sa_covered; 6883 return sa_covered;
6830 /* Allocate the per-node list of sched groups */ 6884 /* Allocate the per-node list of sched groups */
6831 d->sched_group_nodes = kcalloc(nr_node_ids, 6885 d->sched_group_nodes = kcalloc(nr_node_ids,
6832 sizeof(struct sched_group *), GFP_KERNEL); 6886 sizeof(struct sched_group *), GFP_KERNEL);
6833 if (!d->sched_group_nodes) { 6887 if (!d->sched_group_nodes) {
6834 printk(KERN_WARNING "Can not alloc sched group node list\n"); 6888 printk(KERN_WARNING "Can not alloc sched group node list\n");
6835 return sa_notcovered; 6889 return sa_notcovered;
6836 } 6890 }
6837 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; 6891 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
6838 #endif 6892 #endif
6839 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) 6893 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
6840 return sa_sched_group_nodes; 6894 return sa_sched_group_nodes;
6841 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) 6895 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
6842 return sa_nodemask; 6896 return sa_nodemask;
6843 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) 6897 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
6844 return sa_this_sibling_map; 6898 return sa_this_sibling_map;
6845 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) 6899 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
6846 return sa_this_core_map; 6900 return sa_this_core_map;
6847 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) 6901 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
6848 return sa_send_covered; 6902 return sa_send_covered;
6849 d->rd = alloc_rootdomain(); 6903 d->rd = alloc_rootdomain();
6850 if (!d->rd) { 6904 if (!d->rd) {
6851 printk(KERN_WARNING "Cannot alloc root domain\n"); 6905 printk(KERN_WARNING "Cannot alloc root domain\n");
6852 return sa_tmpmask; 6906 return sa_tmpmask;
6853 } 6907 }
6854 return sa_rootdomain; 6908 return sa_rootdomain;
6855 } 6909 }
6856 6910
6857 static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 6911 static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
6858 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 6912 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
6859 { 6913 {
6860 struct sched_domain *sd = NULL; 6914 struct sched_domain *sd = NULL;
6861 #ifdef CONFIG_NUMA 6915 #ifdef CONFIG_NUMA
6862 struct sched_domain *parent; 6916 struct sched_domain *parent;
6863 6917
6864 d->sd_allnodes = 0; 6918 d->sd_allnodes = 0;
6865 if (cpumask_weight(cpu_map) > 6919 if (cpumask_weight(cpu_map) >
6866 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { 6920 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
6867 sd = &per_cpu(allnodes_domains, i).sd; 6921 sd = &per_cpu(allnodes_domains, i).sd;
6868 SD_INIT(sd, ALLNODES); 6922 SD_INIT(sd, ALLNODES);
6869 set_domain_attribute(sd, attr); 6923 set_domain_attribute(sd, attr);
6870 cpumask_copy(sched_domain_span(sd), cpu_map); 6924 cpumask_copy(sched_domain_span(sd), cpu_map);
6871 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); 6925 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
6872 d->sd_allnodes = 1; 6926 d->sd_allnodes = 1;
6873 } 6927 }
6874 parent = sd; 6928 parent = sd;
6875 6929
6876 sd = &per_cpu(node_domains, i).sd; 6930 sd = &per_cpu(node_domains, i).sd;
6877 SD_INIT(sd, NODE); 6931 SD_INIT(sd, NODE);
6878 set_domain_attribute(sd, attr); 6932 set_domain_attribute(sd, attr);
6879 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); 6933 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
6880 sd->parent = parent; 6934 sd->parent = parent;
6881 if (parent) 6935 if (parent)
6882 parent->child = sd; 6936 parent->child = sd;
6883 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); 6937 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
6884 #endif 6938 #endif
6885 return sd; 6939 return sd;
6886 } 6940 }
6887 6941
6888 static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 6942 static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
6889 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6943 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6890 struct sched_domain *parent, int i) 6944 struct sched_domain *parent, int i)
6891 { 6945 {
6892 struct sched_domain *sd; 6946 struct sched_domain *sd;
6893 sd = &per_cpu(phys_domains, i).sd; 6947 sd = &per_cpu(phys_domains, i).sd;
6894 SD_INIT(sd, CPU); 6948 SD_INIT(sd, CPU);
6895 set_domain_attribute(sd, attr); 6949 set_domain_attribute(sd, attr);
6896 cpumask_copy(sched_domain_span(sd), d->nodemask); 6950 cpumask_copy(sched_domain_span(sd), d->nodemask);
6897 sd->parent = parent; 6951 sd->parent = parent;
6898 if (parent) 6952 if (parent)
6899 parent->child = sd; 6953 parent->child = sd;
6900 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); 6954 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
6901 return sd; 6955 return sd;
6902 } 6956 }
6903 6957
6904 static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 6958 static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
6905 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6959 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6906 struct sched_domain *parent, int i) 6960 struct sched_domain *parent, int i)
6907 { 6961 {
6908 struct sched_domain *sd = parent; 6962 struct sched_domain *sd = parent;
6909 #ifdef CONFIG_SCHED_MC 6963 #ifdef CONFIG_SCHED_MC
6910 sd = &per_cpu(core_domains, i).sd; 6964 sd = &per_cpu(core_domains, i).sd;
6911 SD_INIT(sd, MC); 6965 SD_INIT(sd, MC);
6912 set_domain_attribute(sd, attr); 6966 set_domain_attribute(sd, attr);
6913 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); 6967 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
6914 sd->parent = parent; 6968 sd->parent = parent;
6915 parent->child = sd; 6969 parent->child = sd;
6916 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); 6970 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
6917 #endif 6971 #endif
6918 return sd; 6972 return sd;
6919 } 6973 }
6920 6974
6921 static struct sched_domain *__build_smt_sched_domain(struct s_data *d, 6975 static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
6922 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6976 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6923 struct sched_domain *parent, int i) 6977 struct sched_domain *parent, int i)
6924 { 6978 {
6925 struct sched_domain *sd = parent; 6979 struct sched_domain *sd = parent;
6926 #ifdef CONFIG_SCHED_SMT 6980 #ifdef CONFIG_SCHED_SMT
6927 sd = &per_cpu(cpu_domains, i).sd; 6981 sd = &per_cpu(cpu_domains, i).sd;
6928 SD_INIT(sd, SIBLING); 6982 SD_INIT(sd, SIBLING);
6929 set_domain_attribute(sd, attr); 6983 set_domain_attribute(sd, attr);
6930 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); 6984 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
6931 sd->parent = parent; 6985 sd->parent = parent;
6932 parent->child = sd; 6986 parent->child = sd;
6933 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); 6987 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
6934 #endif 6988 #endif
6935 return sd; 6989 return sd;
6936 } 6990 }
6937 6991
6938 static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 6992 static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
6939 const struct cpumask *cpu_map, int cpu) 6993 const struct cpumask *cpu_map, int cpu)
6940 { 6994 {
6941 switch (l) { 6995 switch (l) {
6942 #ifdef CONFIG_SCHED_SMT 6996 #ifdef CONFIG_SCHED_SMT
6943 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 6997 case SD_LV_SIBLING: /* set up CPU (sibling) groups */
6944 cpumask_and(d->this_sibling_map, cpu_map, 6998 cpumask_and(d->this_sibling_map, cpu_map,
6945 topology_thread_cpumask(cpu)); 6999 topology_thread_cpumask(cpu));
6946 if (cpu == cpumask_first(d->this_sibling_map)) 7000 if (cpu == cpumask_first(d->this_sibling_map))
6947 init_sched_build_groups(d->this_sibling_map, cpu_map, 7001 init_sched_build_groups(d->this_sibling_map, cpu_map,
6948 &cpu_to_cpu_group, 7002 &cpu_to_cpu_group,
6949 d->send_covered, d->tmpmask); 7003 d->send_covered, d->tmpmask);
6950 break; 7004 break;
6951 #endif 7005 #endif
6952 #ifdef CONFIG_SCHED_MC 7006 #ifdef CONFIG_SCHED_MC
6953 case SD_LV_MC: /* set up multi-core groups */ 7007 case SD_LV_MC: /* set up multi-core groups */
6954 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); 7008 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
6955 if (cpu == cpumask_first(d->this_core_map)) 7009 if (cpu == cpumask_first(d->this_core_map))
6956 init_sched_build_groups(d->this_core_map, cpu_map, 7010 init_sched_build_groups(d->this_core_map, cpu_map,
6957 &cpu_to_core_group, 7011 &cpu_to_core_group,
6958 d->send_covered, d->tmpmask); 7012 d->send_covered, d->tmpmask);
6959 break; 7013 break;
6960 #endif 7014 #endif
6961 case SD_LV_CPU: /* set up physical groups */ 7015 case SD_LV_CPU: /* set up physical groups */
6962 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7016 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
6963 if (!cpumask_empty(d->nodemask)) 7017 if (!cpumask_empty(d->nodemask))
6964 init_sched_build_groups(d->nodemask, cpu_map, 7018 init_sched_build_groups(d->nodemask, cpu_map,
6965 &cpu_to_phys_group, 7019 &cpu_to_phys_group,
6966 d->send_covered, d->tmpmask); 7020 d->send_covered, d->tmpmask);
6967 break; 7021 break;
6968 #ifdef CONFIG_NUMA 7022 #ifdef CONFIG_NUMA
6969 case SD_LV_ALLNODES: 7023 case SD_LV_ALLNODES:
6970 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7024 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
6971 d->send_covered, d->tmpmask); 7025 d->send_covered, d->tmpmask);
6972 break; 7026 break;
6973 #endif 7027 #endif
6974 default: 7028 default:
6975 break; 7029 break;
6976 } 7030 }
6977 } 7031 }
6978 7032
6979 /* 7033 /*
6980 * Build sched domains for a given set of cpus and attach the sched domains 7034 * Build sched domains for a given set of cpus and attach the sched domains
6981 * to the individual cpus 7035 * to the individual cpus
6982 */ 7036 */
6983 static int __build_sched_domains(const struct cpumask *cpu_map, 7037 static int __build_sched_domains(const struct cpumask *cpu_map,
6984 struct sched_domain_attr *attr) 7038 struct sched_domain_attr *attr)
6985 { 7039 {
6986 enum s_alloc alloc_state = sa_none; 7040 enum s_alloc alloc_state = sa_none;
6987 struct s_data d; 7041 struct s_data d;
6988 struct sched_domain *sd; 7042 struct sched_domain *sd;
6989 int i; 7043 int i;
6990 #ifdef CONFIG_NUMA 7044 #ifdef CONFIG_NUMA
6991 d.sd_allnodes = 0; 7045 d.sd_allnodes = 0;
6992 #endif 7046 #endif
6993 7047
6994 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7048 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6995 if (alloc_state != sa_rootdomain) 7049 if (alloc_state != sa_rootdomain)
6996 goto error; 7050 goto error;
6997 alloc_state = sa_sched_groups; 7051 alloc_state = sa_sched_groups;
6998 7052
6999 /* 7053 /*
7000 * Set up domains for cpus specified by the cpu_map. 7054 * Set up domains for cpus specified by the cpu_map.
7001 */ 7055 */
7002 for_each_cpu(i, cpu_map) { 7056 for_each_cpu(i, cpu_map) {
7003 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), 7057 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
7004 cpu_map); 7058 cpu_map);
7005 7059
7006 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7060 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
7007 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7061 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7008 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); 7062 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7009 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); 7063 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7010 } 7064 }
7011 7065
7012 for_each_cpu(i, cpu_map) { 7066 for_each_cpu(i, cpu_map) {
7013 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7067 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7014 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7068 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7015 } 7069 }
7016 7070
7017 /* Set up physical groups */ 7071 /* Set up physical groups */
7018 for (i = 0; i < nr_node_ids; i++) 7072 for (i = 0; i < nr_node_ids; i++)
7019 build_sched_groups(&d, SD_LV_CPU, cpu_map, i); 7073 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
7020 7074
7021 #ifdef CONFIG_NUMA 7075 #ifdef CONFIG_NUMA
7022 /* Set up node groups */ 7076 /* Set up node groups */
7023 if (d.sd_allnodes) 7077 if (d.sd_allnodes)
7024 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); 7078 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7025 7079
7026 for (i = 0; i < nr_node_ids; i++) 7080 for (i = 0; i < nr_node_ids; i++)
7027 if (build_numa_sched_groups(&d, cpu_map, i)) 7081 if (build_numa_sched_groups(&d, cpu_map, i))
7028 goto error; 7082 goto error;
7029 #endif 7083 #endif
7030 7084
7031 /* Calculate CPU power for physical packages and nodes */ 7085 /* Calculate CPU power for physical packages and nodes */
7032 #ifdef CONFIG_SCHED_SMT 7086 #ifdef CONFIG_SCHED_SMT
7033 for_each_cpu(i, cpu_map) { 7087 for_each_cpu(i, cpu_map) {
7034 sd = &per_cpu(cpu_domains, i).sd; 7088 sd = &per_cpu(cpu_domains, i).sd;
7035 init_sched_groups_power(i, sd); 7089 init_sched_groups_power(i, sd);
7036 } 7090 }
7037 #endif 7091 #endif
7038 #ifdef CONFIG_SCHED_MC 7092 #ifdef CONFIG_SCHED_MC
7039 for_each_cpu(i, cpu_map) { 7093 for_each_cpu(i, cpu_map) {
7040 sd = &per_cpu(core_domains, i).sd; 7094 sd = &per_cpu(core_domains, i).sd;
7041 init_sched_groups_power(i, sd); 7095 init_sched_groups_power(i, sd);
7042 } 7096 }
7043 #endif 7097 #endif
7044 7098
7045 for_each_cpu(i, cpu_map) { 7099 for_each_cpu(i, cpu_map) {
7046 sd = &per_cpu(phys_domains, i).sd; 7100 sd = &per_cpu(phys_domains, i).sd;
7047 init_sched_groups_power(i, sd); 7101 init_sched_groups_power(i, sd);
7048 } 7102 }
7049 7103
7050 #ifdef CONFIG_NUMA 7104 #ifdef CONFIG_NUMA
7051 for (i = 0; i < nr_node_ids; i++) 7105 for (i = 0; i < nr_node_ids; i++)
7052 init_numa_sched_groups_power(d.sched_group_nodes[i]); 7106 init_numa_sched_groups_power(d.sched_group_nodes[i]);
7053 7107
7054 if (d.sd_allnodes) { 7108 if (d.sd_allnodes) {
7055 struct sched_group *sg; 7109 struct sched_group *sg;
7056 7110
7057 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7111 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
7058 d.tmpmask); 7112 d.tmpmask);
7059 init_numa_sched_groups_power(sg); 7113 init_numa_sched_groups_power(sg);
7060 } 7114 }
7061 #endif 7115 #endif
7062 7116
7063 /* Attach the domains */ 7117 /* Attach the domains */
7064 for_each_cpu(i, cpu_map) { 7118 for_each_cpu(i, cpu_map) {
7065 #ifdef CONFIG_SCHED_SMT 7119 #ifdef CONFIG_SCHED_SMT
7066 sd = &per_cpu(cpu_domains, i).sd; 7120 sd = &per_cpu(cpu_domains, i).sd;
7067 #elif defined(CONFIG_SCHED_MC) 7121 #elif defined(CONFIG_SCHED_MC)
7068 sd = &per_cpu(core_domains, i).sd; 7122 sd = &per_cpu(core_domains, i).sd;
7069 #else 7123 #else
7070 sd = &per_cpu(phys_domains, i).sd; 7124 sd = &per_cpu(phys_domains, i).sd;
7071 #endif 7125 #endif
7072 cpu_attach_domain(sd, d.rd, i); 7126 cpu_attach_domain(sd, d.rd, i);
7073 } 7127 }
7074 7128
7075 d.sched_group_nodes = NULL; /* don't free this we still need it */ 7129 d.sched_group_nodes = NULL; /* don't free this we still need it */
7076 __free_domain_allocs(&d, sa_tmpmask, cpu_map); 7130 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7077 return 0; 7131 return 0;
7078 7132
7079 error: 7133 error:
7080 __free_domain_allocs(&d, alloc_state, cpu_map); 7134 __free_domain_allocs(&d, alloc_state, cpu_map);
7081 return -ENOMEM; 7135 return -ENOMEM;
7082 } 7136 }
7083 7137
7084 static int build_sched_domains(const struct cpumask *cpu_map) 7138 static int build_sched_domains(const struct cpumask *cpu_map)
7085 { 7139 {
7086 return __build_sched_domains(cpu_map, NULL); 7140 return __build_sched_domains(cpu_map, NULL);
7087 } 7141 }
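The per-CPU loop above builds one sched_domain per topology level and chains them child to parent, so a CPU's lowest-level domain (SMT, if configured) sits below MC, which sits below the physical/CPU level and, on NUMA machines, below the node and allnodes levels. As a minimal sketch of inspecting the finished hierarchy, assuming CONFIG_SCHED_DEBUG so that sd->name is populated and that the domains are already attached:

static void print_domain_hierarchy(int cpu)
{
	struct sched_domain *sd;

	/* walk from the lowest attached level up through the ->parent chain */
	for_each_domain(cpu, sd)
		printk(KERN_DEBUG "cpu%d: %s spans %u cpus\n",
		       cpu, sd->name, cpumask_weight(sched_domain_span(sd)));
}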
7088 7142
7089 static cpumask_var_t *doms_cur; /* current sched domains */ 7143 static cpumask_var_t *doms_cur; /* current sched domains */
7090 static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7144 static int ndoms_cur; /* number of sched domains in 'doms_cur' */
7091 static struct sched_domain_attr *dattr_cur; 7145 static struct sched_domain_attr *dattr_cur;
7092 /* attributes of custom domains in 'doms_cur' */ 7146 /* attributes of custom domains in 'doms_cur' */
7093 7147
7094 /* 7148 /*
7095 * Special case: If a kmalloc of a doms_cur partition (array of 7149 * Special case: If a kmalloc of a doms_cur partition (array of
7096 * cpumask) fails, then fallback to a single sched domain, 7150 * cpumask) fails, then fallback to a single sched domain,
7097 * as determined by the single cpumask fallback_doms. 7151 * as determined by the single cpumask fallback_doms.
7098 */ 7152 */
7099 static cpumask_var_t fallback_doms; 7153 static cpumask_var_t fallback_doms;
7100 7154
7101 /* 7155 /*
7102 * arch_update_cpu_topology lets virtualized architectures update the 7156 * arch_update_cpu_topology lets virtualized architectures update the
7103 * cpu core maps. It is supposed to return 1 if the topology changed 7157 * cpu core maps. It is supposed to return 1 if the topology changed
7104 * or 0 if it stayed the same. 7158 * or 0 if it stayed the same.
7105 */ 7159 */
7106 int __attribute__((weak)) arch_update_cpu_topology(void) 7160 int __attribute__((weak)) arch_update_cpu_topology(void)
7107 { 7161 {
7108 return 0; 7162 return 0;
7109 } 7163 }
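The weak definition above is only a default; an architecture that can change its topology at run time (s390 does this for virtualized CPU maps) supplies a strong definition that refreshes the core maps and returns 1 so the caller knows to rebuild the domains. A hedged sketch of such an override, where arch_topology_changed() and arch_refresh_core_maps() are made-up stand-ins for the platform-specific work:

int arch_update_cpu_topology(void)
{
	/* hypothetical platform hook: did the hypervisor move any cpus? */
	if (!arch_topology_changed())
		return 0;

	/* hypothetical: rebuild cpu_core_map and friends */
	arch_refresh_core_maps();

	return 1;	/* topology changed: sched domains must be rebuilt */
}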
7110 7164
7111 cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 7165 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
7112 { 7166 {
7113 int i; 7167 int i;
7114 cpumask_var_t *doms; 7168 cpumask_var_t *doms;
7115 7169
7116 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 7170 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
7117 if (!doms) 7171 if (!doms)
7118 return NULL; 7172 return NULL;
7119 for (i = 0; i < ndoms; i++) { 7173 for (i = 0; i < ndoms; i++) {
7120 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 7174 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
7121 free_sched_domains(doms, i); 7175 free_sched_domains(doms, i);
7122 return NULL; 7176 return NULL;
7123 } 7177 }
7124 } 7178 }
7125 return doms; 7179 return doms;
7126 } 7180 }
7127 7181
7128 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 7182 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7129 { 7183 {
7130 unsigned int i; 7184 unsigned int i;
7131 for (i = 0; i < ndoms; i++) 7185 for (i = 0; i < ndoms; i++)
7132 free_cpumask_var(doms[i]); 7186 free_cpumask_var(doms[i]);
7133 kfree(doms); 7187 kfree(doms);
7134 } 7188 }
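alloc_sched_domains() either returns a fully usable array or NULL: if any per-entry cpumask allocation fails it frees what it had already allocated before giving up, so callers never see a half-initialized array. A minimal usage sketch of the alloc/free pairing:

static int example_two_partitions(void)
{
	cpumask_var_t *doms;
	int i;

	doms = alloc_sched_domains(2);	/* NULL if any allocation failed */
	if (!doms)
		return -ENOMEM;

	for (i = 0; i < 2; i++)
		cpumask_clear(doms[i]);	/* the masks start uninitialized */

	/* ... fill in two non-overlapping masks and use them ... */

	free_sched_domains(doms, 2);	/* frees each mask, then the array */
	return 0;
}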
7135 7189
7136 /* 7190 /*
7137 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 7191 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
7138 * For now this just excludes isolated cpus, but could be used to 7192 * For now this just excludes isolated cpus, but could be used to
7139 * exclude other special cases in the future. 7193 * exclude other special cases in the future.
7140 */ 7194 */
7141 static int arch_init_sched_domains(const struct cpumask *cpu_map) 7195 static int arch_init_sched_domains(const struct cpumask *cpu_map)
7142 { 7196 {
7143 int err; 7197 int err;
7144 7198
7145 arch_update_cpu_topology(); 7199 arch_update_cpu_topology();
7146 ndoms_cur = 1; 7200 ndoms_cur = 1;
7147 doms_cur = alloc_sched_domains(ndoms_cur); 7201 doms_cur = alloc_sched_domains(ndoms_cur);
7148 if (!doms_cur) 7202 if (!doms_cur)
7149 doms_cur = &fallback_doms; 7203 doms_cur = &fallback_doms;
7150 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7204 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7151 dattr_cur = NULL; 7205 dattr_cur = NULL;
7152 err = build_sched_domains(doms_cur[0]); 7206 err = build_sched_domains(doms_cur[0]);
7153 register_sched_domain_sysctl(); 7207 register_sched_domain_sysctl();
7154 7208
7155 return err; 7209 return err;
7156 } 7210 }
7157 7211
7158 static void arch_destroy_sched_domains(const struct cpumask *cpu_map, 7212 static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7159 struct cpumask *tmpmask) 7213 struct cpumask *tmpmask)
7160 { 7214 {
7161 free_sched_groups(cpu_map, tmpmask); 7215 free_sched_groups(cpu_map, tmpmask);
7162 } 7216 }
7163 7217
7164 /* 7218 /*
7165 * Detach sched domains from a group of cpus specified in cpu_map 7219 * Detach sched domains from a group of cpus specified in cpu_map
7166 * These cpus will now be attached to the NULL domain 7220 * These cpus will now be attached to the NULL domain
7167 */ 7221 */
7168 static void detach_destroy_domains(const struct cpumask *cpu_map) 7222 static void detach_destroy_domains(const struct cpumask *cpu_map)
7169 { 7223 {
7170 /* Save because hotplug lock held. */ 7224 /* Save because hotplug lock held. */
7171 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); 7225 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7172 int i; 7226 int i;
7173 7227
7174 for_each_cpu(i, cpu_map) 7228 for_each_cpu(i, cpu_map)
7175 cpu_attach_domain(NULL, &def_root_domain, i); 7229 cpu_attach_domain(NULL, &def_root_domain, i);
7176 synchronize_sched(); 7230 synchronize_sched();
7177 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); 7231 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7178 } 7232 }
7179 7233
7180 /* handle null as "default" */ 7234 /* handle null as "default" */
7181 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 7235 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7182 struct sched_domain_attr *new, int idx_new) 7236 struct sched_domain_attr *new, int idx_new)
7183 { 7237 {
7184 struct sched_domain_attr tmp; 7238 struct sched_domain_attr tmp;
7185 7239
7186 /* fast path */ 7240 /* fast path */
7187 if (!new && !cur) 7241 if (!new && !cur)
7188 return 1; 7242 return 1;
7189 7243
7190 tmp = SD_ATTR_INIT; 7244 tmp = SD_ATTR_INIT;
7191 return !memcmp(cur ? (cur + idx_cur) : &tmp, 7245 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7192 new ? (new + idx_new) : &tmp, 7246 new ? (new + idx_new) : &tmp,
7193 sizeof(struct sched_domain_attr)); 7247 sizeof(struct sched_domain_attr));
7194 } 7248 }
7195 7249
7196 /* 7250 /*
7197 * Partition sched domains as specified by the 'ndoms_new' 7251 * Partition sched domains as specified by the 'ndoms_new'
7198 * cpumasks in the array doms_new[] of cpumasks. This compares 7252 * cpumasks in the array doms_new[] of cpumasks. This compares
7199 * doms_new[] to the current sched domain partitioning, doms_cur[]. 7253 * doms_new[] to the current sched domain partitioning, doms_cur[].
7200 * It destroys each deleted domain and builds each new domain. 7254 * It destroys each deleted domain and builds each new domain.
7201 * 7255 *
7202 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 7256 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
7203 * The masks don't intersect (don't overlap). We should set up one 7257 * The masks don't intersect (don't overlap). We should set up one
7204 * sched domain for each mask. CPUs not in any of the cpumasks will 7258 * sched domain for each mask. CPUs not in any of the cpumasks will
7205 * not be load balanced. If the same cpumask appears both in the 7259 * not be load balanced. If the same cpumask appears both in the
7206 * current 'doms_cur' domains and in the new 'doms_new', we can leave 7260 * current 'doms_cur' domains and in the new 'doms_new', we can leave
7207 * it as it is. 7261 * it as it is.
7208 * 7262 *
7209 * The passed in 'doms_new' should be allocated using 7263 * The passed in 'doms_new' should be allocated using
7210 * alloc_sched_domains. This routine takes ownership of it and will 7264 * alloc_sched_domains. This routine takes ownership of it and will
7211 * free_sched_domains it when done with it. If the caller failed the 7265 * free_sched_domains it when done with it. If the caller failed the
7212 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 7266 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
7213 * and partition_sched_domains() will fallback to the single partition 7267 * and partition_sched_domains() will fallback to the single partition
7214 * 'fallback_doms'; it also forces the domains to be rebuilt. 7268 * 'fallback_doms'; it also forces the domains to be rebuilt.
7215 * 7269 *
7216 * If doms_new == NULL it will be replaced with cpu_online_mask. 7270 * If doms_new == NULL it will be replaced with cpu_online_mask.
7217 * ndoms_new == 0 is a special case for destroying existing domains, 7271 * ndoms_new == 0 is a special case for destroying existing domains,
7218 * and it will not create the default domain. 7272 * and it will not create the default domain.
7219 * 7273 *
7220 * Call with hotplug lock held 7274 * Call with hotplug lock held
7221 */ 7275 */
7222 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 7276 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
7223 struct sched_domain_attr *dattr_new) 7277 struct sched_domain_attr *dattr_new)
7224 { 7278 {
7225 int i, j, n; 7279 int i, j, n;
7226 int new_topology; 7280 int new_topology;
7227 7281
7228 mutex_lock(&sched_domains_mutex); 7282 mutex_lock(&sched_domains_mutex);
7229 7283
7230 /* always unregister in case we don't destroy any domains */ 7284 /* always unregister in case we don't destroy any domains */
7231 unregister_sched_domain_sysctl(); 7285 unregister_sched_domain_sysctl();
7232 7286
7233 /* Let architecture update cpu core mappings. */ 7287 /* Let architecture update cpu core mappings. */
7234 new_topology = arch_update_cpu_topology(); 7288 new_topology = arch_update_cpu_topology();
7235 7289
7236 n = doms_new ? ndoms_new : 0; 7290 n = doms_new ? ndoms_new : 0;
7237 7291
7238 /* Destroy deleted domains */ 7292 /* Destroy deleted domains */
7239 for (i = 0; i < ndoms_cur; i++) { 7293 for (i = 0; i < ndoms_cur; i++) {
7240 for (j = 0; j < n && !new_topology; j++) { 7294 for (j = 0; j < n && !new_topology; j++) {
7241 if (cpumask_equal(doms_cur[i], doms_new[j]) 7295 if (cpumask_equal(doms_cur[i], doms_new[j])
7242 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7296 && dattrs_equal(dattr_cur, i, dattr_new, j))
7243 goto match1; 7297 goto match1;
7244 } 7298 }
7245 /* no match - a current sched domain not in new doms_new[] */ 7299 /* no match - a current sched domain not in new doms_new[] */
7246 detach_destroy_domains(doms_cur[i]); 7300 detach_destroy_domains(doms_cur[i]);
7247 match1: 7301 match1:
7248 ; 7302 ;
7249 } 7303 }
7250 7304
7251 if (doms_new == NULL) { 7305 if (doms_new == NULL) {
7252 ndoms_cur = 0; 7306 ndoms_cur = 0;
7253 doms_new = &fallback_doms; 7307 doms_new = &fallback_doms;
7254 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 7308 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
7255 WARN_ON_ONCE(dattr_new); 7309 WARN_ON_ONCE(dattr_new);
7256 } 7310 }
7257 7311
7258 /* Build new domains */ 7312 /* Build new domains */
7259 for (i = 0; i < ndoms_new; i++) { 7313 for (i = 0; i < ndoms_new; i++) {
7260 for (j = 0; j < ndoms_cur && !new_topology; j++) { 7314 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7261 if (cpumask_equal(doms_new[i], doms_cur[j]) 7315 if (cpumask_equal(doms_new[i], doms_cur[j])
7262 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7316 && dattrs_equal(dattr_new, i, dattr_cur, j))
7263 goto match2; 7317 goto match2;
7264 } 7318 }
7265 /* no match - add a new doms_new */ 7319 /* no match - add a new doms_new */
7266 __build_sched_domains(doms_new[i], 7320 __build_sched_domains(doms_new[i],
7267 dattr_new ? dattr_new + i : NULL); 7321 dattr_new ? dattr_new + i : NULL);
7268 match2: 7322 match2:
7269 ; 7323 ;
7270 } 7324 }
7271 7325
7272 /* Remember the new sched domains */ 7326 /* Remember the new sched domains */
7273 if (doms_cur != &fallback_doms) 7327 if (doms_cur != &fallback_doms)
7274 free_sched_domains(doms_cur, ndoms_cur); 7328 free_sched_domains(doms_cur, ndoms_cur);
7275 kfree(dattr_cur); /* kfree(NULL) is safe */ 7329 kfree(dattr_cur); /* kfree(NULL) is safe */
7276 doms_cur = doms_new; 7330 doms_cur = doms_new;
7277 dattr_cur = dattr_new; 7331 dattr_cur = dattr_new;
7278 ndoms_cur = ndoms_new; 7332 ndoms_cur = ndoms_new;
7279 7333
7280 register_sched_domain_sysctl(); 7334 register_sched_domain_sysctl();
7281 7335
7282 mutex_unlock(&sched_domains_mutex); 7336 mutex_unlock(&sched_domains_mutex);
7283 } 7337 }
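Tying the pieces together: the usual caller (the cpusets code in mainline) allocates the partition array, fills in one non-overlapping cpumask per sched domain and hands the array over, after which partition_sched_domains() owns it and will free it on the next repartition. A hedged sketch of such a caller, with the two input masks purely illustrative:

static void example_repartition(const struct cpumask *half_a,
				const struct cpumask *half_b)
{
	cpumask_var_t *doms = alloc_sched_domains(2);

	get_online_cpus();
	if (!doms) {
		/* NULL + ndoms == 1: fall back to one big domain */
		partition_sched_domains(1, NULL, NULL);
	} else {
		cpumask_copy(doms[0], half_a);
		cpumask_copy(doms[1], half_b);
		/* takes ownership of doms; do not free it here */
		partition_sched_domains(2, doms, NULL);
	}
	put_online_cpus();
}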
7284 7338
7285 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7339 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7286 static void arch_reinit_sched_domains(void) 7340 static void arch_reinit_sched_domains(void)
7287 { 7341 {
7288 get_online_cpus(); 7342 get_online_cpus();
7289 7343
7290 /* Destroy domains first to force the rebuild */ 7344 /* Destroy domains first to force the rebuild */
7291 partition_sched_domains(0, NULL, NULL); 7345 partition_sched_domains(0, NULL, NULL);
7292 7346
7293 rebuild_sched_domains(); 7347 rebuild_sched_domains();
7294 put_online_cpus(); 7348 put_online_cpus();
7295 } 7349 }
7296 7350
7297 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 7351 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7298 { 7352 {
7299 unsigned int level = 0; 7353 unsigned int level = 0;
7300 7354
7301 if (sscanf(buf, "%u", &level) != 1) 7355 if (sscanf(buf, "%u", &level) != 1)
7302 return -EINVAL; 7356 return -EINVAL;
7303 7357
7304 /* 7358 /*
7305 * level is always positive, so don't check for 7359 * level is always positive, so don't check for
7306 * level < POWERSAVINGS_BALANCE_NONE which is 0 7360 * level < POWERSAVINGS_BALANCE_NONE which is 0
7307 * What happens on a 0 or 1 byte write? 7361 * What happens on a 0 or 1 byte write?
7308 * Do we need to check count as well? 7362 * Do we need to check count as well?
7309 */ 7363 */
7310 7364
7311 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) 7365 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
7312 return -EINVAL; 7366 return -EINVAL;
7313 7367
7314 if (smt) 7368 if (smt)
7315 sched_smt_power_savings = level; 7369 sched_smt_power_savings = level;
7316 else 7370 else
7317 sched_mc_power_savings = level; 7371 sched_mc_power_savings = level;
7318 7372
7319 arch_reinit_sched_domains(); 7373 arch_reinit_sched_domains();
7320 7374
7321 return count; 7375 return count;
7322 } 7376 }
7323 7377
7324 #ifdef CONFIG_SCHED_MC 7378 #ifdef CONFIG_SCHED_MC
7325 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 7379 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7326 struct sysdev_class_attribute *attr, 7380 struct sysdev_class_attribute *attr,
7327 char *page) 7381 char *page)
7328 { 7382 {
7329 return sprintf(page, "%u\n", sched_mc_power_savings); 7383 return sprintf(page, "%u\n", sched_mc_power_savings);
7330 } 7384 }
7331 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 7385 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7332 struct sysdev_class_attribute *attr, 7386 struct sysdev_class_attribute *attr,
7333 const char *buf, size_t count) 7387 const char *buf, size_t count)
7334 { 7388 {
7335 return sched_power_savings_store(buf, count, 0); 7389 return sched_power_savings_store(buf, count, 0);
7336 } 7390 }
7337 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, 7391 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
7338 sched_mc_power_savings_show, 7392 sched_mc_power_savings_show,
7339 sched_mc_power_savings_store); 7393 sched_mc_power_savings_store);
7340 #endif 7394 #endif
7341 7395
7342 #ifdef CONFIG_SCHED_SMT 7396 #ifdef CONFIG_SCHED_SMT
7343 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 7397 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7344 struct sysdev_class_attribute *attr, 7398 struct sysdev_class_attribute *attr,
7345 char *page) 7399 char *page)
7346 { 7400 {
7347 return sprintf(page, "%u\n", sched_smt_power_savings); 7401 return sprintf(page, "%u\n", sched_smt_power_savings);
7348 } 7402 }
7349 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 7403 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7350 struct sysdev_class_attribute *attr, 7404 struct sysdev_class_attribute *attr,
7351 const char *buf, size_t count) 7405 const char *buf, size_t count)
7352 { 7406 {
7353 return sched_power_savings_store(buf, count, 1); 7407 return sched_power_savings_store(buf, count, 1);
7354 } 7408 }
7355 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, 7409 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7356 sched_smt_power_savings_show, 7410 sched_smt_power_savings_show,
7357 sched_smt_power_savings_store); 7411 sched_smt_power_savings_store);
7358 #endif 7412 #endif
7359 7413
7360 int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 7414 int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7361 { 7415 {
7362 int err = 0; 7416 int err = 0;
7363 7417
7364 #ifdef CONFIG_SCHED_SMT 7418 #ifdef CONFIG_SCHED_SMT
7365 if (smt_capable()) 7419 if (smt_capable())
7366 err = sysfs_create_file(&cls->kset.kobj, 7420 err = sysfs_create_file(&cls->kset.kobj,
7367 &attr_sched_smt_power_savings.attr); 7421 &attr_sched_smt_power_savings.attr);
7368 #endif 7422 #endif
7369 #ifdef CONFIG_SCHED_MC 7423 #ifdef CONFIG_SCHED_MC
7370 if (!err && mc_capable()) 7424 if (!err && mc_capable())
7371 err = sysfs_create_file(&cls->kset.kobj, 7425 err = sysfs_create_file(&cls->kset.kobj,
7372 &attr_sched_mc_power_savings.attr); 7426 &attr_sched_mc_power_savings.attr);
7373 #endif 7427 #endif
7374 return err; 7428 return err;
7375 } 7429 }
7376 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7430 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7377 7431
7378 #ifndef CONFIG_CPUSETS 7432 #ifndef CONFIG_CPUSETS
7379 /* 7433 /*
7380 * Add online and remove offline CPUs from the scheduler domains. 7434 * Add online and remove offline CPUs from the scheduler domains.
7381 * When cpusets are enabled they take over this function. 7435 * When cpusets are enabled they take over this function.
7382 */ 7436 */
7383 static int update_sched_domains(struct notifier_block *nfb, 7437 static int update_sched_domains(struct notifier_block *nfb,
7384 unsigned long action, void *hcpu) 7438 unsigned long action, void *hcpu)
7385 { 7439 {
7386 switch (action) { 7440 switch (action) {
7387 case CPU_ONLINE: 7441 case CPU_ONLINE:
7388 case CPU_ONLINE_FROZEN: 7442 case CPU_ONLINE_FROZEN:
7389 case CPU_DOWN_PREPARE: 7443 case CPU_DOWN_PREPARE:
7390 case CPU_DOWN_PREPARE_FROZEN: 7444 case CPU_DOWN_PREPARE_FROZEN:
7391 case CPU_DOWN_FAILED: 7445 case CPU_DOWN_FAILED:
7392 case CPU_DOWN_FAILED_FROZEN: 7446 case CPU_DOWN_FAILED_FROZEN:
7393 partition_sched_domains(1, NULL, NULL); 7447 partition_sched_domains(1, NULL, NULL);
7394 return NOTIFY_OK; 7448 return NOTIFY_OK;
7395 7449
7396 default: 7450 default:
7397 return NOTIFY_DONE; 7451 return NOTIFY_DONE;
7398 } 7452 }
7399 } 7453 }
7400 #endif 7454 #endif
7401 7455
7402 static int update_runtime(struct notifier_block *nfb, 7456 static int update_runtime(struct notifier_block *nfb,
7403 unsigned long action, void *hcpu) 7457 unsigned long action, void *hcpu)
7404 { 7458 {
7405 int cpu = (int)(long)hcpu; 7459 int cpu = (int)(long)hcpu;
7406 7460
7407 switch (action) { 7461 switch (action) {
7408 case CPU_DOWN_PREPARE: 7462 case CPU_DOWN_PREPARE:
7409 case CPU_DOWN_PREPARE_FROZEN: 7463 case CPU_DOWN_PREPARE_FROZEN:
7410 disable_runtime(cpu_rq(cpu)); 7464 disable_runtime(cpu_rq(cpu));
7411 return NOTIFY_OK; 7465 return NOTIFY_OK;
7412 7466
7413 case CPU_DOWN_FAILED: 7467 case CPU_DOWN_FAILED:
7414 case CPU_DOWN_FAILED_FROZEN: 7468 case CPU_DOWN_FAILED_FROZEN:
7415 case CPU_ONLINE: 7469 case CPU_ONLINE:
7416 case CPU_ONLINE_FROZEN: 7470 case CPU_ONLINE_FROZEN:
7417 enable_runtime(cpu_rq(cpu)); 7471 enable_runtime(cpu_rq(cpu));
7418 return NOTIFY_OK; 7472 return NOTIFY_OK;
7419 7473
7420 default: 7474 default:
7421 return NOTIFY_DONE; 7475 return NOTIFY_DONE;
7422 } 7476 }
7423 } 7477 }
7424 7478
7425 void __init sched_init_smp(void) 7479 void __init sched_init_smp(void)
7426 { 7480 {
7427 cpumask_var_t non_isolated_cpus; 7481 cpumask_var_t non_isolated_cpus;
7428 7482
7429 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7483 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7430 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7484 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7431 7485
7432 #if defined(CONFIG_NUMA) 7486 #if defined(CONFIG_NUMA)
7433 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 7487 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7434 GFP_KERNEL); 7488 GFP_KERNEL);
7435 BUG_ON(sched_group_nodes_bycpu == NULL); 7489 BUG_ON(sched_group_nodes_bycpu == NULL);
7436 #endif 7490 #endif
7437 get_online_cpus(); 7491 get_online_cpus();
7438 mutex_lock(&sched_domains_mutex); 7492 mutex_lock(&sched_domains_mutex);
7439 arch_init_sched_domains(cpu_active_mask); 7493 arch_init_sched_domains(cpu_active_mask);
7440 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7494 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7441 if (cpumask_empty(non_isolated_cpus)) 7495 if (cpumask_empty(non_isolated_cpus))
7442 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7496 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
7443 mutex_unlock(&sched_domains_mutex); 7497 mutex_unlock(&sched_domains_mutex);
7444 put_online_cpus(); 7498 put_online_cpus();
7445 7499
7446 #ifndef CONFIG_CPUSETS 7500 #ifndef CONFIG_CPUSETS
7447 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7501 /* XXX: Theoretical race here - CPU may be hotplugged now */
7448 hotcpu_notifier(update_sched_domains, 0); 7502 hotcpu_notifier(update_sched_domains, 0);
7449 #endif 7503 #endif
7450 7504
7451 /* RT runtime code needs to handle some hotplug events */ 7505 /* RT runtime code needs to handle some hotplug events */
7452 hotcpu_notifier(update_runtime, 0); 7506 hotcpu_notifier(update_runtime, 0);
7453 7507
7454 init_hrtick(); 7508 init_hrtick();
7455 7509
7456 /* Move init over to a non-isolated CPU */ 7510 /* Move init over to a non-isolated CPU */
7457 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 7511 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
7458 BUG(); 7512 BUG();
7459 sched_init_granularity(); 7513 sched_init_granularity();
7460 free_cpumask_var(non_isolated_cpus); 7514 free_cpumask_var(non_isolated_cpus);
7461 7515
7462 init_sched_rt_class(); 7516 init_sched_rt_class();
7463 } 7517 }
7464 #else 7518 #else
7465 void __init sched_init_smp(void) 7519 void __init sched_init_smp(void)
7466 { 7520 {
7467 sched_init_granularity(); 7521 sched_init_granularity();
7468 } 7522 }
7469 #endif /* CONFIG_SMP */ 7523 #endif /* CONFIG_SMP */
7470 7524
7471 const_debug unsigned int sysctl_timer_migration = 1; 7525 const_debug unsigned int sysctl_timer_migration = 1;
7472 7526
7473 int in_sched_functions(unsigned long addr) 7527 int in_sched_functions(unsigned long addr)
7474 { 7528 {
7475 return in_lock_functions(addr) || 7529 return in_lock_functions(addr) ||
7476 (addr >= (unsigned long)__sched_text_start 7530 (addr >= (unsigned long)__sched_text_start
7477 && addr < (unsigned long)__sched_text_end); 7531 && addr < (unsigned long)__sched_text_end);
7478 } 7532 }
7479 7533
7480 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 7534 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7481 { 7535 {
7482 cfs_rq->tasks_timeline = RB_ROOT; 7536 cfs_rq->tasks_timeline = RB_ROOT;
7483 INIT_LIST_HEAD(&cfs_rq->tasks); 7537 INIT_LIST_HEAD(&cfs_rq->tasks);
7484 #ifdef CONFIG_FAIR_GROUP_SCHED 7538 #ifdef CONFIG_FAIR_GROUP_SCHED
7485 cfs_rq->rq = rq; 7539 cfs_rq->rq = rq;
7486 #endif 7540 #endif
7487 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7541 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7488 } 7542 }
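Starting min_vruntime roughly a millisecond's worth of nanoseconds below the u64 wrap point is deliberate: any vruntime comparison that is not wrap-safe goes wrong almost immediately after boot instead of months later. That is why the fair-class code compares vruntimes through a signed difference, along the lines of entity_before() in sched_fair.c; a sketch of the idiom:

/* Wrap-safe ordering of two monotonically increasing u64 vruntimes. */
static inline int vruntime_before(u64 a, u64 b)
{
	return (s64)(a - b) < 0;	/* valid even when a or b has wrapped */
}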
7489 7543
7490 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 7544 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7491 { 7545 {
7492 struct rt_prio_array *array; 7546 struct rt_prio_array *array;
7493 int i; 7547 int i;
7494 7548
7495 array = &rt_rq->active; 7549 array = &rt_rq->active;
7496 for (i = 0; i < MAX_RT_PRIO; i++) { 7550 for (i = 0; i < MAX_RT_PRIO; i++) {
7497 INIT_LIST_HEAD(array->queue + i); 7551 INIT_LIST_HEAD(array->queue + i);
7498 __clear_bit(i, array->bitmap); 7552 __clear_bit(i, array->bitmap);
7499 } 7553 }
7500 /* delimiter for bitsearch: */ 7554 /* delimiter for bitsearch: */
7501 __set_bit(MAX_RT_PRIO, array->bitmap); 7555 __set_bit(MAX_RT_PRIO, array->bitmap);
7502 7556
7503 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 7557 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
7504 rt_rq->highest_prio.curr = MAX_RT_PRIO; 7558 rt_rq->highest_prio.curr = MAX_RT_PRIO;
7505 #ifdef CONFIG_SMP 7559 #ifdef CONFIG_SMP
7506 rt_rq->highest_prio.next = MAX_RT_PRIO; 7560 rt_rq->highest_prio.next = MAX_RT_PRIO;
7507 #endif 7561 #endif
7508 #endif 7562 #endif
7509 #ifdef CONFIG_SMP 7563 #ifdef CONFIG_SMP
7510 rt_rq->rt_nr_migratory = 0; 7564 rt_rq->rt_nr_migratory = 0;
7511 rt_rq->overloaded = 0; 7565 rt_rq->overloaded = 0;
7512 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); 7566 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
7513 #endif 7567 #endif
7514 7568
7515 rt_rq->rt_time = 0; 7569 rt_rq->rt_time = 0;
7516 rt_rq->rt_throttled = 0; 7570 rt_rq->rt_throttled = 0;
7517 rt_rq->rt_runtime = 0; 7571 rt_rq->rt_runtime = 0;
7518 raw_spin_lock_init(&rt_rq->rt_runtime_lock); 7572 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
7519 7573
7520 #ifdef CONFIG_RT_GROUP_SCHED 7574 #ifdef CONFIG_RT_GROUP_SCHED
7521 rt_rq->rt_nr_boosted = 0; 7575 rt_rq->rt_nr_boosted = 0;
7522 rt_rq->rq = rq; 7576 rt_rq->rq = rq;
7523 #endif 7577 #endif
7524 } 7578 }
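The extra bit set at MAX_RT_PRIO is a sentinel for the priority bitmap: a first-set-bit search that lands on it means no RT task is queued, so the pick path needs no separate emptiness check. A hedged sketch of that lookup (the real version lives in sched_rt.c and uses sched_find_first_bit() the same way):

static struct sched_rt_entity *peek_next_rt_entity(struct rt_rq *rt_rq)
{
	struct rt_prio_array *array = &rt_rq->active;
	int idx = sched_find_first_bit(array->bitmap);

	if (idx >= MAX_RT_PRIO)		/* only the delimiter bit is set */
		return NULL;

	return list_first_entry(array->queue + idx,
				struct sched_rt_entity, run_list);
}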
7525 7579
7526 #ifdef CONFIG_FAIR_GROUP_SCHED 7580 #ifdef CONFIG_FAIR_GROUP_SCHED
7527 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 7581 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7528 struct sched_entity *se, int cpu, int add, 7582 struct sched_entity *se, int cpu, int add,
7529 struct sched_entity *parent) 7583 struct sched_entity *parent)
7530 { 7584 {
7531 struct rq *rq = cpu_rq(cpu); 7585 struct rq *rq = cpu_rq(cpu);
7532 tg->cfs_rq[cpu] = cfs_rq; 7586 tg->cfs_rq[cpu] = cfs_rq;
7533 init_cfs_rq(cfs_rq, rq); 7587 init_cfs_rq(cfs_rq, rq);
7534 cfs_rq->tg = tg; 7588 cfs_rq->tg = tg;
7535 if (add) 7589 if (add)
7536 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 7590 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7537 7591
7538 tg->se[cpu] = se; 7592 tg->se[cpu] = se;
7539 /* se could be NULL for init_task_group */ 7593 /* se could be NULL for init_task_group */
7540 if (!se) 7594 if (!se)
7541 return; 7595 return;
7542 7596
7543 if (!parent) 7597 if (!parent)
7544 se->cfs_rq = &rq->cfs; 7598 se->cfs_rq = &rq->cfs;
7545 else 7599 else
7546 se->cfs_rq = parent->my_q; 7600 se->cfs_rq = parent->my_q;
7547 7601
7548 se->my_q = cfs_rq; 7602 se->my_q = cfs_rq;
7549 se->load.weight = tg->shares; 7603 se->load.weight = tg->shares;
7550 se->load.inv_weight = 0; 7604 se->load.inv_weight = 0;
7551 se->parent = parent; 7605 se->parent = parent;
7552 } 7606 }
7553 #endif 7607 #endif
7554 7608
7555 #ifdef CONFIG_RT_GROUP_SCHED 7609 #ifdef CONFIG_RT_GROUP_SCHED
7556 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 7610 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7557 struct sched_rt_entity *rt_se, int cpu, int add, 7611 struct sched_rt_entity *rt_se, int cpu, int add,
7558 struct sched_rt_entity *parent) 7612 struct sched_rt_entity *parent)
7559 { 7613 {
7560 struct rq *rq = cpu_rq(cpu); 7614 struct rq *rq = cpu_rq(cpu);
7561 7615
7562 tg->rt_rq[cpu] = rt_rq; 7616 tg->rt_rq[cpu] = rt_rq;
7563 init_rt_rq(rt_rq, rq); 7617 init_rt_rq(rt_rq, rq);
7564 rt_rq->tg = tg; 7618 rt_rq->tg = tg;
7565 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7619 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7566 if (add) 7620 if (add)
7567 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7621 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7568 7622
7569 tg->rt_se[cpu] = rt_se; 7623 tg->rt_se[cpu] = rt_se;
7570 if (!rt_se) 7624 if (!rt_se)
7571 return; 7625 return;
7572 7626
7573 if (!parent) 7627 if (!parent)
7574 rt_se->rt_rq = &rq->rt; 7628 rt_se->rt_rq = &rq->rt;
7575 else 7629 else
7576 rt_se->rt_rq = parent->my_q; 7630 rt_se->rt_rq = parent->my_q;
7577 7631
7578 rt_se->my_q = rt_rq; 7632 rt_se->my_q = rt_rq;
7579 rt_se->parent = parent; 7633 rt_se->parent = parent;
7580 INIT_LIST_HEAD(&rt_se->run_list); 7634 INIT_LIST_HEAD(&rt_se->run_list);
7581 } 7635 }
7582 #endif 7636 #endif
7583 7637
7584 void __init sched_init(void) 7638 void __init sched_init(void)
7585 { 7639 {
7586 int i, j; 7640 int i, j;
7587 unsigned long alloc_size = 0, ptr; 7641 unsigned long alloc_size = 0, ptr;
7588 7642
7589 #ifdef CONFIG_FAIR_GROUP_SCHED 7643 #ifdef CONFIG_FAIR_GROUP_SCHED
7590 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7644 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7591 #endif 7645 #endif
7592 #ifdef CONFIG_RT_GROUP_SCHED 7646 #ifdef CONFIG_RT_GROUP_SCHED
7593 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7647 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7594 #endif 7648 #endif
7595 #ifdef CONFIG_CPUMASK_OFFSTACK 7649 #ifdef CONFIG_CPUMASK_OFFSTACK
7596 alloc_size += num_possible_cpus() * cpumask_size(); 7650 alloc_size += num_possible_cpus() * cpumask_size();
7597 #endif 7651 #endif
7598 if (alloc_size) { 7652 if (alloc_size) {
7599 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7653 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7600 7654
7601 #ifdef CONFIG_FAIR_GROUP_SCHED 7655 #ifdef CONFIG_FAIR_GROUP_SCHED
7602 init_task_group.se = (struct sched_entity **)ptr; 7656 init_task_group.se = (struct sched_entity **)ptr;
7603 ptr += nr_cpu_ids * sizeof(void **); 7657 ptr += nr_cpu_ids * sizeof(void **);
7604 7658
7605 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7659 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
7606 ptr += nr_cpu_ids * sizeof(void **); 7660 ptr += nr_cpu_ids * sizeof(void **);
7607 7661
7608 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7662 #endif /* CONFIG_FAIR_GROUP_SCHED */
7609 #ifdef CONFIG_RT_GROUP_SCHED 7663 #ifdef CONFIG_RT_GROUP_SCHED
7610 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7664 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
7611 ptr += nr_cpu_ids * sizeof(void **); 7665 ptr += nr_cpu_ids * sizeof(void **);
7612 7666
7613 init_task_group.rt_rq = (struct rt_rq **)ptr; 7667 init_task_group.rt_rq = (struct rt_rq **)ptr;
7614 ptr += nr_cpu_ids * sizeof(void **); 7668 ptr += nr_cpu_ids * sizeof(void **);
7615 7669
7616 #endif /* CONFIG_RT_GROUP_SCHED */ 7670 #endif /* CONFIG_RT_GROUP_SCHED */
7617 #ifdef CONFIG_CPUMASK_OFFSTACK 7671 #ifdef CONFIG_CPUMASK_OFFSTACK
7618 for_each_possible_cpu(i) { 7672 for_each_possible_cpu(i) {
7619 per_cpu(load_balance_tmpmask, i) = (void *)ptr; 7673 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
7620 ptr += cpumask_size(); 7674 ptr += cpumask_size();
7621 } 7675 }
7622 #endif /* CONFIG_CPUMASK_OFFSTACK */ 7676 #endif /* CONFIG_CPUMASK_OFFSTACK */
7623 } 7677 }
7624 7678
7625 #ifdef CONFIG_SMP 7679 #ifdef CONFIG_SMP
7626 init_defrootdomain(); 7680 init_defrootdomain();
7627 #endif 7681 #endif
7628 7682
7629 init_rt_bandwidth(&def_rt_bandwidth, 7683 init_rt_bandwidth(&def_rt_bandwidth,
7630 global_rt_period(), global_rt_runtime()); 7684 global_rt_period(), global_rt_runtime());
7631 7685
7632 #ifdef CONFIG_RT_GROUP_SCHED 7686 #ifdef CONFIG_RT_GROUP_SCHED
7633 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7687 init_rt_bandwidth(&init_task_group.rt_bandwidth,
7634 global_rt_period(), global_rt_runtime()); 7688 global_rt_period(), global_rt_runtime());
7635 #endif /* CONFIG_RT_GROUP_SCHED */ 7689 #endif /* CONFIG_RT_GROUP_SCHED */
7636 7690
7637 #ifdef CONFIG_CGROUP_SCHED 7691 #ifdef CONFIG_CGROUP_SCHED
7638 list_add(&init_task_group.list, &task_groups); 7692 list_add(&init_task_group.list, &task_groups);
7639 INIT_LIST_HEAD(&init_task_group.children); 7693 INIT_LIST_HEAD(&init_task_group.children);
7640 7694
7641 #endif /* CONFIG_CGROUP_SCHED */ 7695 #endif /* CONFIG_CGROUP_SCHED */
7642 7696
7643 #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7697 #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7644 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7698 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
7645 __alignof__(unsigned long)); 7699 __alignof__(unsigned long));
7646 #endif 7700 #endif
7647 for_each_possible_cpu(i) { 7701 for_each_possible_cpu(i) {
7648 struct rq *rq; 7702 struct rq *rq;
7649 7703
7650 rq = cpu_rq(i); 7704 rq = cpu_rq(i);
7651 raw_spin_lock_init(&rq->lock); 7705 raw_spin_lock_init(&rq->lock);
7652 rq->nr_running = 0; 7706 rq->nr_running = 0;
7653 rq->calc_load_active = 0; 7707 rq->calc_load_active = 0;
7654 rq->calc_load_update = jiffies + LOAD_FREQ; 7708 rq->calc_load_update = jiffies + LOAD_FREQ;
7655 init_cfs_rq(&rq->cfs, rq); 7709 init_cfs_rq(&rq->cfs, rq);
7656 init_rt_rq(&rq->rt, rq); 7710 init_rt_rq(&rq->rt, rq);
7657 #ifdef CONFIG_FAIR_GROUP_SCHED 7711 #ifdef CONFIG_FAIR_GROUP_SCHED
7658 init_task_group.shares = init_task_group_load; 7712 init_task_group.shares = init_task_group_load;
7659 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7713 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7660 #ifdef CONFIG_CGROUP_SCHED 7714 #ifdef CONFIG_CGROUP_SCHED
7661 /* 7715 /*
7662 * How much cpu bandwidth does init_task_group get? 7716 * How much cpu bandwidth does init_task_group get?
7663 * 7717 *
7664 * In case of task-groups formed through the cgroup filesystem, it 7718 * In case of task-groups formed through the cgroup filesystem, it
7665 * gets 100% of the cpu resources in the system. This overall 7719 * gets 100% of the cpu resources in the system. This overall
7666 * system cpu resource is divided among the tasks of 7720 * system cpu resource is divided among the tasks of
7667 * init_task_group and its child task-groups in a fair manner, 7721 * init_task_group and its child task-groups in a fair manner,
7668 * based on each entity's (task or task-group's) weight 7722 * based on each entity's (task or task-group's) weight
7669 * (se->load.weight). 7723 * (se->load.weight).
7670 * 7724 *
7671 * In other words, if init_task_group has 10 tasks (each of weight 7725 * In other words, if init_task_group has 10 tasks (each of weight
7672 * 1024) and two child groups A0 and A1 (of weight 1024 each), 7726 * 1024) and two child groups A0 and A1 (of weight 1024 each),
7673 * then A0's share of the cpu resource is: 7727 * then A0's share of the cpu resource is:
7674 * 7728 *
7675 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 7729 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
7676 * 7730 *
7677 * We achieve this by letting init_task_group's tasks sit 7731 * We achieve this by letting init_task_group's tasks sit
7678 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7732 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
7679 */ 7733 */
7680 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7734 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
7681 #endif 7735 #endif
7682 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7736 #endif /* CONFIG_FAIR_GROUP_SCHED */
7683 7737
7684 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7738 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7685 #ifdef CONFIG_RT_GROUP_SCHED 7739 #ifdef CONFIG_RT_GROUP_SCHED
7686 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7740 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7687 #ifdef CONFIG_CGROUP_SCHED 7741 #ifdef CONFIG_CGROUP_SCHED
7688 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7742 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
7689 #endif 7743 #endif
7690 #endif 7744 #endif
7691 7745
7692 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7746 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7693 rq->cpu_load[j] = 0; 7747 rq->cpu_load[j] = 0;
7694 #ifdef CONFIG_SMP 7748 #ifdef CONFIG_SMP
7695 rq->sd = NULL; 7749 rq->sd = NULL;
7696 rq->rd = NULL; 7750 rq->rd = NULL;
7697 rq->post_schedule = 0; 7751 rq->post_schedule = 0;
7698 rq->active_balance = 0; 7752 rq->active_balance = 0;
7699 rq->next_balance = jiffies; 7753 rq->next_balance = jiffies;
7700 rq->push_cpu = 0; 7754 rq->push_cpu = 0;
7701 rq->cpu = i; 7755 rq->cpu = i;
7702 rq->online = 0; 7756 rq->online = 0;
7703 rq->migration_thread = NULL; 7757 rq->migration_thread = NULL;
7704 rq->idle_stamp = 0; 7758 rq->idle_stamp = 0;
7705 rq->avg_idle = 2*sysctl_sched_migration_cost; 7759 rq->avg_idle = 2*sysctl_sched_migration_cost;
7706 INIT_LIST_HEAD(&rq->migration_queue); 7760 INIT_LIST_HEAD(&rq->migration_queue);
7707 rq_attach_root(rq, &def_root_domain); 7761 rq_attach_root(rq, &def_root_domain);
7708 #endif 7762 #endif
7709 init_rq_hrtick(rq); 7763 init_rq_hrtick(rq);
7710 atomic_set(&rq->nr_iowait, 0); 7764 atomic_set(&rq->nr_iowait, 0);
7711 } 7765 }
7712 7766
7713 set_load_weight(&init_task); 7767 set_load_weight(&init_task);
7714 7768
7715 #ifdef CONFIG_PREEMPT_NOTIFIERS 7769 #ifdef CONFIG_PREEMPT_NOTIFIERS
7716 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 7770 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
7717 #endif 7771 #endif
7718 7772
7719 #ifdef CONFIG_SMP 7773 #ifdef CONFIG_SMP
7720 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 7774 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
7721 #endif 7775 #endif
7722 7776
7723 #ifdef CONFIG_RT_MUTEXES 7777 #ifdef CONFIG_RT_MUTEXES
7724 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); 7778 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
7725 #endif 7779 #endif
7726 7780
7727 /* 7781 /*
7728 * The boot idle thread does lazy MMU switching as well: 7782 * The boot idle thread does lazy MMU switching as well:
7729 */ 7783 */
7730 atomic_inc(&init_mm.mm_count); 7784 atomic_inc(&init_mm.mm_count);
7731 enter_lazy_tlb(&init_mm, current); 7785 enter_lazy_tlb(&init_mm, current);
7732 7786
7733 /* 7787 /*
7734 * Make us the idle thread. Technically, schedule() should not be 7788 * Make us the idle thread. Technically, schedule() should not be
7735 * called from this thread, however somewhere below it might be, 7789 * called from this thread, however somewhere below it might be,
7736 * but because we are the idle thread, we just pick up running again 7790 * but because we are the idle thread, we just pick up running again
7737 * when this runqueue becomes "idle". 7791 * when this runqueue becomes "idle".
7738 */ 7792 */
7739 init_idle(current, smp_processor_id()); 7793 init_idle(current, smp_processor_id());
7740 7794
7741 calc_load_update = jiffies + LOAD_FREQ; 7795 calc_load_update = jiffies + LOAD_FREQ;
7742 7796
7743 /* 7797 /*
7744 * During early bootup we pretend to be a normal task: 7798 * During early bootup we pretend to be a normal task:
7745 */ 7799 */
7746 current->sched_class = &fair_sched_class; 7800 current->sched_class = &fair_sched_class;
7747 7801
7748 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 7802 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
7749 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7803 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
7750 #ifdef CONFIG_SMP 7804 #ifdef CONFIG_SMP
7751 #ifdef CONFIG_NO_HZ 7805 #ifdef CONFIG_NO_HZ
7752 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 7806 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
7753 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 7807 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
7754 #endif 7808 #endif
7755 /* May be allocated at isolcpus cmdline parse time */ 7809 /* May be allocated at isolcpus cmdline parse time */
7756 if (cpu_isolated_map == NULL) 7810 if (cpu_isolated_map == NULL)
7757 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7811 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7758 #endif /* SMP */ 7812 #endif /* SMP */
7759 7813
7760 perf_event_init(); 7814 perf_event_init();
7761 7815
7762 scheduler_running = 1; 7816 scheduler_running = 1;
7763 } 7817 }
7764 7818
7765 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 7819 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
7766 static inline int preempt_count_equals(int preempt_offset) 7820 static inline int preempt_count_equals(int preempt_offset)
7767 { 7821 {
7768 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 7822 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
7769 7823
7770 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7824 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
7771 } 7825 }
7772 7826
7773 void __might_sleep(const char *file, int line, int preempt_offset) 7827 void __might_sleep(const char *file, int line, int preempt_offset)
7774 { 7828 {
7775 #ifdef in_atomic 7829 #ifdef in_atomic
7776 static unsigned long prev_jiffy; /* ratelimiting */ 7830 static unsigned long prev_jiffy; /* ratelimiting */
7777 7831
7778 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 7832 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
7779 system_state != SYSTEM_RUNNING || oops_in_progress) 7833 system_state != SYSTEM_RUNNING || oops_in_progress)
7780 return; 7834 return;
7781 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 7835 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7782 return; 7836 return;
7783 prev_jiffy = jiffies; 7837 prev_jiffy = jiffies;
7784 7838
7785 printk(KERN_ERR 7839 printk(KERN_ERR
7786 "BUG: sleeping function called from invalid context at %s:%d\n", 7840 "BUG: sleeping function called from invalid context at %s:%d\n",
7787 file, line); 7841 file, line);
7788 printk(KERN_ERR 7842 printk(KERN_ERR
7789 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 7843 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7790 in_atomic(), irqs_disabled(), 7844 in_atomic(), irqs_disabled(),
7791 current->pid, current->comm); 7845 current->pid, current->comm);
7792 7846
7793 debug_show_held_locks(current); 7847 debug_show_held_locks(current);
7794 if (irqs_disabled()) 7848 if (irqs_disabled())
7795 print_irqtrace_events(current); 7849 print_irqtrace_events(current);
7796 dump_stack(); 7850 dump_stack();
7797 #endif 7851 #endif
7798 } 7852 }
7799 EXPORT_SYMBOL(__might_sleep); 7853 EXPORT_SYMBOL(__might_sleep);
7800 #endif 7854 #endif
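Callers normally reach __might_sleep() through the might_sleep() macro, which (with CONFIG_DEBUG_SPINLOCK_SLEEP enabled) expands to __might_sleep(__FILE__, __LINE__, 0) plus a voluntary preemption point. Placing it at the top of any function that may block turns "sleeping while atomic" mistakes into the rate-limited report above. A minimal, hypothetical example:

/* Hypothetical helper; needs <linux/kernel.h> and <linux/slab.h>. */
static void *example_alloc_buffer(size_t size)
{
	might_sleep();		/* complains if called from atomic context */

	/* GFP_KERNEL allocations may themselves sleep */
	return kmalloc(size, GFP_KERNEL);
}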
7801 7855
7802 #ifdef CONFIG_MAGIC_SYSRQ 7856 #ifdef CONFIG_MAGIC_SYSRQ
7803 static void normalize_task(struct rq *rq, struct task_struct *p) 7857 static void normalize_task(struct rq *rq, struct task_struct *p)
7804 { 7858 {
7805 int on_rq; 7859 int on_rq;
7806 7860
7807 on_rq = p->se.on_rq; 7861 on_rq = p->se.on_rq;
7808 if (on_rq) 7862 if (on_rq)
7809 deactivate_task(rq, p, 0); 7863 deactivate_task(rq, p, 0);
7810 __setscheduler(rq, p, SCHED_NORMAL, 0); 7864 __setscheduler(rq, p, SCHED_NORMAL, 0);
7811 if (on_rq) { 7865 if (on_rq) {
7812 activate_task(rq, p, 0); 7866 activate_task(rq, p, 0);
7813 resched_task(rq->curr); 7867 resched_task(rq->curr);
7814 } 7868 }
7815 } 7869 }
7816 7870
7817 void normalize_rt_tasks(void) 7871 void normalize_rt_tasks(void)
7818 { 7872 {
7819 struct task_struct *g, *p; 7873 struct task_struct *g, *p;
7820 unsigned long flags; 7874 unsigned long flags;
7821 struct rq *rq; 7875 struct rq *rq;
7822 7876
7823 read_lock_irqsave(&tasklist_lock, flags); 7877 read_lock_irqsave(&tasklist_lock, flags);
7824 do_each_thread(g, p) { 7878 do_each_thread(g, p) {
7825 /* 7879 /*
7826 * Only normalize user tasks: 7880 * Only normalize user tasks:
7827 */ 7881 */
7828 if (!p->mm) 7882 if (!p->mm)
7829 continue; 7883 continue;
7830 7884
7831 p->se.exec_start = 0; 7885 p->se.exec_start = 0;
7832 #ifdef CONFIG_SCHEDSTATS 7886 #ifdef CONFIG_SCHEDSTATS
7833 p->se.statistics.wait_start = 0; 7887 p->se.statistics.wait_start = 0;
7834 p->se.statistics.sleep_start = 0; 7888 p->se.statistics.sleep_start = 0;
7835 p->se.statistics.block_start = 0; 7889 p->se.statistics.block_start = 0;
7836 #endif 7890 #endif
7837 7891
7838 if (!rt_task(p)) { 7892 if (!rt_task(p)) {
7839 /* 7893 /*
7840 * Renice negative nice level userspace 7894 * Renice negative nice level userspace
7841 * tasks back to 0: 7895 * tasks back to 0:
7842 */ 7896 */
7843 if (TASK_NICE(p) < 0 && p->mm) 7897 if (TASK_NICE(p) < 0 && p->mm)
7844 set_user_nice(p, 0); 7898 set_user_nice(p, 0);
7845 continue; 7899 continue;
7846 } 7900 }
7847 7901
7848 raw_spin_lock(&p->pi_lock); 7902 raw_spin_lock(&p->pi_lock);
7849 rq = __task_rq_lock(p); 7903 rq = __task_rq_lock(p);
7850 7904
7851 normalize_task(rq, p); 7905 normalize_task(rq, p);
7852 7906
7853 __task_rq_unlock(rq); 7907 __task_rq_unlock(rq);
7854 raw_spin_unlock(&p->pi_lock); 7908 raw_spin_unlock(&p->pi_lock);
7855 } while_each_thread(g, p); 7909 } while_each_thread(g, p);
7856 7910
7857 read_unlock_irqrestore(&tasklist_lock, flags); 7911 read_unlock_irqrestore(&tasklist_lock, flags);
7858 } 7912 }
7859 7913
7860 #endif /* CONFIG_MAGIC_SYSRQ */ 7914 #endif /* CONFIG_MAGIC_SYSRQ */
7861 7915
7862 #ifdef CONFIG_IA64 7916 #ifdef CONFIG_IA64
7863 /* 7917 /*
7864 * These functions are only useful for the IA64 MCA handling. 7918 * These functions are only useful for the IA64 MCA handling.
7865 * 7919 *
7866 * They can only be called when the whole system has been 7920 * They can only be called when the whole system has been
7867 * stopped - every CPU needs to be quiescent, and no scheduling 7921 * stopped - every CPU needs to be quiescent, and no scheduling
7868 * activity can take place. Using them for anything else would 7922 * activity can take place. Using them for anything else would
7869 * be a serious bug, and as a result, they aren't even visible 7923 * be a serious bug, and as a result, they aren't even visible
7870 * under any other configuration. 7924 * under any other configuration.
7871 */ 7925 */
7872 7926
7873 /** 7927 /**
7874 * curr_task - return the current task for a given cpu. 7928 * curr_task - return the current task for a given cpu.
7875 * @cpu: the processor in question. 7929 * @cpu: the processor in question.
7876 * 7930 *
7877 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7931 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7878 */ 7932 */
7879 struct task_struct *curr_task(int cpu) 7933 struct task_struct *curr_task(int cpu)
7880 { 7934 {
7881 return cpu_curr(cpu); 7935 return cpu_curr(cpu);
7882 } 7936 }
7883 7937
7884 /** 7938 /**
7885 * set_curr_task - set the current task for a given cpu. 7939 * set_curr_task - set the current task for a given cpu.
7886 * @cpu: the processor in question. 7940 * @cpu: the processor in question.
7887 * @p: the task pointer to set. 7941 * @p: the task pointer to set.
7888 * 7942 *
7889 * Description: This function must only be used when non-maskable interrupts 7943 * Description: This function must only be used when non-maskable interrupts
7890 * are serviced on a separate stack. It allows the architecture to switch the 7944 * are serviced on a separate stack. It allows the architecture to switch the
7891 * notion of the current task on a cpu in a non-blocking manner. This function 7945 * notion of the current task on a cpu in a non-blocking manner. This function
7892 * must be called with all CPUs synchronized and interrupts disabled, and the 7946 * must be called with all CPUs synchronized and interrupts disabled, and the
7893 * caller must save the original value of the current task (see 7947 * caller must save the original value of the current task (see
7894 * curr_task() above) and restore that value before reenabling interrupts and 7948 * curr_task() above) and restore that value before reenabling interrupts and
7895 * re-starting the system. 7949 * re-starting the system.
7896 * 7950 *
7897 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7951 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7898 */ 7952 */
7899 void set_curr_task(int cpu, struct task_struct *p) 7953 void set_curr_task(int cpu, struct task_struct *p)
7900 { 7954 {
7901 cpu_curr(cpu) = p; 7955 cpu_curr(cpu) = p;
7902 } 7956 }
7903 7957
7904 #endif 7958 #endif
7905 7959
7906 #ifdef CONFIG_FAIR_GROUP_SCHED 7960 #ifdef CONFIG_FAIR_GROUP_SCHED
7907 static void free_fair_sched_group(struct task_group *tg) 7961 static void free_fair_sched_group(struct task_group *tg)
7908 { 7962 {
7909 int i; 7963 int i;
7910 7964
7911 for_each_possible_cpu(i) { 7965 for_each_possible_cpu(i) {
7912 if (tg->cfs_rq) 7966 if (tg->cfs_rq)
7913 kfree(tg->cfs_rq[i]); 7967 kfree(tg->cfs_rq[i]);
7914 if (tg->se) 7968 if (tg->se)
7915 kfree(tg->se[i]); 7969 kfree(tg->se[i]);
7916 } 7970 }
7917 7971
7918 kfree(tg->cfs_rq); 7972 kfree(tg->cfs_rq);
7919 kfree(tg->se); 7973 kfree(tg->se);
7920 } 7974 }
7921 7975
7922 static 7976 static
7923 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 7977 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7924 { 7978 {
7925 struct cfs_rq *cfs_rq; 7979 struct cfs_rq *cfs_rq;
7926 struct sched_entity *se; 7980 struct sched_entity *se;
7927 struct rq *rq; 7981 struct rq *rq;
7928 int i; 7982 int i;
7929 7983
7930 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 7984 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
7931 if (!tg->cfs_rq) 7985 if (!tg->cfs_rq)
7932 goto err; 7986 goto err;
7933 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); 7987 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
7934 if (!tg->se) 7988 if (!tg->se)
7935 goto err; 7989 goto err;
7936 7990
7937 tg->shares = NICE_0_LOAD; 7991 tg->shares = NICE_0_LOAD;
7938 7992
7939 for_each_possible_cpu(i) { 7993 for_each_possible_cpu(i) {
7940 rq = cpu_rq(i); 7994 rq = cpu_rq(i);
7941 7995
7942 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 7996 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
7943 GFP_KERNEL, cpu_to_node(i)); 7997 GFP_KERNEL, cpu_to_node(i));
7944 if (!cfs_rq) 7998 if (!cfs_rq)
7945 goto err; 7999 goto err;
7946 8000
7947 se = kzalloc_node(sizeof(struct sched_entity), 8001 se = kzalloc_node(sizeof(struct sched_entity),
7948 GFP_KERNEL, cpu_to_node(i)); 8002 GFP_KERNEL, cpu_to_node(i));
7949 if (!se) 8003 if (!se)
7950 goto err_free_rq; 8004 goto err_free_rq;
7951 8005
7952 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8006 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
7953 } 8007 }
7954 8008
7955 return 1; 8009 return 1;
7956 8010
7957 err_free_rq: 8011 err_free_rq:
7958 kfree(cfs_rq); 8012 kfree(cfs_rq);
7959 err: 8013 err:
7960 return 0; 8014 return 0;
7961 } 8015 }
7962 8016
7963 static inline void register_fair_sched_group(struct task_group *tg, int cpu) 8017 static inline void register_fair_sched_group(struct task_group *tg, int cpu)
7964 { 8018 {
7965 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, 8019 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
7966 &cpu_rq(cpu)->leaf_cfs_rq_list); 8020 &cpu_rq(cpu)->leaf_cfs_rq_list);
7967 } 8021 }
7968 8022
7969 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8023 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
7970 { 8024 {
7971 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8025 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
7972 } 8026 }
7973 #else /* !CONFIG_FAIR_GROUP_SCHED */ 8027 #else /* !CONFIG_FAIR_GROUP_SCHED */
7974 static inline void free_fair_sched_group(struct task_group *tg) 8028 static inline void free_fair_sched_group(struct task_group *tg)
7975 { 8029 {
7976 } 8030 }
7977 8031
7978 static inline 8032 static inline
7979 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8033 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7980 { 8034 {
7981 return 1; 8035 return 1;
7982 } 8036 }
7983 8037
7984 static inline void register_fair_sched_group(struct task_group *tg, int cpu) 8038 static inline void register_fair_sched_group(struct task_group *tg, int cpu)
7985 { 8039 {
7986 } 8040 }
7987 8041
7988 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8042 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
7989 { 8043 {
7990 } 8044 }
7991 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8045 #endif /* CONFIG_FAIR_GROUP_SCHED */
7992 8046
7993 #ifdef CONFIG_RT_GROUP_SCHED 8047 #ifdef CONFIG_RT_GROUP_SCHED
7994 static void free_rt_sched_group(struct task_group *tg) 8048 static void free_rt_sched_group(struct task_group *tg)
7995 { 8049 {
7996 int i; 8050 int i;
7997 8051
7998 destroy_rt_bandwidth(&tg->rt_bandwidth); 8052 destroy_rt_bandwidth(&tg->rt_bandwidth);
7999 8053
8000 for_each_possible_cpu(i) { 8054 for_each_possible_cpu(i) {
8001 if (tg->rt_rq) 8055 if (tg->rt_rq)
8002 kfree(tg->rt_rq[i]); 8056 kfree(tg->rt_rq[i]);
8003 if (tg->rt_se) 8057 if (tg->rt_se)
8004 kfree(tg->rt_se[i]); 8058 kfree(tg->rt_se[i]);
8005 } 8059 }
8006 8060
8007 kfree(tg->rt_rq); 8061 kfree(tg->rt_rq);
8008 kfree(tg->rt_se); 8062 kfree(tg->rt_se);
8009 } 8063 }
8010 8064
8011 static 8065 static
8012 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8066 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8013 { 8067 {
8014 struct rt_rq *rt_rq; 8068 struct rt_rq *rt_rq;
8015 struct sched_rt_entity *rt_se; 8069 struct sched_rt_entity *rt_se;
8016 struct rq *rq; 8070 struct rq *rq;
8017 int i; 8071 int i;
8018 8072
8019 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8073 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8020 if (!tg->rt_rq) 8074 if (!tg->rt_rq)
8021 goto err; 8075 goto err;
8022 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); 8076 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8023 if (!tg->rt_se) 8077 if (!tg->rt_se)
8024 goto err; 8078 goto err;
8025 8079
8026 init_rt_bandwidth(&tg->rt_bandwidth, 8080 init_rt_bandwidth(&tg->rt_bandwidth,
8027 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8081 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8028 8082
8029 for_each_possible_cpu(i) { 8083 for_each_possible_cpu(i) {
8030 rq = cpu_rq(i); 8084 rq = cpu_rq(i);
8031 8085
8032 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8086 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8033 GFP_KERNEL, cpu_to_node(i)); 8087 GFP_KERNEL, cpu_to_node(i));
8034 if (!rt_rq) 8088 if (!rt_rq)
8035 goto err; 8089 goto err;
8036 8090
8037 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 8091 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8038 GFP_KERNEL, cpu_to_node(i)); 8092 GFP_KERNEL, cpu_to_node(i));
8039 if (!rt_se) 8093 if (!rt_se)
8040 goto err_free_rq; 8094 goto err_free_rq;
8041 8095
8042 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8096 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
8043 } 8097 }
8044 8098
8045 return 1; 8099 return 1;
8046 8100
8047 err_free_rq: 8101 err_free_rq:
8048 kfree(rt_rq); 8102 kfree(rt_rq);
8049 err: 8103 err:
8050 return 0; 8104 return 0;
8051 } 8105 }
8052 8106
8053 static inline void register_rt_sched_group(struct task_group *tg, int cpu) 8107 static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8054 { 8108 {
8055 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, 8109 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8056 &cpu_rq(cpu)->leaf_rt_rq_list); 8110 &cpu_rq(cpu)->leaf_rt_rq_list);
8057 } 8111 }
8058 8112
8059 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8113 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8060 { 8114 {
8061 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); 8115 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8062 } 8116 }
8063 #else /* !CONFIG_RT_GROUP_SCHED */ 8117 #else /* !CONFIG_RT_GROUP_SCHED */
8064 static inline void free_rt_sched_group(struct task_group *tg) 8118 static inline void free_rt_sched_group(struct task_group *tg)
8065 { 8119 {
8066 } 8120 }
8067 8121
8068 static inline 8122 static inline
8069 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8123 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8070 { 8124 {
8071 return 1; 8125 return 1;
8072 } 8126 }
8073 8127
8074 static inline void register_rt_sched_group(struct task_group *tg, int cpu) 8128 static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8075 { 8129 {
8076 } 8130 }
8077 8131
8078 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8132 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8079 { 8133 {
8080 } 8134 }
8081 #endif /* CONFIG_RT_GROUP_SCHED */ 8135 #endif /* CONFIG_RT_GROUP_SCHED */
8082 8136
8083 #ifdef CONFIG_CGROUP_SCHED 8137 #ifdef CONFIG_CGROUP_SCHED
8084 static void free_sched_group(struct task_group *tg) 8138 static void free_sched_group(struct task_group *tg)
8085 { 8139 {
8086 free_fair_sched_group(tg); 8140 free_fair_sched_group(tg);
8087 free_rt_sched_group(tg); 8141 free_rt_sched_group(tg);
8088 kfree(tg); 8142 kfree(tg);
8089 } 8143 }
8090 8144
8091 /* allocate runqueue etc for a new task group */ 8145 /* allocate runqueue etc for a new task group */
8092 struct task_group *sched_create_group(struct task_group *parent) 8146 struct task_group *sched_create_group(struct task_group *parent)
8093 { 8147 {
8094 struct task_group *tg; 8148 struct task_group *tg;
8095 unsigned long flags; 8149 unsigned long flags;
8096 int i; 8150 int i;
8097 8151
8098 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8152 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8099 if (!tg) 8153 if (!tg)
8100 return ERR_PTR(-ENOMEM); 8154 return ERR_PTR(-ENOMEM);
8101 8155
8102 if (!alloc_fair_sched_group(tg, parent)) 8156 if (!alloc_fair_sched_group(tg, parent))
8103 goto err; 8157 goto err;
8104 8158
8105 if (!alloc_rt_sched_group(tg, parent)) 8159 if (!alloc_rt_sched_group(tg, parent))
8106 goto err; 8160 goto err;
8107 8161
8108 spin_lock_irqsave(&task_group_lock, flags); 8162 spin_lock_irqsave(&task_group_lock, flags);
8109 for_each_possible_cpu(i) { 8163 for_each_possible_cpu(i) {
8110 register_fair_sched_group(tg, i); 8164 register_fair_sched_group(tg, i);
8111 register_rt_sched_group(tg, i); 8165 register_rt_sched_group(tg, i);
8112 } 8166 }
8113 list_add_rcu(&tg->list, &task_groups); 8167 list_add_rcu(&tg->list, &task_groups);
8114 8168
8115 WARN_ON(!parent); /* root should already exist */ 8169 WARN_ON(!parent); /* root should already exist */
8116 8170
8117 tg->parent = parent; 8171 tg->parent = parent;
8118 INIT_LIST_HEAD(&tg->children); 8172 INIT_LIST_HEAD(&tg->children);
8119 list_add_rcu(&tg->siblings, &parent->children); 8173 list_add_rcu(&tg->siblings, &parent->children);
8120 spin_unlock_irqrestore(&task_group_lock, flags); 8174 spin_unlock_irqrestore(&task_group_lock, flags);
8121 8175
8122 return tg; 8176 return tg;
8123 8177
8124 err: 8178 err:
8125 free_sched_group(tg); 8179 free_sched_group(tg);
8126 return ERR_PTR(-ENOMEM); 8180 return ERR_PTR(-ENOMEM);
8127 } 8181 }
8128 8182
8129 /* rcu callback to free various structures associated with a task group */ 8183 /* rcu callback to free various structures associated with a task group */
8130 static void free_sched_group_rcu(struct rcu_head *rhp) 8184 static void free_sched_group_rcu(struct rcu_head *rhp)
8131 { 8185 {
8132 /* now it should be safe to free those cfs_rqs */ 8186 /* now it should be safe to free those cfs_rqs */
8133 free_sched_group(container_of(rhp, struct task_group, rcu)); 8187 free_sched_group(container_of(rhp, struct task_group, rcu));
8134 } 8188 }
8135 8189
8136 /* Destroy runqueue etc associated with a task group */ 8190 /* Destroy runqueue etc associated with a task group */
8137 void sched_destroy_group(struct task_group *tg) 8191 void sched_destroy_group(struct task_group *tg)
8138 { 8192 {
8139 unsigned long flags; 8193 unsigned long flags;
8140 int i; 8194 int i;
8141 8195
8142 spin_lock_irqsave(&task_group_lock, flags); 8196 spin_lock_irqsave(&task_group_lock, flags);
8143 for_each_possible_cpu(i) { 8197 for_each_possible_cpu(i) {
8144 unregister_fair_sched_group(tg, i); 8198 unregister_fair_sched_group(tg, i);
8145 unregister_rt_sched_group(tg, i); 8199 unregister_rt_sched_group(tg, i);
8146 } 8200 }
8147 list_del_rcu(&tg->list); 8201 list_del_rcu(&tg->list);
8148 list_del_rcu(&tg->siblings); 8202 list_del_rcu(&tg->siblings);
8149 spin_unlock_irqrestore(&task_group_lock, flags); 8203 spin_unlock_irqrestore(&task_group_lock, flags);
8150 8204
8151 /* wait for possible concurrent references to cfs_rqs to complete */ 8205 /* wait for possible concurrent references to cfs_rqs to complete */
8152 call_rcu(&tg->rcu, free_sched_group_rcu); 8206 call_rcu(&tg->rcu, free_sched_group_rcu);
8153 } 8207 }
8154 8208
8155 /* change task's runqueue when it moves between groups. 8209 /* change task's runqueue when it moves between groups.
8156 * The caller of this function should have put the task in its new group 8210 * The caller of this function should have put the task in its new group
8157 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 8211 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
8158 * reflect its new group. 8212 * reflect its new group.
8159 */ 8213 */
8160 void sched_move_task(struct task_struct *tsk) 8214 void sched_move_task(struct task_struct *tsk)
8161 { 8215 {
8162 int on_rq, running; 8216 int on_rq, running;
8163 unsigned long flags; 8217 unsigned long flags;
8164 struct rq *rq; 8218 struct rq *rq;
8165 8219
8166 rq = task_rq_lock(tsk, &flags); 8220 rq = task_rq_lock(tsk, &flags);
8167 8221
8168 running = task_current(rq, tsk); 8222 running = task_current(rq, tsk);
8169 on_rq = tsk->se.on_rq; 8223 on_rq = tsk->se.on_rq;
8170 8224
8171 if (on_rq) 8225 if (on_rq)
8172 dequeue_task(rq, tsk, 0); 8226 dequeue_task(rq, tsk, 0);
8173 if (unlikely(running)) 8227 if (unlikely(running))
8174 tsk->sched_class->put_prev_task(rq, tsk); 8228 tsk->sched_class->put_prev_task(rq, tsk);
8175 8229
8176 set_task_rq(tsk, task_cpu(tsk)); 8230 set_task_rq(tsk, task_cpu(tsk));
8177 8231
8178 #ifdef CONFIG_FAIR_GROUP_SCHED 8232 #ifdef CONFIG_FAIR_GROUP_SCHED
8179 if (tsk->sched_class->moved_group) 8233 if (tsk->sched_class->moved_group)
8180 tsk->sched_class->moved_group(tsk, on_rq); 8234 tsk->sched_class->moved_group(tsk, on_rq);
8181 #endif 8235 #endif
8182 8236
8183 if (unlikely(running)) 8237 if (unlikely(running))
8184 tsk->sched_class->set_curr_task(rq); 8238 tsk->sched_class->set_curr_task(rq);
8185 if (on_rq) 8239 if (on_rq)
8186 enqueue_task(rq, tsk, 0); 8240 enqueue_task(rq, tsk, 0);
8187 8241
8188 task_rq_unlock(rq, &flags); 8242 task_rq_unlock(rq, &flags);
8189 } 8243 }
8190 #endif /* CONFIG_CGROUP_SCHED */ 8244 #endif /* CONFIG_CGROUP_SCHED */
8191 8245
8192 #ifdef CONFIG_FAIR_GROUP_SCHED 8246 #ifdef CONFIG_FAIR_GROUP_SCHED
8193 static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8247 static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8194 { 8248 {
8195 struct cfs_rq *cfs_rq = se->cfs_rq; 8249 struct cfs_rq *cfs_rq = se->cfs_rq;
8196 int on_rq; 8250 int on_rq;
8197 8251
8198 on_rq = se->on_rq; 8252 on_rq = se->on_rq;
8199 if (on_rq) 8253 if (on_rq)
8200 dequeue_entity(cfs_rq, se, 0); 8254 dequeue_entity(cfs_rq, se, 0);
8201 8255
8202 se->load.weight = shares; 8256 se->load.weight = shares;
8203 se->load.inv_weight = 0; 8257 se->load.inv_weight = 0;
8204 8258
8205 if (on_rq) 8259 if (on_rq)
8206 enqueue_entity(cfs_rq, se, 0); 8260 enqueue_entity(cfs_rq, se, 0);
8207 } 8261 }
8208 8262
8209 static void set_se_shares(struct sched_entity *se, unsigned long shares) 8263 static void set_se_shares(struct sched_entity *se, unsigned long shares)
8210 { 8264 {
8211 struct cfs_rq *cfs_rq = se->cfs_rq; 8265 struct cfs_rq *cfs_rq = se->cfs_rq;
8212 struct rq *rq = cfs_rq->rq; 8266 struct rq *rq = cfs_rq->rq;
8213 unsigned long flags; 8267 unsigned long flags;
8214 8268
8215 raw_spin_lock_irqsave(&rq->lock, flags); 8269 raw_spin_lock_irqsave(&rq->lock, flags);
8216 __set_se_shares(se, shares); 8270 __set_se_shares(se, shares);
8217 raw_spin_unlock_irqrestore(&rq->lock, flags); 8271 raw_spin_unlock_irqrestore(&rq->lock, flags);
8218 } 8272 }
8219 8273
8220 static DEFINE_MUTEX(shares_mutex); 8274 static DEFINE_MUTEX(shares_mutex);
8221 8275
8222 int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8276 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8223 { 8277 {
8224 int i; 8278 int i;
8225 unsigned long flags; 8279 unsigned long flags;
8226 8280
8227 /* 8281 /*
8228 * We can't change the weight of the root cgroup. 8282 * We can't change the weight of the root cgroup.
8229 */ 8283 */
8230 if (!tg->se[0]) 8284 if (!tg->se[0])
8231 return -EINVAL; 8285 return -EINVAL;
8232 8286
8233 if (shares < MIN_SHARES) 8287 if (shares < MIN_SHARES)
8234 shares = MIN_SHARES; 8288 shares = MIN_SHARES;
8235 else if (shares > MAX_SHARES) 8289 else if (shares > MAX_SHARES)
8236 shares = MAX_SHARES; 8290 shares = MAX_SHARES;
8237 8291
8238 mutex_lock(&shares_mutex); 8292 mutex_lock(&shares_mutex);
8239 if (tg->shares == shares) 8293 if (tg->shares == shares)
8240 goto done; 8294 goto done;
8241 8295
8242 spin_lock_irqsave(&task_group_lock, flags); 8296 spin_lock_irqsave(&task_group_lock, flags);
8243 for_each_possible_cpu(i) 8297 for_each_possible_cpu(i)
8244 unregister_fair_sched_group(tg, i); 8298 unregister_fair_sched_group(tg, i);
8245 list_del_rcu(&tg->siblings); 8299 list_del_rcu(&tg->siblings);
8246 spin_unlock_irqrestore(&task_group_lock, flags); 8300 spin_unlock_irqrestore(&task_group_lock, flags);
8247 8301
8248 /* wait for any ongoing reference to this group to finish */ 8302 /* wait for any ongoing reference to this group to finish */
8249 synchronize_sched(); 8303 synchronize_sched();
8250 8304
8251 /* 8305 /*
8252 * Now we are free to modify the group's share on each cpu 8306 * Now we are free to modify the group's share on each cpu
8253 * w/o tripping rebalance_share or load_balance_fair. 8307 * w/o tripping rebalance_share or load_balance_fair.
8254 */ 8308 */
8255 tg->shares = shares; 8309 tg->shares = shares;
8256 for_each_possible_cpu(i) { 8310 for_each_possible_cpu(i) {
8257 /* 8311 /*
8258 * force a rebalance 8312 * force a rebalance
8259 */ 8313 */
8260 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8314 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8261 set_se_shares(tg->se[i], shares); 8315 set_se_shares(tg->se[i], shares);
8262 } 8316 }
8263 8317
8264 /* 8318 /*
8265 * Enable load balance activity on this group, by inserting it back on 8319 * Enable load balance activity on this group, by inserting it back on
8266 * each cpu's rq->leaf_cfs_rq_list. 8320 * each cpu's rq->leaf_cfs_rq_list.
8267 */ 8321 */
8268 spin_lock_irqsave(&task_group_lock, flags); 8322 spin_lock_irqsave(&task_group_lock, flags);
8269 for_each_possible_cpu(i) 8323 for_each_possible_cpu(i)
8270 register_fair_sched_group(tg, i); 8324 register_fair_sched_group(tg, i);
8271 list_add_rcu(&tg->siblings, &tg->parent->children); 8325 list_add_rcu(&tg->siblings, &tg->parent->children);
8272 spin_unlock_irqrestore(&task_group_lock, flags); 8326 spin_unlock_irqrestore(&task_group_lock, flags);
8273 done: 8327 done:
8274 mutex_unlock(&shares_mutex); 8328 mutex_unlock(&shares_mutex);
8275 return 0; 8329 return 0;
8276 } 8330 }
8277 8331
8278 unsigned long sched_group_shares(struct task_group *tg) 8332 unsigned long sched_group_shares(struct task_group *tg)
8279 { 8333 {
8280 return tg->shares; 8334 return tg->shares;
8281 } 8335 }
8282 #endif 8336 #endif
8283 8337
8284 #ifdef CONFIG_RT_GROUP_SCHED 8338 #ifdef CONFIG_RT_GROUP_SCHED
8285 /* 8339 /*
8286 * Ensure that the real time constraints are schedulable. 8340 * Ensure that the real time constraints are schedulable.
8287 */ 8341 */
8288 static DEFINE_MUTEX(rt_constraints_mutex); 8342 static DEFINE_MUTEX(rt_constraints_mutex);
8289 8343
8290 static unsigned long to_ratio(u64 period, u64 runtime) 8344 static unsigned long to_ratio(u64 period, u64 runtime)
8291 { 8345 {
8292 if (runtime == RUNTIME_INF) 8346 if (runtime == RUNTIME_INF)
8293 return 1ULL << 20; 8347 return 1ULL << 20;
8294 8348
8295 return div64_u64(runtime << 20, period); 8349 return div64_u64(runtime << 20, period);
8296 } 8350 }
8297 8351
8298 /* Must be called with tasklist_lock held */ 8352 /* Must be called with tasklist_lock held */
8299 static inline int tg_has_rt_tasks(struct task_group *tg) 8353 static inline int tg_has_rt_tasks(struct task_group *tg)
8300 { 8354 {
8301 struct task_struct *g, *p; 8355 struct task_struct *g, *p;
8302 8356
8303 do_each_thread(g, p) { 8357 do_each_thread(g, p) {
8304 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 8358 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8305 return 1; 8359 return 1;
8306 } while_each_thread(g, p); 8360 } while_each_thread(g, p);
8307 8361
8308 return 0; 8362 return 0;
8309 } 8363 }
8310 8364
8311 struct rt_schedulable_data { 8365 struct rt_schedulable_data {
8312 struct task_group *tg; 8366 struct task_group *tg;
8313 u64 rt_period; 8367 u64 rt_period;
8314 u64 rt_runtime; 8368 u64 rt_runtime;
8315 }; 8369 };
8316 8370
8317 static int tg_schedulable(struct task_group *tg, void *data) 8371 static int tg_schedulable(struct task_group *tg, void *data)
8318 { 8372 {
8319 struct rt_schedulable_data *d = data; 8373 struct rt_schedulable_data *d = data;
8320 struct task_group *child; 8374 struct task_group *child;
8321 unsigned long total, sum = 0; 8375 unsigned long total, sum = 0;
8322 u64 period, runtime; 8376 u64 period, runtime;
8323 8377
8324 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 8378 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8325 runtime = tg->rt_bandwidth.rt_runtime; 8379 runtime = tg->rt_bandwidth.rt_runtime;
8326 8380
8327 if (tg == d->tg) { 8381 if (tg == d->tg) {
8328 period = d->rt_period; 8382 period = d->rt_period;
8329 runtime = d->rt_runtime; 8383 runtime = d->rt_runtime;
8330 } 8384 }
8331 8385
8332 /* 8386 /*
8333 * Cannot have more runtime than the period. 8387 * Cannot have more runtime than the period.
8334 */ 8388 */
8335 if (runtime > period && runtime != RUNTIME_INF) 8389 if (runtime > period && runtime != RUNTIME_INF)
8336 return -EINVAL; 8390 return -EINVAL;
8337 8391
8338 /* 8392 /*
8339 * Ensure we don't starve existing RT tasks. 8393 * Ensure we don't starve existing RT tasks.
8340 */ 8394 */
8341 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 8395 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8342 return -EBUSY; 8396 return -EBUSY;
8343 8397
8344 total = to_ratio(period, runtime); 8398 total = to_ratio(period, runtime);
8345 8399
8346 /* 8400 /*
8347 * Nobody can have more than the global setting allows. 8401 * Nobody can have more than the global setting allows.
8348 */ 8402 */
8349 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 8403 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8350 return -EINVAL; 8404 return -EINVAL;
8351 8405
8352 /* 8406 /*
8353 * The sum of our children's runtime should not exceed our own. 8407 * The sum of our children's runtime should not exceed our own.
8354 */ 8408 */
8355 list_for_each_entry_rcu(child, &tg->children, siblings) { 8409 list_for_each_entry_rcu(child, &tg->children, siblings) {
8356 period = ktime_to_ns(child->rt_bandwidth.rt_period); 8410 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8357 runtime = child->rt_bandwidth.rt_runtime; 8411 runtime = child->rt_bandwidth.rt_runtime;
8358 8412
8359 if (child == d->tg) { 8413 if (child == d->tg) {
8360 period = d->rt_period; 8414 period = d->rt_period;
8361 runtime = d->rt_runtime; 8415 runtime = d->rt_runtime;
8362 } 8416 }
8363 8417
8364 sum += to_ratio(period, runtime); 8418 sum += to_ratio(period, runtime);
8365 } 8419 }
8366 8420
8367 if (sum > total) 8421 if (sum > total)
8368 return -EINVAL; 8422 return -EINVAL;
8369 8423
8370 return 0; 8424 return 0;
8371 } 8425 }
8372 8426
8373 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8427 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8374 { 8428 {
8375 struct rt_schedulable_data data = { 8429 struct rt_schedulable_data data = {
8376 .tg = tg, 8430 .tg = tg,
8377 .rt_period = period, 8431 .rt_period = period,
8378 .rt_runtime = runtime, 8432 .rt_runtime = runtime,
8379 }; 8433 };
8380 8434
8381 return walk_tg_tree(tg_schedulable, tg_nop, &data); 8435 return walk_tg_tree(tg_schedulable, tg_nop, &data);
8382 } 8436 }
8383 8437
8384 static int tg_set_bandwidth(struct task_group *tg, 8438 static int tg_set_bandwidth(struct task_group *tg,
8385 u64 rt_period, u64 rt_runtime) 8439 u64 rt_period, u64 rt_runtime)
8386 { 8440 {
8387 int i, err = 0; 8441 int i, err = 0;
8388 8442
8389 mutex_lock(&rt_constraints_mutex); 8443 mutex_lock(&rt_constraints_mutex);
8390 read_lock(&tasklist_lock); 8444 read_lock(&tasklist_lock);
8391 err = __rt_schedulable(tg, rt_period, rt_runtime); 8445 err = __rt_schedulable(tg, rt_period, rt_runtime);
8392 if (err) 8446 if (err)
8393 goto unlock; 8447 goto unlock;
8394 8448
8395 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8449 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8396 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8450 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
8397 tg->rt_bandwidth.rt_runtime = rt_runtime; 8451 tg->rt_bandwidth.rt_runtime = rt_runtime;
8398 8452
8399 for_each_possible_cpu(i) { 8453 for_each_possible_cpu(i) {
8400 struct rt_rq *rt_rq = tg->rt_rq[i]; 8454 struct rt_rq *rt_rq = tg->rt_rq[i];
8401 8455
8402 raw_spin_lock(&rt_rq->rt_runtime_lock); 8456 raw_spin_lock(&rt_rq->rt_runtime_lock);
8403 rt_rq->rt_runtime = rt_runtime; 8457 rt_rq->rt_runtime = rt_runtime;
8404 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8458 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8405 } 8459 }
8406 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8460 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8407 unlock: 8461 unlock:
8408 read_unlock(&tasklist_lock); 8462 read_unlock(&tasklist_lock);
8409 mutex_unlock(&rt_constraints_mutex); 8463 mutex_unlock(&rt_constraints_mutex);
8410 8464
8411 return err; 8465 return err;
8412 } 8466 }
8413 8467
8414 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 8468 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8415 { 8469 {
8416 u64 rt_runtime, rt_period; 8470 u64 rt_runtime, rt_period;
8417 8471
8418 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 8472 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8419 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 8473 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8420 if (rt_runtime_us < 0) 8474 if (rt_runtime_us < 0)
8421 rt_runtime = RUNTIME_INF; 8475 rt_runtime = RUNTIME_INF;
8422 8476
8423 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8477 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8424 } 8478 }
8425 8479
8426 long sched_group_rt_runtime(struct task_group *tg) 8480 long sched_group_rt_runtime(struct task_group *tg)
8427 { 8481 {
8428 u64 rt_runtime_us; 8482 u64 rt_runtime_us;
8429 8483
8430 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 8484 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
8431 return -1; 8485 return -1;
8432 8486
8433 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 8487 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
8434 do_div(rt_runtime_us, NSEC_PER_USEC); 8488 do_div(rt_runtime_us, NSEC_PER_USEC);
8435 return rt_runtime_us; 8489 return rt_runtime_us;
8436 } 8490 }
8437 8491
8438 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 8492 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8439 { 8493 {
8440 u64 rt_runtime, rt_period; 8494 u64 rt_runtime, rt_period;
8441 8495
8442 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 8496 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8443 rt_runtime = tg->rt_bandwidth.rt_runtime; 8497 rt_runtime = tg->rt_bandwidth.rt_runtime;
8444 8498
8445 if (rt_period == 0) 8499 if (rt_period == 0)
8446 return -EINVAL; 8500 return -EINVAL;
8447 8501
8448 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8502 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8449 } 8503 }
8450 8504
8451 long sched_group_rt_period(struct task_group *tg) 8505 long sched_group_rt_period(struct task_group *tg)
8452 { 8506 {
8453 u64 rt_period_us; 8507 u64 rt_period_us;
8454 8508
8455 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 8509 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
8456 do_div(rt_period_us, NSEC_PER_USEC); 8510 do_div(rt_period_us, NSEC_PER_USEC);
8457 return rt_period_us; 8511 return rt_period_us;
8458 } 8512 }
8459 8513
8460 static int sched_rt_global_constraints(void) 8514 static int sched_rt_global_constraints(void)
8461 { 8515 {
8462 u64 runtime, period; 8516 u64 runtime, period;
8463 int ret = 0; 8517 int ret = 0;
8464 8518
8465 if (sysctl_sched_rt_period <= 0) 8519 if (sysctl_sched_rt_period <= 0)
8466 return -EINVAL; 8520 return -EINVAL;
8467 8521
8468 runtime = global_rt_runtime(); 8522 runtime = global_rt_runtime();
8469 period = global_rt_period(); 8523 period = global_rt_period();
8470 8524
8471 /* 8525 /*
8472 * Sanity check on the sysctl variables. 8526 * Sanity check on the sysctl variables.
8473 */ 8527 */
8474 if (runtime > period && runtime != RUNTIME_INF) 8528 if (runtime > period && runtime != RUNTIME_INF)
8475 return -EINVAL; 8529 return -EINVAL;
8476 8530
8477 mutex_lock(&rt_constraints_mutex); 8531 mutex_lock(&rt_constraints_mutex);
8478 read_lock(&tasklist_lock); 8532 read_lock(&tasklist_lock);
8479 ret = __rt_schedulable(NULL, 0, 0); 8533 ret = __rt_schedulable(NULL, 0, 0);
8480 read_unlock(&tasklist_lock); 8534 read_unlock(&tasklist_lock);
8481 mutex_unlock(&rt_constraints_mutex); 8535 mutex_unlock(&rt_constraints_mutex);
8482 8536
8483 return ret; 8537 return ret;
8484 } 8538 }
8485 8539
8486 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 8540 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
8487 { 8541 {
8488 /* Don't accept realtime tasks when there is no way for them to run */ 8542 /* Don't accept realtime tasks when there is no way for them to run */
8489 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 8543 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
8490 return 0; 8544 return 0;
8491 8545
8492 return 1; 8546 return 1;
8493 } 8547 }
8494 8548
8495 #else /* !CONFIG_RT_GROUP_SCHED */ 8549 #else /* !CONFIG_RT_GROUP_SCHED */
8496 static int sched_rt_global_constraints(void) 8550 static int sched_rt_global_constraints(void)
8497 { 8551 {
8498 unsigned long flags; 8552 unsigned long flags;
8499 int i; 8553 int i;
8500 8554
8501 if (sysctl_sched_rt_period <= 0) 8555 if (sysctl_sched_rt_period <= 0)
8502 return -EINVAL; 8556 return -EINVAL;
8503 8557
8504 /* 8558 /*
8505 * There's always some RT tasks in the root group 8559 * There's always some RT tasks in the root group
8506 * -- migration, kstopmachine etc.. 8560 * -- migration, kstopmachine etc..
8507 */ 8561 */
8508 if (sysctl_sched_rt_runtime == 0) 8562 if (sysctl_sched_rt_runtime == 0)
8509 return -EBUSY; 8563 return -EBUSY;
8510 8564
8511 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 8565 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8512 for_each_possible_cpu(i) { 8566 for_each_possible_cpu(i) {
8513 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 8567 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
8514 8568
8515 raw_spin_lock(&rt_rq->rt_runtime_lock); 8569 raw_spin_lock(&rt_rq->rt_runtime_lock);
8516 rt_rq->rt_runtime = global_rt_runtime(); 8570 rt_rq->rt_runtime = global_rt_runtime();
8517 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8571 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8518 } 8572 }
8519 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 8573 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
8520 8574
8521 return 0; 8575 return 0;
8522 } 8576 }
8523 #endif /* CONFIG_RT_GROUP_SCHED */ 8577 #endif /* CONFIG_RT_GROUP_SCHED */
8524 8578
8525 int sched_rt_handler(struct ctl_table *table, int write, 8579 int sched_rt_handler(struct ctl_table *table, int write,
8526 void __user *buffer, size_t *lenp, 8580 void __user *buffer, size_t *lenp,
8527 loff_t *ppos) 8581 loff_t *ppos)
8528 { 8582 {
8529 int ret; 8583 int ret;
8530 int old_period, old_runtime; 8584 int old_period, old_runtime;
8531 static DEFINE_MUTEX(mutex); 8585 static DEFINE_MUTEX(mutex);
8532 8586
8533 mutex_lock(&mutex); 8587 mutex_lock(&mutex);
8534 old_period = sysctl_sched_rt_period; 8588 old_period = sysctl_sched_rt_period;
8535 old_runtime = sysctl_sched_rt_runtime; 8589 old_runtime = sysctl_sched_rt_runtime;
8536 8590
8537 ret = proc_dointvec(table, write, buffer, lenp, ppos); 8591 ret = proc_dointvec(table, write, buffer, lenp, ppos);
8538 8592
8539 if (!ret && write) { 8593 if (!ret && write) {
8540 ret = sched_rt_global_constraints(); 8594 ret = sched_rt_global_constraints();
8541 if (ret) { 8595 if (ret) {
8542 sysctl_sched_rt_period = old_period; 8596 sysctl_sched_rt_period = old_period;
8543 sysctl_sched_rt_runtime = old_runtime; 8597 sysctl_sched_rt_runtime = old_runtime;
8544 } else { 8598 } else {
8545 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 8599 def_rt_bandwidth.rt_runtime = global_rt_runtime();
8546 def_rt_bandwidth.rt_period = 8600 def_rt_bandwidth.rt_period =
8547 ns_to_ktime(global_rt_period()); 8601 ns_to_ktime(global_rt_period());
8548 } 8602 }
8549 } 8603 }
8550 mutex_unlock(&mutex); 8604 mutex_unlock(&mutex);
8551 8605
8552 return ret; 8606 return ret;
8553 } 8607 }
8554 8608
8555 #ifdef CONFIG_CGROUP_SCHED 8609 #ifdef CONFIG_CGROUP_SCHED
8556 8610
8557 /* return corresponding task_group object of a cgroup */ 8611 /* return corresponding task_group object of a cgroup */
8558 static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 8612 static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
8559 { 8613 {
8560 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 8614 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
8561 struct task_group, css); 8615 struct task_group, css);
8562 } 8616 }
8563 8617
8564 static struct cgroup_subsys_state * 8618 static struct cgroup_subsys_state *
8565 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) 8619 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8566 { 8620 {
8567 struct task_group *tg, *parent; 8621 struct task_group *tg, *parent;
8568 8622
8569 if (!cgrp->parent) { 8623 if (!cgrp->parent) {
8570 /* This is early initialization for the top cgroup */ 8624 /* This is early initialization for the top cgroup */
8571 return &init_task_group.css; 8625 return &init_task_group.css;
8572 } 8626 }
8573 8627
8574 parent = cgroup_tg(cgrp->parent); 8628 parent = cgroup_tg(cgrp->parent);
8575 tg = sched_create_group(parent); 8629 tg = sched_create_group(parent);
8576 if (IS_ERR(tg)) 8630 if (IS_ERR(tg))
8577 return ERR_PTR(-ENOMEM); 8631 return ERR_PTR(-ENOMEM);
8578 8632
8579 return &tg->css; 8633 return &tg->css;
8580 } 8634 }
8581 8635
8582 static void 8636 static void
8583 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 8637 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
8584 { 8638 {
8585 struct task_group *tg = cgroup_tg(cgrp); 8639 struct task_group *tg = cgroup_tg(cgrp);
8586 8640
8587 sched_destroy_group(tg); 8641 sched_destroy_group(tg);
8588 } 8642 }
8589 8643
8590 static int 8644 static int
8591 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 8645 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8592 { 8646 {
8593 #ifdef CONFIG_RT_GROUP_SCHED 8647 #ifdef CONFIG_RT_GROUP_SCHED
8594 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 8648 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
8595 return -EINVAL; 8649 return -EINVAL;
8596 #else 8650 #else
8597 /* We don't support RT-tasks being in separate groups */ 8651 /* We don't support RT-tasks being in separate groups */
8598 if (tsk->sched_class != &fair_sched_class) 8652 if (tsk->sched_class != &fair_sched_class)
8599 return -EINVAL; 8653 return -EINVAL;
8600 #endif 8654 #endif
8601 return 0; 8655 return 0;
8602 } 8656 }
8603 8657
8604 static int 8658 static int
8605 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 8659 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8606 struct task_struct *tsk, bool threadgroup) 8660 struct task_struct *tsk, bool threadgroup)
8607 { 8661 {
8608 int retval = cpu_cgroup_can_attach_task(cgrp, tsk); 8662 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
8609 if (retval) 8663 if (retval)
8610 return retval; 8664 return retval;
8611 if (threadgroup) { 8665 if (threadgroup) {
8612 struct task_struct *c; 8666 struct task_struct *c;
8613 rcu_read_lock(); 8667 rcu_read_lock();
8614 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 8668 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8615 retval = cpu_cgroup_can_attach_task(cgrp, c); 8669 retval = cpu_cgroup_can_attach_task(cgrp, c);
8616 if (retval) { 8670 if (retval) {
8617 rcu_read_unlock(); 8671 rcu_read_unlock();
8618 return retval; 8672 return retval;
8619 } 8673 }
8620 } 8674 }
8621 rcu_read_unlock(); 8675 rcu_read_unlock();
8622 } 8676 }
8623 return 0; 8677 return 0;
8624 } 8678 }
8625 8679
8626 static void 8680 static void
8627 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 8681 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8628 struct cgroup *old_cont, struct task_struct *tsk, 8682 struct cgroup *old_cont, struct task_struct *tsk,
8629 bool threadgroup) 8683 bool threadgroup)
8630 { 8684 {
8631 sched_move_task(tsk); 8685 sched_move_task(tsk);
8632 if (threadgroup) { 8686 if (threadgroup) {
8633 struct task_struct *c; 8687 struct task_struct *c;
8634 rcu_read_lock(); 8688 rcu_read_lock();
8635 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 8689 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8636 sched_move_task(c); 8690 sched_move_task(c);
8637 } 8691 }
8638 rcu_read_unlock(); 8692 rcu_read_unlock();
8639 } 8693 }
8640 } 8694 }
8641 8695
8642 #ifdef CONFIG_FAIR_GROUP_SCHED 8696 #ifdef CONFIG_FAIR_GROUP_SCHED
8643 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 8697 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8644 u64 shareval) 8698 u64 shareval)
8645 { 8699 {
8646 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 8700 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
8647 } 8701 }
8648 8702
8649 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 8703 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8650 { 8704 {
8651 struct task_group *tg = cgroup_tg(cgrp); 8705 struct task_group *tg = cgroup_tg(cgrp);
8652 8706
8653 return (u64) tg->shares; 8707 return (u64) tg->shares;
8654 } 8708 }
8655 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8709 #endif /* CONFIG_FAIR_GROUP_SCHED */
8656 8710
8657 #ifdef CONFIG_RT_GROUP_SCHED 8711 #ifdef CONFIG_RT_GROUP_SCHED
8658 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 8712 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
8659 s64 val) 8713 s64 val)
8660 { 8714 {
8661 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 8715 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
8662 } 8716 }
8663 8717
8664 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 8718 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
8665 { 8719 {
8666 return sched_group_rt_runtime(cgroup_tg(cgrp)); 8720 return sched_group_rt_runtime(cgroup_tg(cgrp));
8667 } 8721 }
8668 8722
8669 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 8723 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
8670 u64 rt_period_us) 8724 u64 rt_period_us)
8671 { 8725 {
8672 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 8726 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
8673 } 8727 }
8674 8728
8675 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 8729 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
8676 { 8730 {
8677 return sched_group_rt_period(cgroup_tg(cgrp)); 8731 return sched_group_rt_period(cgroup_tg(cgrp));
8678 } 8732 }
8679 #endif /* CONFIG_RT_GROUP_SCHED */ 8733 #endif /* CONFIG_RT_GROUP_SCHED */
8680 8734
8681 static struct cftype cpu_files[] = { 8735 static struct cftype cpu_files[] = {
8682 #ifdef CONFIG_FAIR_GROUP_SCHED 8736 #ifdef CONFIG_FAIR_GROUP_SCHED
8683 { 8737 {
8684 .name = "shares", 8738 .name = "shares",
8685 .read_u64 = cpu_shares_read_u64, 8739 .read_u64 = cpu_shares_read_u64,
8686 .write_u64 = cpu_shares_write_u64, 8740 .write_u64 = cpu_shares_write_u64,
8687 }, 8741 },
8688 #endif 8742 #endif
8689 #ifdef CONFIG_RT_GROUP_SCHED 8743 #ifdef CONFIG_RT_GROUP_SCHED
8690 { 8744 {
8691 .name = "rt_runtime_us", 8745 .name = "rt_runtime_us",
8692 .read_s64 = cpu_rt_runtime_read, 8746 .read_s64 = cpu_rt_runtime_read,
8693 .write_s64 = cpu_rt_runtime_write, 8747 .write_s64 = cpu_rt_runtime_write,
8694 }, 8748 },
8695 { 8749 {
8696 .name = "rt_period_us", 8750 .name = "rt_period_us",
8697 .read_u64 = cpu_rt_period_read_uint, 8751 .read_u64 = cpu_rt_period_read_uint,
8698 .write_u64 = cpu_rt_period_write_uint, 8752 .write_u64 = cpu_rt_period_write_uint,
8699 }, 8753 },
8700 #endif 8754 #endif
8701 }; 8755 };
8702 8756
8703 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 8757 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
8704 { 8758 {
8705 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); 8759 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
8706 } 8760 }
8707 8761
8708 struct cgroup_subsys cpu_cgroup_subsys = { 8762 struct cgroup_subsys cpu_cgroup_subsys = {
8709 .name = "cpu", 8763 .name = "cpu",
8710 .create = cpu_cgroup_create, 8764 .create = cpu_cgroup_create,
8711 .destroy = cpu_cgroup_destroy, 8765 .destroy = cpu_cgroup_destroy,
8712 .can_attach = cpu_cgroup_can_attach, 8766 .can_attach = cpu_cgroup_can_attach,
8713 .attach = cpu_cgroup_attach, 8767 .attach = cpu_cgroup_attach,
8714 .populate = cpu_cgroup_populate, 8768 .populate = cpu_cgroup_populate,
8715 .subsys_id = cpu_cgroup_subsys_id, 8769 .subsys_id = cpu_cgroup_subsys_id,
8716 .early_init = 1, 8770 .early_init = 1,
8717 }; 8771 };
8718 8772
8719 #endif /* CONFIG_CGROUP_SCHED */ 8773 #endif /* CONFIG_CGROUP_SCHED */
8720 8774
8721 #ifdef CONFIG_CGROUP_CPUACCT 8775 #ifdef CONFIG_CGROUP_CPUACCT
8722 8776
8723 /* 8777 /*
8724 * CPU accounting code for task groups. 8778 * CPU accounting code for task groups.
8725 * 8779 *
8726 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 8780 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
8727 * (balbir@in.ibm.com). 8781 * (balbir@in.ibm.com).
8728 */ 8782 */
8729 8783
8730 /* track cpu usage of a group of tasks and its child groups */ 8784 /* track cpu usage of a group of tasks and its child groups */
8731 struct cpuacct { 8785 struct cpuacct {
8732 struct cgroup_subsys_state css; 8786 struct cgroup_subsys_state css;
8733 /* cpuusage holds pointer to a u64-type object on every cpu */ 8787 /* cpuusage holds pointer to a u64-type object on every cpu */
8734 u64 __percpu *cpuusage; 8788 u64 __percpu *cpuusage;
8735 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 8789 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
8736 struct cpuacct *parent; 8790 struct cpuacct *parent;
8737 }; 8791 };
8738 8792
8739 struct cgroup_subsys cpuacct_subsys; 8793 struct cgroup_subsys cpuacct_subsys;
8740 8794
8741 /* return cpu accounting group corresponding to this container */ 8795 /* return cpu accounting group corresponding to this container */
8742 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) 8796 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
8743 { 8797 {
8744 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), 8798 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
8745 struct cpuacct, css); 8799 struct cpuacct, css);
8746 } 8800 }
8747 8801
8748 /* return cpu accounting group to which this task belongs */ 8802 /* return cpu accounting group to which this task belongs */
8749 static inline struct cpuacct *task_ca(struct task_struct *tsk) 8803 static inline struct cpuacct *task_ca(struct task_struct *tsk)
8750 { 8804 {
8751 return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 8805 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
8752 struct cpuacct, css); 8806 struct cpuacct, css);
8753 } 8807 }
8754 8808
8755 /* create a new cpu accounting group */ 8809 /* create a new cpu accounting group */
8756 static struct cgroup_subsys_state *cpuacct_create( 8810 static struct cgroup_subsys_state *cpuacct_create(
8757 struct cgroup_subsys *ss, struct cgroup *cgrp) 8811 struct cgroup_subsys *ss, struct cgroup *cgrp)
8758 { 8812 {
8759 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 8813 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
8760 int i; 8814 int i;
8761 8815
8762 if (!ca) 8816 if (!ca)
8763 goto out; 8817 goto out;
8764 8818
8765 ca->cpuusage = alloc_percpu(u64); 8819 ca->cpuusage = alloc_percpu(u64);
8766 if (!ca->cpuusage) 8820 if (!ca->cpuusage)
8767 goto out_free_ca; 8821 goto out_free_ca;
8768 8822
8769 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 8823 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
8770 if (percpu_counter_init(&ca->cpustat[i], 0)) 8824 if (percpu_counter_init(&ca->cpustat[i], 0))
8771 goto out_free_counters; 8825 goto out_free_counters;
8772 8826
8773 if (cgrp->parent) 8827 if (cgrp->parent)
8774 ca->parent = cgroup_ca(cgrp->parent); 8828 ca->parent = cgroup_ca(cgrp->parent);
8775 8829
8776 return &ca->css; 8830 return &ca->css;
8777 8831
8778 out_free_counters: 8832 out_free_counters:
8779 while (--i >= 0) 8833 while (--i >= 0)
8780 percpu_counter_destroy(&ca->cpustat[i]); 8834 percpu_counter_destroy(&ca->cpustat[i]);
8781 free_percpu(ca->cpuusage); 8835 free_percpu(ca->cpuusage);
8782 out_free_ca: 8836 out_free_ca:
8783 kfree(ca); 8837 kfree(ca);
8784 out: 8838 out:
8785 return ERR_PTR(-ENOMEM); 8839 return ERR_PTR(-ENOMEM);
8786 } 8840 }
8787 8841
8788 /* destroy an existing cpu accounting group */ 8842 /* destroy an existing cpu accounting group */
8789 static void 8843 static void
8790 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 8844 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
8791 { 8845 {
8792 struct cpuacct *ca = cgroup_ca(cgrp); 8846 struct cpuacct *ca = cgroup_ca(cgrp);
8793 int i; 8847 int i;
8794 8848
8795 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 8849 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
8796 percpu_counter_destroy(&ca->cpustat[i]); 8850 percpu_counter_destroy(&ca->cpustat[i]);
8797 free_percpu(ca->cpuusage); 8851 free_percpu(ca->cpuusage);
8798 kfree(ca); 8852 kfree(ca);
8799 } 8853 }
8800 8854
8801 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 8855 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
8802 { 8856 {
8803 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8857 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8804 u64 data; 8858 u64 data;
8805 8859
8806 #ifndef CONFIG_64BIT 8860 #ifndef CONFIG_64BIT
8807 /* 8861 /*
8808 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 8862 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
8809 */ 8863 */
8810 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 8864 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8811 data = *cpuusage; 8865 data = *cpuusage;
8812 raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 8866 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8813 #else 8867 #else
8814 data = *cpuusage; 8868 data = *cpuusage;
8815 #endif 8869 #endif
8816 8870
8817 return data; 8871 return data;
8818 } 8872 }
8819 8873
8820 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 8874 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
8821 { 8875 {
8822 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8876 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8823 8877
8824 #ifndef CONFIG_64BIT 8878 #ifndef CONFIG_64BIT
8825 /* 8879 /*
8826 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 8880 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
8827 */ 8881 */
8828 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 8882 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8829 *cpuusage = val; 8883 *cpuusage = val;
8830 raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 8884 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8831 #else 8885 #else
8832 *cpuusage = val; 8886 *cpuusage = val;
8833 #endif 8887 #endif
8834 } 8888 }
8835 8889
8836 /* return total cpu usage (in nanoseconds) of a group */ 8890 /* return total cpu usage (in nanoseconds) of a group */
8837 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 8891 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8838 { 8892 {
8839 struct cpuacct *ca = cgroup_ca(cgrp); 8893 struct cpuacct *ca = cgroup_ca(cgrp);
8840 u64 totalcpuusage = 0; 8894 u64 totalcpuusage = 0;
8841 int i; 8895 int i;
8842 8896
8843 for_each_present_cpu(i) 8897 for_each_present_cpu(i)
8844 totalcpuusage += cpuacct_cpuusage_read(ca, i); 8898 totalcpuusage += cpuacct_cpuusage_read(ca, i);
8845 8899
8846 return totalcpuusage; 8900 return totalcpuusage;
8847 } 8901 }
8848 8902
8849 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 8903 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
8850 u64 reset) 8904 u64 reset)
8851 { 8905 {
8852 struct cpuacct *ca = cgroup_ca(cgrp); 8906 struct cpuacct *ca = cgroup_ca(cgrp);
8853 int err = 0; 8907 int err = 0;
8854 int i; 8908 int i;
8855 8909
8856 if (reset) { 8910 if (reset) {
8857 err = -EINVAL; 8911 err = -EINVAL;
8858 goto out; 8912 goto out;
8859 } 8913 }
8860 8914
8861 for_each_present_cpu(i) 8915 for_each_present_cpu(i)
8862 cpuacct_cpuusage_write(ca, i, 0); 8916 cpuacct_cpuusage_write(ca, i, 0);
8863 8917
8864 out: 8918 out:
8865 return err; 8919 return err;
8866 } 8920 }
8867 8921
8868 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 8922 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
8869 struct seq_file *m) 8923 struct seq_file *m)
8870 { 8924 {
8871 struct cpuacct *ca = cgroup_ca(cgroup); 8925 struct cpuacct *ca = cgroup_ca(cgroup);
8872 u64 percpu; 8926 u64 percpu;
8873 int i; 8927 int i;
8874 8928
8875 for_each_present_cpu(i) { 8929 for_each_present_cpu(i) {
8876 percpu = cpuacct_cpuusage_read(ca, i); 8930 percpu = cpuacct_cpuusage_read(ca, i);
8877 seq_printf(m, "%llu ", (unsigned long long) percpu); 8931 seq_printf(m, "%llu ", (unsigned long long) percpu);
8878 } 8932 }
8879 seq_printf(m, "\n"); 8933 seq_printf(m, "\n");
8880 return 0; 8934 return 0;
8881 } 8935 }
8882 8936
8883 static const char *cpuacct_stat_desc[] = { 8937 static const char *cpuacct_stat_desc[] = {
8884 [CPUACCT_STAT_USER] = "user", 8938 [CPUACCT_STAT_USER] = "user",
8885 [CPUACCT_STAT_SYSTEM] = "system", 8939 [CPUACCT_STAT_SYSTEM] = "system",
8886 }; 8940 };
8887 8941
8888 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 8942 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
8889 struct cgroup_map_cb *cb) 8943 struct cgroup_map_cb *cb)
8890 { 8944 {
8891 struct cpuacct *ca = cgroup_ca(cgrp); 8945 struct cpuacct *ca = cgroup_ca(cgrp);
8892 int i; 8946 int i;
8893 8947
8894 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { 8948 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
8895 s64 val = percpu_counter_read(&ca->cpustat[i]); 8949 s64 val = percpu_counter_read(&ca->cpustat[i]);
8896 val = cputime64_to_clock_t(val); 8950 val = cputime64_to_clock_t(val);
8897 cb->fill(cb, cpuacct_stat_desc[i], val); 8951 cb->fill(cb, cpuacct_stat_desc[i], val);
8898 } 8952 }
8899 return 0; 8953 return 0;
8900 } 8954 }
8901 8955
8902 static struct cftype files[] = { 8956 static struct cftype files[] = {
8903 { 8957 {
8904 .name = "usage", 8958 .name = "usage",
8905 .read_u64 = cpuusage_read, 8959 .read_u64 = cpuusage_read,
8906 .write_u64 = cpuusage_write, 8960 .write_u64 = cpuusage_write,
8907 }, 8961 },
8908 { 8962 {
8909 .name = "usage_percpu", 8963 .name = "usage_percpu",
8910 .read_seq_string = cpuacct_percpu_seq_read, 8964 .read_seq_string = cpuacct_percpu_seq_read,
8911 }, 8965 },
8912 { 8966 {
8913 .name = "stat", 8967 .name = "stat",
8914 .read_map = cpuacct_stats_show, 8968 .read_map = cpuacct_stats_show,
8915 }, 8969 },
8916 }; 8970 };
8917 8971
8918 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 8972 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
8919 { 8973 {
8920 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); 8974 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
8921 } 8975 }
8922 8976
8923 /* 8977 /*
8924 * charge this task's execution time to its accounting group. 8978 * charge this task's execution time to its accounting group.
8925 * 8979 *
8926 * called with rq->lock held. 8980 * called with rq->lock held.
8927 */ 8981 */
8928 static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 8982 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8929 { 8983 {
8930 struct cpuacct *ca; 8984 struct cpuacct *ca;
8931 int cpu; 8985 int cpu;
8932 8986
8933 if (unlikely(!cpuacct_subsys.active)) 8987 if (unlikely(!cpuacct_subsys.active))
8934 return; 8988 return;
8935 8989
8936 cpu = task_cpu(tsk); 8990 cpu = task_cpu(tsk);
8937 8991
8938 rcu_read_lock(); 8992 rcu_read_lock();
8939 8993
8940 ca = task_ca(tsk); 8994 ca = task_ca(tsk);
8941 8995
8942 for (; ca; ca = ca->parent) { 8996 for (; ca; ca = ca->parent) {
8943 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8997 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8944 *cpuusage += cputime; 8998 *cpuusage += cputime;
8945 } 8999 }
8946 9000
8947 rcu_read_unlock(); 9001 rcu_read_unlock();
8948 } 9002 }
8949 9003
8950 /* 9004 /*
8951 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large 9005 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
8952 * in cputime_t units. As a result, cpuacct_update_stats calls 9006 * in cputime_t units. As a result, cpuacct_update_stats calls
8953 * percpu_counter_add with values large enough to always overflow the 9007 * percpu_counter_add with values large enough to always overflow the
8954 * per cpu batch limit causing bad SMP scalability. 9008 * per cpu batch limit causing bad SMP scalability.
8955 * 9009 *
8956 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we 9010 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
8957 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled 9011 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
8958 * and enabled. We cap it at INT_MAX which is the largest allowed batch value. 9012 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
8959 */ 9013 */
8960 #ifdef CONFIG_SMP 9014 #ifdef CONFIG_SMP
8961 #define CPUACCT_BATCH \ 9015 #define CPUACCT_BATCH \
8962 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) 9016 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
8963 #else 9017 #else
8964 #define CPUACCT_BATCH 0 9018 #define CPUACCT_BATCH 0
8965 #endif 9019 #endif
8966 9020
8967 /* 9021 /*
8968 * Charge the system/user time to the task's accounting group. 9022 * Charge the system/user time to the task's accounting group.
8969 */ 9023 */
8970 static void cpuacct_update_stats(struct task_struct *tsk, 9024 static void cpuacct_update_stats(struct task_struct *tsk,
8971 enum cpuacct_stat_index idx, cputime_t val) 9025 enum cpuacct_stat_index idx, cputime_t val)
8972 { 9026 {
8973 struct cpuacct *ca; 9027 struct cpuacct *ca;
8974 int batch = CPUACCT_BATCH; 9028 int batch = CPUACCT_BATCH;
8975 9029
8976 if (unlikely(!cpuacct_subsys.active)) 9030 if (unlikely(!cpuacct_subsys.active))
8977 return; 9031 return;
8978 9032
8979 rcu_read_lock(); 9033 rcu_read_lock();
8980 ca = task_ca(tsk); 9034 ca = task_ca(tsk);
8981 9035
8982 do { 9036 do {
8983 __percpu_counter_add(&ca->cpustat[idx], val, batch); 9037 __percpu_counter_add(&ca->cpustat[idx], val, batch);
8984 ca = ca->parent; 9038 ca = ca->parent;
8985 } while (ca); 9039 } while (ca);
8986 rcu_read_unlock(); 9040 rcu_read_unlock();
8987 } 9041 }
8988 9042
8989 struct cgroup_subsys cpuacct_subsys = { 9043 struct cgroup_subsys cpuacct_subsys = {
8990 .name = "cpuacct", 9044 .name = "cpuacct",
8991 .create = cpuacct_create, 9045 .create = cpuacct_create,
8992 .destroy = cpuacct_destroy, 9046 .destroy = cpuacct_destroy,
8993 .populate = cpuacct_populate, 9047 .populate = cpuacct_populate,
8994 .subsys_id = cpuacct_subsys_id, 9048 .subsys_id = cpuacct_subsys_id,
8995 }; 9049 };
8996 #endif /* CONFIG_CGROUP_CPUACCT */ 9050 #endif /* CONFIG_CGROUP_CPUACCT */
8997 9051
8998 #ifndef CONFIG_SMP 9052 #ifndef CONFIG_SMP
8999 9053
9000 int rcu_expedited_torture_stats(char *page) 9054 int rcu_expedited_torture_stats(char *page)
9001 { 9055 {
9002 return 0; 9056 return 0;
9003 } 9057 }
9004 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); 9058 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9005 9059
9006 void synchronize_sched_expedited(void) 9060 void synchronize_sched_expedited(void)
9007 { 9061 {
9008 } 9062 }
9009 EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 9063 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9010 9064
9011 #else /* #ifndef CONFIG_SMP */ 9065 #else /* #ifndef CONFIG_SMP */
9012 9066
9013 static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); 9067 static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
9014 static DEFINE_MUTEX(rcu_sched_expedited_mutex); 9068 static DEFINE_MUTEX(rcu_sched_expedited_mutex);
9015 9069
9016 #define RCU_EXPEDITED_STATE_POST -2 9070 #define RCU_EXPEDITED_STATE_POST -2
9017 #define RCU_EXPEDITED_STATE_IDLE -1 9071 #define RCU_EXPEDITED_STATE_IDLE -1
9018 9072
9019 static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 9073 static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9020 9074
9021 int rcu_expedited_torture_stats(char *page) 9075 int rcu_expedited_torture_stats(char *page)
9022 { 9076 {
9023 int cnt = 0; 9077 int cnt = 0;
9024 int cpu; 9078 int cpu;
9025 9079
9026 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); 9080 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
9027 for_each_online_cpu(cpu) { 9081 for_each_online_cpu(cpu) {
9028 cnt += sprintf(&page[cnt], " %d:%d", 9082 cnt += sprintf(&page[cnt], " %d:%d",
9029 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); 9083 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
9030 } 9084 }
9031 cnt += sprintf(&page[cnt], "\n"); 9085 cnt += sprintf(&page[cnt], "\n");
9032 return cnt; 9086 return cnt;
9033 } 9087 }
9034 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); 9088 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9035 9089
9036 static long synchronize_sched_expedited_count; 9090 static long synchronize_sched_expedited_count;
9037 9091
9038 /* 9092 /*
9039 * Wait for an rcu-sched grace period to elapse, but use "big hammer" 9093 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
9040 * approach to force grace period to end quickly. This consumes 9094 * approach to force grace period to end quickly. This consumes
9041 * significant time on all CPUs, and is thus not recommended for 9095 * significant time on all CPUs, and is thus not recommended for
9042 * any sort of common-case code. 9096 * any sort of common-case code.
9043 * 9097 *
9044 * Note that it is illegal to call this function while holding any 9098 * Note that it is illegal to call this function while holding any
9045 * lock that is acquired by a CPU-hotplug notifier. Failing to 9099 * lock that is acquired by a CPU-hotplug notifier. Failing to
9046 * observe this restriction will result in deadlock. 9100 * observe this restriction will result in deadlock.
9047 */ 9101 */
9048 void synchronize_sched_expedited(void) 9102 void synchronize_sched_expedited(void)
9049 { 9103 {
9050 int cpu; 9104 int cpu;
9051 unsigned long flags; 9105 unsigned long flags;
9052 bool need_full_sync = 0; 9106 bool need_full_sync = 0;
9053 struct rq *rq; 9107 struct rq *rq;
9054 struct migration_req *req; 9108 struct migration_req *req;
9055 long snap; 9109 long snap;
9056 int trycount = 0; 9110 int trycount = 0;
9057 9111
9058 smp_mb(); /* ensure prior mod happens before capturing snap. */ 9112 smp_mb(); /* ensure prior mod happens before capturing snap. */
9059 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; 9113 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
9060 get_online_cpus(); 9114 get_online_cpus();
9061 while (!mutex_trylock(&rcu_sched_expedited_mutex)) { 9115 while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
9062 put_online_cpus(); 9116 put_online_cpus();
9063 if (trycount++ < 10) 9117 if (trycount++ < 10)
9064 udelay(trycount * num_online_cpus()); 9118 udelay(trycount * num_online_cpus());
9065 else { 9119 else {
9066 synchronize_sched(); 9120 synchronize_sched();
9067 return; 9121 return;
9068 } 9122 }
9069 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { 9123 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
9070 smp_mb(); /* ensure test happens before caller kfree */ 9124 smp_mb(); /* ensure test happens before caller kfree */
9071 return; 9125 return;
9072 } 9126 }
9073 get_online_cpus(); 9127 get_online_cpus();
9074 } 9128 }
9075 rcu_expedited_state = RCU_EXPEDITED_STATE_POST; 9129 rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
9076 for_each_online_cpu(cpu) { 9130 for_each_online_cpu(cpu) {
9077 rq = cpu_rq(cpu); 9131 rq = cpu_rq(cpu);
9078 req = &per_cpu(rcu_migration_req, cpu); 9132 req = &per_cpu(rcu_migration_req, cpu);
9079 init_completion(&req->done); 9133 init_completion(&req->done);
9080 req->task = NULL; 9134 req->task = NULL;
9081 req->dest_cpu = RCU_MIGRATION_NEED_QS; 9135 req->dest_cpu = RCU_MIGRATION_NEED_QS;
9082 raw_spin_lock_irqsave(&rq->lock, flags); 9136 raw_spin_lock_irqsave(&rq->lock, flags);
9083 list_add(&req->list, &rq->migration_queue); 9137 list_add(&req->list, &rq->migration_queue);
9084 raw_spin_unlock_irqrestore(&rq->lock, flags); 9138 raw_spin_unlock_irqrestore(&rq->lock, flags);
9085 wake_up_process(rq->migration_thread); 9139 wake_up_process(rq->migration_thread);
9086 } 9140 }
9087 for_each_online_cpu(cpu) { 9141 for_each_online_cpu(cpu) {
9088 rcu_expedited_state = cpu; 9142 rcu_expedited_state = cpu;
9089 req = &per_cpu(rcu_migration_req, cpu); 9143 req = &per_cpu(rcu_migration_req, cpu);
9090 rq = cpu_rq(cpu); 9144 rq = cpu_rq(cpu);
9091 wait_for_completion(&req->done); 9145 wait_for_completion(&req->done);
9092 raw_spin_lock_irqsave(&rq->lock, flags); 9146 raw_spin_lock_irqsave(&rq->lock, flags);
9093 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) 9147 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
9094 need_full_sync = 1; 9148 need_full_sync = 1;
9095 req->dest_cpu = RCU_MIGRATION_IDLE; 9149 req->dest_cpu = RCU_MIGRATION_IDLE;
9096 raw_spin_unlock_irqrestore(&rq->lock, flags); 9150 raw_spin_unlock_irqrestore(&rq->lock, flags);
9097 } 9151 }
9098 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 9152 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9099 synchronize_sched_expedited_count++; 9153 synchronize_sched_expedited_count++;
9100 mutex_unlock(&rcu_sched_expedited_mutex); 9154 mutex_unlock(&rcu_sched_expedited_mutex);
9101 put_online_cpus(); 9155 put_online_cpus();
9102 if (need_full_sync) 9156 if (need_full_sync)
9103 synchronize_sched(); 9157 synchronize_sched();
9104 } 9158 }
9105 EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 9159 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
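The comment above synchronize_sched_expedited() carries the two rules that matter to callers: the primitive burns time on every CPU, so it is only for slow paths, and it must not be invoked while holding any lock that a CPU-hotplug notifier also takes, since the function itself calls get_online_cpus(). As a minimal, hypothetical illustration (my_entry, my_list and my_list_lock are made-up names, not part of this patch), a caller that wants rcu-sched readers gone quickly before freeing a list entry might look like this:

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct my_entry {				/* illustrative structure */
	struct list_head list;
	int data;
};

static LIST_HEAD(my_list);
static DEFINE_SPINLOCK(my_list_lock);		/* plain lock, never taken by a hotplug notifier */

/* Unlink an entry and free it once all rcu-sched readers are done. */
static void remove_and_free_entry(struct my_entry *e)
{
	spin_lock(&my_list_lock);
	list_del_rcu(&e->list);
	spin_unlock(&my_list_lock);

	/*
	 * Expedited grace period: expensive on all CPUs, and illegal here
	 * if my_list_lock were also acquired by a CPU-hotplug notifier.
	 */
	synchronize_sched_expedited();
	kfree(e);
}

Callers that cannot rule out such a lock dependency, or that sit on common-case paths, should stick with plain synchronize_sched().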
kernel/sched_idletask.c
1 /* 1 /*
2 * idle-task scheduling class. 2 * idle-task scheduling class.
3 * 3 *
4 * (NOTE: these are not related to SCHED_IDLE tasks which are 4 * (NOTE: these are not related to SCHED_IDLE tasks which are
5 * handled in sched_fair.c) 5 * handled in sched_fair.c)
6 */ 6 */
7 7
8 #ifdef CONFIG_SMP 8 #ifdef CONFIG_SMP
9 static int 9 static int
10 select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 10 select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
11 { 11 {
12 return task_cpu(p); /* IDLE tasks are never migrated */ 12 return task_cpu(p); /* IDLE tasks are never migrated */
13 } 13 }
14 #endif /* CONFIG_SMP */ 14 #endif /* CONFIG_SMP */
15 /* 15 /*
16 * Idle tasks are unconditionally rescheduled: 16 * Idle tasks are unconditionally rescheduled:
17 */ 17 */
18 static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) 18 static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
19 { 19 {
20 resched_task(rq->idle); 20 resched_task(rq->idle);
21 } 21 }
22 22
23 static struct task_struct *pick_next_task_idle(struct rq *rq) 23 static struct task_struct *pick_next_task_idle(struct rq *rq)
24 { 24 {
25 schedstat_inc(rq, sched_goidle); 25 schedstat_inc(rq, sched_goidle);
26 /* adjust the active tasks as we might go into a long sleep */ 26 calc_load_account_idle(rq);
27 calc_load_account_active(rq);
28 return rq->idle; 27 return rq->idle;
29 } 28 }
30 29
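The only change to this file is in pick_next_task_idle() above: instead of folding the runqueue's load delta straight into the global count via calc_load_account_active(), a CPU going idle now hands the delta to calc_load_account_idle(), which parks it in a separate accumulator until the next LOAD_FREQ sample. The sketch below illustrates that idea; calc_load_account_idle() is the helper actually called above, while the accumulator and the fold helper are paraphrased from the changelog and the kernel/sched.c part of this patch, so treat them as illustrative rather than verbatim:

/* Deltas from CPUs going idle, folded in at the next LOAD_FREQ sample. */
static atomic_long_t calc_load_tasks_idle;

static void calc_load_account_idle(struct rq *this_rq)
{
	long delta = calc_load_fold_active(this_rq);	/* this CPU's nr_active delta */

	if (delta)
		atomic_long_add(delta, &calc_load_tasks_idle);	/* park it; leave the global count alone */
}

static long calc_load_fold_idle(void)
{
	long delta = 0;

	/* A racy read is tolerable: at worst the fold lands one sample later. */
	if (atomic_long_read(&calc_load_tasks_idle))
		delta = atomic_long_xchg(&calc_load_tasks_idle, 0);

	return delta;
}

This is what restores strict LOAD_FREQ-period sampling: calc_load_tasks is no longer decremented the moment a CPU goes NO_HZ idle, only when the next sample is actually taken.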
31 /* 30 /*
32 * It is not legal to sleep in the idle task - print a warning 31 * It is not legal to sleep in the idle task - print a warning
33 * message if some code attempts to do it: 32 * message if some code attempts to do it:
34 */ 33 */
35 static void 34 static void
36 dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) 35 dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
37 { 36 {
38 raw_spin_unlock_irq(&rq->lock); 37 raw_spin_unlock_irq(&rq->lock);
39 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 38 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
40 dump_stack(); 39 dump_stack();
41 raw_spin_lock_irq(&rq->lock); 40 raw_spin_lock_irq(&rq->lock);
42 } 41 }
43 42
44 static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 43 static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
45 { 44 {
46 } 45 }
47 46
48 static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 47 static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
49 { 48 {
50 } 49 }
51 50
52 static void set_curr_task_idle(struct rq *rq) 51 static void set_curr_task_idle(struct rq *rq)
53 { 52 {
54 } 53 }
55 54
56 static void switched_to_idle(struct rq *rq, struct task_struct *p, 55 static void switched_to_idle(struct rq *rq, struct task_struct *p,
57 int running) 56 int running)
58 { 57 {
59 /* Can this actually happen?? */ 58 /* Can this actually happen?? */
60 if (running) 59 if (running)
61 resched_task(rq->curr); 60 resched_task(rq->curr);
62 else 61 else
63 check_preempt_curr(rq, p, 0); 62 check_preempt_curr(rq, p, 0);
64 } 63 }
65 64
66 static void prio_changed_idle(struct rq *rq, struct task_struct *p, 65 static void prio_changed_idle(struct rq *rq, struct task_struct *p,
67 int oldprio, int running) 66 int oldprio, int running)
68 { 67 {
69 /* This can happen for hotplug CPUs */ 68 /* This can happen for hotplug CPUs */
70 69
71 /* 70 /*
72 * Reschedule if we are currently running on this runqueue and 71 * Reschedule if we are currently running on this runqueue and
73 * our priority decreased, or if we are not currently running on 72 * our priority decreased, or if we are not currently running on
74 * this runqueue and our priority is higher than the current's 73 * this runqueue and our priority is higher than the current's
75 */ 74 */
76 if (running) { 75 if (running) {
77 if (p->prio > oldprio) 76 if (p->prio > oldprio)
78 resched_task(rq->curr); 77 resched_task(rq->curr);
79 } else 78 } else
80 check_preempt_curr(rq, p, 0); 79 check_preempt_curr(rq, p, 0);
81 } 80 }
82 81
83 static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 82 static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
84 { 83 {
85 return 0; 84 return 0;
86 } 85 }
87 86
88 /* 87 /*
89 * Simple, special scheduling class for the per-CPU idle tasks: 88 * Simple, special scheduling class for the per-CPU idle tasks:
90 */ 89 */
91 static const struct sched_class idle_sched_class = { 90 static const struct sched_class idle_sched_class = {
92 /* .next is NULL */ 91 /* .next is NULL */
93 /* no enqueue/yield_task for idle tasks */ 92 /* no enqueue/yield_task for idle tasks */
94 93
95 /* dequeue is not valid, we print a debug message there: */ 94 /* dequeue is not valid, we print a debug message there: */
96 .dequeue_task = dequeue_task_idle, 95 .dequeue_task = dequeue_task_idle,
97 96
98 .check_preempt_curr = check_preempt_curr_idle, 97 .check_preempt_curr = check_preempt_curr_idle,
99 98
100 .pick_next_task = pick_next_task_idle, 99 .pick_next_task = pick_next_task_idle,
101 .put_prev_task = put_prev_task_idle, 100 .put_prev_task = put_prev_task_idle,
102 101
103 #ifdef CONFIG_SMP 102 #ifdef CONFIG_SMP
104 .select_task_rq = select_task_rq_idle, 103 .select_task_rq = select_task_rq_idle,
105 #endif 104 #endif
106 105
107 .set_curr_task = set_curr_task_idle, 106 .set_curr_task = set_curr_task_idle,
108 .task_tick = task_tick_idle, 107 .task_tick = task_tick_idle,
109 108
110 .get_rr_interval = get_rr_interval_idle, 109 .get_rr_interval = get_rr_interval_idle,
111 110
112 .prio_changed = prio_changed_idle, 111 .prio_changed = prio_changed_idle,
113 .switched_to = switched_to_idle, 112 .switched_to = switched_to_idle,
114 113
115 /* no .task_new for idle tasks */ 114 /* no .task_new for idle tasks */
116 }; 115 };
117 116
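For orientation, idle_sched_class sits at the tail of the scheduler-class chain (its .next is NULL, as the first comment in the initializer notes), so the core pick loop in kernel/sched.c can rely on pick_next_task_idle() producing rq->idle once every other class has come up empty. A simplified sketch of that core loop, included here only for context and not part of this patch:

static inline struct task_struct *pick_next_task(struct rq *rq)
{
	const struct sched_class *class;
	struct task_struct *p;

	/* Walk the classes highest-priority first; the idle class is last. */
	for (class = sched_class_highest; class; class = class->next) {
		p = class->pick_next_task(rq);
		if (p)
			return p;
	}

	BUG();	/* never reached: the idle class always returns rq->idle */
}

Because that fallback runs exactly when a CPU is about to go idle, pick_next_task_idle() is a natural hook for the load-accounting change made by this patch.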