Commit 0121b0c771f929bb5298554b70843ab46280c298

Authored by Linus Torvalds

Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  sched: set_cpus_allowed_ptr(): Don't use rq->migration_thread after unlock
  sched: Fix proc_sched_set_task()
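
The first item addresses a use-after-unlock: once the runqueue lock is dropped, the CPU can go offline and rq->migration_thread may be freed, so that pointer must not be dereferenced after the unlock. The patch itself is not reproduced in this section; the fragment below is only a sketch of that pattern, assuming the surrounding set_cpus_allowed_ptr() context (the migration request "req" and the saved "flags" belong to that assumed context, not to this excerpt):

        /* Pin the migration thread while the rq lock is still held. */
        struct task_struct *mt = rq->migration_thread;

        get_task_struct(mt);            /* keep it alive across the unlock */
        task_rq_unlock(rq, &flags);     /* rq->migration_thread may go away from here on */
        wake_up_process(mt);            /* use only the pinned local pointer */
        put_task_struct(mt);
        wait_for_completion(&req.done);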

Showing 2 changed files

/*
 * kernel/sched.c
 *
 * Kernel scheduler and related syscalls
 *
 * Copyright (C) 1991-2002 Linus Torvalds
 *
 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
 *            make semaphores SMP safe
 * 1998-11-19 Implemented schedule_timeout() and related stuff
 *            by Andrea Arcangeli
 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
 *            hybrid priority-list and round-robin design with
 *            an array-switch method of distributing timeslices
 *            and per-CPU runqueues. Cleanups and useful suggestions
 *            by Davide Libenzi, preemptible kernel bits by Robert Love.
 * 2003-09-03 Interactivity tuning by Con Kolivas.
 * 2004-04-02 Scheduler domains code by Nick Piggin
 * 2007-04-15 Work begun on replacing all interactivity tuning with a
 *            fair scheduling design by Con Kolivas.
 * 2007-05-05 Load balancing (smp-nice) and other improvements
 *            by Peter Williams
 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
 *            Thomas Gleixner, Mike Kravetz
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>

#include <asm/tlb.h>
#include <asm/irq_regs.h>

#include "sched_cpupri.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))

/*
 * Helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

#define NICE_0_LOAD SCHED_LOAD_SCALE
#define NICE_0_SHIFT SCHED_LOAD_SHIFT

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
 * Timeslices get refilled after they expire.
 */
#define DEF_TIMESLICE (100 * HZ / 1000)

/*
 * single value that denotes runtime == period, ie unlimited time.
 */
#define RUNTIME_INF ((u64)~0ULL)

static inline int rt_policy(int policy)
{
        if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
                return 1;
        return 0;
}

static inline int task_has_rt_policy(struct task_struct *p)
{
        return rt_policy(p->policy);
}

/*
 * This is the priority-queue data structure of the RT scheduling class:
 */
struct rt_prio_array {
        DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
        struct list_head queue[MAX_RT_PRIO];
};

struct rt_bandwidth {
        /* nests inside the rq lock: */
        raw_spinlock_t rt_runtime_lock;
        ktime_t rt_period;
        u64 rt_runtime;
        struct hrtimer rt_period_timer;
};

static struct rt_bandwidth def_rt_bandwidth;

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
        struct rt_bandwidth *rt_b =
                container_of(timer, struct rt_bandwidth, rt_period_timer);
        ktime_t now;
        int overrun;
        int idle = 0;

        for (;;) {
                now = hrtimer_cb_get_time(timer);
                overrun = hrtimer_forward(timer, now, rt_b->rt_period);

                if (!overrun)
                        break;

                idle = do_sched_rt_period_timer(rt_b, overrun);
        }

        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

static
void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
        rt_b->rt_period = ns_to_ktime(period);
        rt_b->rt_runtime = runtime;

        raw_spin_lock_init(&rt_b->rt_runtime_lock);

        hrtimer_init(&rt_b->rt_period_timer,
                        CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        rt_b->rt_period_timer.function = sched_rt_period_timer;
}

static inline int rt_bandwidth_enabled(void)
{
        return sysctl_sched_rt_runtime >= 0;
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
        ktime_t now;

        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
                return;

        if (hrtimer_active(&rt_b->rt_period_timer))
                return;

        raw_spin_lock(&rt_b->rt_runtime_lock);
        for (;;) {
                unsigned long delta;
                ktime_t soft, hard;

                if (hrtimer_active(&rt_b->rt_period_timer))
                        break;

                now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
                hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);

                soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
                hard = hrtimer_get_expires(&rt_b->rt_period_timer);
                delta = ktime_to_ns(ktime_sub(hard, soft));
                __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
                                HRTIMER_MODE_ABS_PINNED, 0);
        }
        raw_spin_unlock(&rt_b->rt_runtime_lock);
}

#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
        hrtimer_cancel(&rt_b->rt_period_timer);
}
#endif

/*
 * sched_domains_mutex serializes calls to arch_init_sched_domains,
 * detach_destroy_domains and partition_sched_domains.
 */
static DEFINE_MUTEX(sched_domains_mutex);

#ifdef CONFIG_CGROUP_SCHED

#include <linux/cgroup.h>

struct cfs_rq;

static LIST_HEAD(task_groups);

/* task group related information */
struct task_group {
        struct cgroup_subsys_state css;

#ifdef CONFIG_FAIR_GROUP_SCHED
        /* schedulable entities of this group on each cpu */
        struct sched_entity **se;
        /* runqueue "owned" by this group on each cpu */
        struct cfs_rq **cfs_rq;
        unsigned long shares;
#endif

#ifdef CONFIG_RT_GROUP_SCHED
        struct sched_rt_entity **rt_se;
        struct rt_rq **rt_rq;

        struct rt_bandwidth rt_bandwidth;
#endif

        struct rcu_head rcu;
        struct list_head list;

        struct task_group *parent;
        struct list_head siblings;
        struct list_head children;
};

#define root_task_group init_task_group

/* task_group_lock serializes add/remove of task groups and also changes to
 * a task group's cpu shares.
 */
static DEFINE_SPINLOCK(task_group_lock);

#ifdef CONFIG_FAIR_GROUP_SCHED

#ifdef CONFIG_SMP
static int root_task_group_empty(void)
{
        return list_empty(&root_task_group.children);
}
#endif

# define INIT_TASK_GROUP_LOAD NICE_0_LOAD

/*
 * A weight of 0 or 1 can cause arithmetics problems.
 * A weight of a cfs_rq is the sum of weights of which entities
 * are queued on this cfs_rq, so a weight of a entity should not be
 * too large, so as the shares value of a task group.
 * (The default weight is 1024 - so there's no practical
 *  limitation from this.)
 */
#define MIN_SHARES 2
#define MAX_SHARES (1UL << 18)

static int init_task_group_load = INIT_TASK_GROUP_LOAD;
#endif

/* Default task group.
 * Every task in system belong to this group at bootup.
 */
struct task_group init_task_group;

/* return group to which a task belongs */
static inline struct task_group *task_group(struct task_struct *p)
{
        struct task_group *tg;

#ifdef CONFIG_CGROUP_SCHED
        tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
                        struct task_group, css);
#else
        tg = &init_task_group;
#endif
        return tg;
}

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
        p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
        p->se.parent = task_group(p)->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
        p->rt.rt_rq = task_group(p)->rt_rq[cpu];
        p->rt.parent = task_group(p)->rt_se[cpu];
#endif
}

#else

static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline struct task_group *task_group(struct task_struct *p)
{
        return NULL;
}

#endif /* CONFIG_CGROUP_SCHED */

/* CFS-related fields in a runqueue */
struct cfs_rq {
        struct load_weight load;
        unsigned long nr_running;

        u64 exec_clock;
        u64 min_vruntime;

        struct rb_root tasks_timeline;
        struct rb_node *rb_leftmost;

        struct list_head tasks;
        struct list_head *balance_iterator;

        /*
         * 'curr' points to currently running entity on this cfs_rq.
         * It is set to NULL otherwise (i.e when none are currently running).
         */
        struct sched_entity *curr, *next, *last;

        unsigned int nr_spread_over;

#ifdef CONFIG_FAIR_GROUP_SCHED
        struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */

        /*
         * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
         * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
         * (like users, containers etc.)
         *
         * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
         * list is used during load balance.
         */
        struct list_head leaf_cfs_rq_list;
        struct task_group *tg; /* group that "owns" this runqueue */

#ifdef CONFIG_SMP
        /*
         * the part of load.weight contributed by tasks
         */
        unsigned long task_weight;

        /*
         *   h_load = weight * f(tg)
         *
         * Where f(tg) is the recursive weight fraction assigned to
         * this group.
         */
        unsigned long h_load;

        /*
         * this cpu's part of tg->shares
         */
        unsigned long shares;

        /*
         * load.weight at the time we set shares
         */
        unsigned long rq_weight;
#endif
#endif
};

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
        struct rt_prio_array active;
        unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
        struct {
                int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
                int next; /* next highest */
#endif
        } highest_prio;
#endif
#ifdef CONFIG_SMP
        unsigned long rt_nr_migratory;
        unsigned long rt_nr_total;
        int overloaded;
        struct plist_head pushable_tasks;
#endif
        int rt_throttled;
        u64 rt_time;
        u64 rt_runtime;
        /* Nests inside the rq lock: */
        raw_spinlock_t rt_runtime_lock;

#ifdef CONFIG_RT_GROUP_SCHED
        unsigned long rt_nr_boosted;

        struct rq *rq;
        struct list_head leaf_rt_rq_list;
        struct task_group *tg;
#endif
};

#ifdef CONFIG_SMP

/*
 * We add the notion of a root-domain which will be used to define per-domain
 * variables. Each exclusive cpuset essentially defines an island domain by
 * fully partitioning the member cpus from any other cpuset. Whenever a new
 * exclusive cpuset is created, we also create and attach a new root-domain
 * object.
 *
 */
struct root_domain {
        atomic_t refcount;
        cpumask_var_t span;
        cpumask_var_t online;

        /*
         * The "RT overload" flag: it gets set if a CPU has more than
         * one runnable RT task.
         */
        cpumask_var_t rto_mask;
        atomic_t rto_count;
#ifdef CONFIG_SMP
        struct cpupri cpupri;
#endif
};

/*
 * By default the system creates a single root-domain with all cpus as
 * members (mimicking the global state we have today).
 */
static struct root_domain def_root_domain;

#endif

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct rq {
        /* runqueue lock: */
        raw_spinlock_t lock;

        /*
         * nr_running and cpu_load should be in the same cacheline because
         * remote CPUs use both these fields when doing load calculation.
         */
        unsigned long nr_running;
#define CPU_LOAD_IDX_MAX 5
        unsigned long cpu_load[CPU_LOAD_IDX_MAX];
#ifdef CONFIG_NO_HZ
        unsigned char in_nohz_recently;
#endif
        /* capture load from *all* tasks on this cpu: */
        struct load_weight load;
        unsigned long nr_load_updates;
        u64 nr_switches;

        struct cfs_rq cfs;
        struct rt_rq rt;

#ifdef CONFIG_FAIR_GROUP_SCHED
        /* list of leaf cfs_rq on this cpu: */
        struct list_head leaf_cfs_rq_list;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
        struct list_head leaf_rt_rq_list;
#endif

        /*
         * This is part of a global counter where only the total sum
         * over all CPUs matters. A task can increase this counter on
         * one CPU and if it got migrated afterwards it may decrease
         * it on another CPU. Always updated under the runqueue lock:
         */
        unsigned long nr_uninterruptible;

        struct task_struct *curr, *idle;
        unsigned long next_balance;
        struct mm_struct *prev_mm;

        u64 clock;

        atomic_t nr_iowait;

#ifdef CONFIG_SMP
        struct root_domain *rd;
        struct sched_domain *sd;

        unsigned char idle_at_tick;
        /* For active balancing */
        int post_schedule;
        int active_balance;
        int push_cpu;
        /* cpu of this runqueue: */
        int cpu;
        int online;

        unsigned long avg_load_per_task;

        struct task_struct *migration_thread;
        struct list_head migration_queue;

        u64 rt_avg;
        u64 age_stamp;
        u64 idle_stamp;
        u64 avg_idle;
#endif

        /* calc_load related fields */
        unsigned long calc_load_update;
        long calc_load_active;

#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
        int hrtick_csd_pending;
        struct call_single_data hrtick_csd;
#endif
        struct hrtimer hrtick_timer;
#endif

#ifdef CONFIG_SCHEDSTATS
        /* latency stats */
        struct sched_info rq_sched_info;
        unsigned long long rq_cpu_time;
        /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */

        /* sys_sched_yield() stats */
        unsigned int yld_count;

        /* schedule() stats */
        unsigned int sched_switch;
        unsigned int sched_count;
        unsigned int sched_goidle;

        /* try_to_wake_up() stats */
        unsigned int ttwu_count;
        unsigned int ttwu_local;

        /* BKL stats */
        unsigned int bkl_count;
#endif
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static inline
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
        rq->curr->sched_class->check_preempt_curr(rq, p, flags);
}

static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
        return rq->cpu;
#else
        return 0;
#endif
}

#define rcu_dereference_check_sched_domain(p) \
        rcu_dereference_check((p), \
                              rcu_read_lock_sched_held() || \
                              lockdep_is_held(&sched_domains_mutex))

/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
        for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() (&__get_cpu_var(runqueues))
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() (&__raw_get_cpu_var(runqueues))

inline void update_rq_clock(struct rq *rq)
{
        rq->clock = sched_clock_cpu(cpu_of(rq));
}

/*
 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
 */
#ifdef CONFIG_SCHED_DEBUG
# define const_debug __read_mostly
#else
# define const_debug static const
#endif

/**
 * runqueue_is_locked
 * @cpu: the processor in question.
 *
 * Returns true if the current cpu runqueue is locked.
 * This interface allows printk to be called with the runqueue lock
 * held and know whether or not it is OK to wake up the klogd.
 */
int runqueue_is_locked(int cpu)
{
        return raw_spin_is_locked(&cpu_rq(cpu)->lock);
}

/*
 * Debugging: various feature bits
 */

#define SCHED_FEAT(name, enabled) \
        __SCHED_FEAT_##name ,

enum {
#include "sched_features.h"
};

#undef SCHED_FEAT

#define SCHED_FEAT(name, enabled) \
        (1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "sched_features.h"
        0;

#undef SCHED_FEAT

#ifdef CONFIG_SCHED_DEBUG
#define SCHED_FEAT(name, enabled) \
        #name ,

static __read_mostly char *sched_feat_names[] = {
#include "sched_features.h"
        NULL
};

#undef SCHED_FEAT

static int sched_feat_show(struct seq_file *m, void *v)
{
        int i;

        for (i = 0; sched_feat_names[i]; i++) {
                if (!(sysctl_sched_features & (1UL << i)))
                        seq_puts(m, "NO_");
                seq_printf(m, "%s ", sched_feat_names[i]);
        }
        seq_puts(m, "\n");

        return 0;
}

static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
                size_t cnt, loff_t *ppos)
{
        char buf[64];
        char *cmp = buf;
        int neg = 0;
        int i;

        if (cnt > 63)
                cnt = 63;

        if (copy_from_user(&buf, ubuf, cnt))
                return -EFAULT;

        buf[cnt] = 0;

        if (strncmp(buf, "NO_", 3) == 0) {
                neg = 1;
                cmp += 3;
        }

        for (i = 0; sched_feat_names[i]; i++) {
                int len = strlen(sched_feat_names[i]);

                if (strncmp(cmp, sched_feat_names[i], len) == 0) {
                        if (neg)
                                sysctl_sched_features &= ~(1UL << i);
                        else
                                sysctl_sched_features |= (1UL << i);
                        break;
                }
        }

        if (!sched_feat_names[i])
                return -EINVAL;

        *ppos += cnt;

        return cnt;
}

static int sched_feat_open(struct inode *inode, struct file *filp)
{
        return single_open(filp, sched_feat_show, NULL);
}

static const struct file_operations sched_feat_fops = {
        .open = sched_feat_open,
        .write = sched_feat_write,
        .read = seq_read,
        .llseek = seq_lseek,
        .release = single_release,
};

static __init int sched_init_debug(void)
{
        debugfs_create_file("sched_features", 0644, NULL, NULL,
                        &sched_feat_fops);

        return 0;
}
late_initcall(sched_init_debug);

#endif

#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * ratelimit for updating the group shares.
 * default: 0.25ms
 */
unsigned int sysctl_sched_shares_ratelimit = 250000;
unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;

/*
 * Inject some fuzzyness into changing the per-cpu group shares
 * this avoids remote rq-locks at the expense of fairness.
 * default: 4
 */
unsigned int sysctl_sched_shares_thresh = 4;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

static __read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

static inline u64 global_rt_period(void)
{
        return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
}

static inline u64 global_rt_runtime(void)
{
        if (sysctl_sched_rt_runtime < 0)
                return RUNTIME_INF;

        return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}

#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev) do { } while (0)
#endif

static inline int task_current(struct rq *rq, struct task_struct *p)
{
        return rq->curr == p;
}

#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
        return task_current(rq, p);
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
        /* this is a valid case when another task releases the spinlock */
        rq->lock.owner = current;
#endif
        /*
         * If we are tracking spinlock dependencies then we have to
         * fix up the runqueue lock - which gets 'carried over' from
         * prev into current:
         */
        spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

        raw_spin_unlock_irq(&rq->lock);
}

#else /* __ARCH_WANT_UNLOCKED_CTXSW */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
        return p->oncpu;
#else
        return task_current(rq, p);
#endif
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
        /*
         * We can optimise this out completely for !SMP, because the
         * SMP rebalancing from interrupt is the only thing that cares
         * here.
         */
        next->oncpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
        raw_spin_unlock_irq(&rq->lock);
#else
        raw_spin_unlock(&rq->lock);
#endif
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
        /*
         * After ->oncpu is cleared, the task can be moved to a different CPU.
         * We must ensure this doesn't happen until the switch is completely
         * finished.
         */
        smp_wmb();
        prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
        local_irq_enable();
#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */

/*
 * Check whether the task is waking, we use this to synchronize against
 * ttwu() so that task_cpu() reports a stable number.
 *
 * We need to make an exception for PF_STARTING tasks because the fork
 * path might require task_rq_lock() to work, eg. it can call
 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
 */
static inline int task_is_waking(struct task_struct *p)
{
        return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
}

/*
 * __task_rq_lock - lock the runqueue a given task resides on.
 * Must be called interrupts disabled.
 */
static inline struct rq *__task_rq_lock(struct task_struct *p)
        __acquires(rq->lock)
{
        struct rq *rq;

        for (;;) {
                while (task_is_waking(p))
                        cpu_relax();
                rq = task_rq(p);
                raw_spin_lock(&rq->lock);
                if (likely(rq == task_rq(p) && !task_is_waking(p)))
                        return rq;
                raw_spin_unlock(&rq->lock);
        }
}

/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
        __acquires(rq->lock)
{
        struct rq *rq;

        for (;;) {
                while (task_is_waking(p))
                        cpu_relax();
                local_irq_save(*flags);
                rq = task_rq(p);
                raw_spin_lock(&rq->lock);
                if (likely(rq == task_rq(p) && !task_is_waking(p)))
                        return rq;
                raw_spin_unlock_irqrestore(&rq->lock, *flags);
        }
}

void task_rq_unlock_wait(struct task_struct *p)
{
        struct rq *rq = task_rq(p);

        smp_mb(); /* spin-unlock-wait is not a full memory barrier */
        raw_spin_unlock_wait(&rq->lock);
}

static void __task_rq_unlock(struct rq *rq)
        __releases(rq->lock)
{
        raw_spin_unlock(&rq->lock);
}

static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
        __releases(rq->lock)
{
        raw_spin_unlock_irqrestore(&rq->lock, *flags);
}

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static struct rq *this_rq_lock(void)
        __acquires(rq->lock)
{
        struct rq *rq;

        local_irq_disable();
        rq = this_rq();
        raw_spin_lock(&rq->lock);

        return rq;
}

#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 *
 * Its all a bit involved since we cannot program an hrt while holding the
 * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
 * reschedule event.
 *
 * When we get rescheduled we reprogram the hrtick_timer outside of the
 * rq->lock.
 */

/*
 * Use hrtick when:
 *  - enabled by features
 *  - hrtimer is actually high res
 */
static inline int hrtick_enabled(struct rq *rq)
{
        if (!sched_feat(HRTICK))
                return 0;
        if (!cpu_active(cpu_of(rq)))
                return 0;
        return hrtimer_is_hres_active(&rq->hrtick_timer);
}

static void hrtick_clear(struct rq *rq)
{
        if (hrtimer_active(&rq->hrtick_timer))
                hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
        struct rq *rq = container_of(timer, struct rq, hrtick_timer);

        WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

        raw_spin_lock(&rq->lock);
        update_rq_clock(rq);
        rq->curr->sched_class->task_tick(rq, rq->curr, 1);
        raw_spin_unlock(&rq->lock);

        return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP
/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
        struct rq *rq = arg;

        raw_spin_lock(&rq->lock);
        hrtimer_restart(&rq->hrtick_timer);
        rq->hrtick_csd_pending = 0;
        raw_spin_unlock(&rq->lock);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
static void hrtick_start(struct rq *rq, u64 delay)
{
        struct hrtimer *timer = &rq->hrtick_timer;
        ktime_t time = ktime_add_ns(timer->base->get_time(), delay);

        hrtimer_set_expires(timer, time);

        if (rq == this_rq()) {
                hrtimer_restart(timer);
        } else if (!rq->hrtick_csd_pending) {
                __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
                rq->hrtick_csd_pending = 1;
        }
}

static int
hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
        int cpu = (int)(long)hcpu;

        switch (action) {
        case CPU_UP_CANCELED:
        case CPU_UP_CANCELED_FROZEN:
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                hrtick_clear(cpu_rq(cpu));
                return NOTIFY_OK;
        }

        return NOTIFY_DONE;
}

static __init void init_hrtick(void)
{
        hotcpu_notifier(hotplug_hrtick, 0);
}
#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
static void hrtick_start(struct rq *rq, u64 delay)
{
        __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
                        HRTIMER_MODE_REL_PINNED, 0);
}

static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */

static void init_rq_hrtick(struct rq *rq)
{
#ifdef CONFIG_SMP
        rq->hrtick_csd_pending = 0;

        rq->hrtick_csd.flags = 0;
        rq->hrtick_csd.func = __hrtick_start;
        rq->hrtick_csd.info = rq;
#endif

        hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        rq->hrtick_timer.function = hrtick;
}
#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
1135 } 1135 }
1136 1136
1137 static inline void init_rq_hrtick(struct rq *rq) 1137 static inline void init_rq_hrtick(struct rq *rq)
1138 { 1138 {
1139 } 1139 }
1140 1140
1141 static inline void init_hrtick(void) 1141 static inline void init_hrtick(void)
1142 { 1142 {
1143 } 1143 }
1144 #endif /* CONFIG_SCHED_HRTICK */ 1144 #endif /* CONFIG_SCHED_HRTICK */
1145 1145
1146 /* 1146 /*
1147 * resched_task - mark a task 'to be rescheduled now'. 1147 * resched_task - mark a task 'to be rescheduled now'.
1148 * 1148 *
1149 * On UP this means the setting of the need_resched flag, on SMP it 1149 * On UP this means the setting of the need_resched flag, on SMP it
1150 * might also involve a cross-CPU call to trigger the scheduler on 1150 * might also involve a cross-CPU call to trigger the scheduler on
1151 * the target CPU. 1151 * the target CPU.
1152 */ 1152 */
1153 #ifdef CONFIG_SMP 1153 #ifdef CONFIG_SMP
1154 1154
1155 #ifndef tsk_is_polling 1155 #ifndef tsk_is_polling
1156 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1156 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1157 #endif 1157 #endif
1158 1158
1159 static void resched_task(struct task_struct *p) 1159 static void resched_task(struct task_struct *p)
1160 { 1160 {
1161 int cpu; 1161 int cpu;
1162 1162
1163 assert_raw_spin_locked(&task_rq(p)->lock); 1163 assert_raw_spin_locked(&task_rq(p)->lock);
1164 1164
1165 if (test_tsk_need_resched(p)) 1165 if (test_tsk_need_resched(p))
1166 return; 1166 return;
1167 1167
1168 set_tsk_need_resched(p); 1168 set_tsk_need_resched(p);
1169 1169
1170 cpu = task_cpu(p); 1170 cpu = task_cpu(p);
1171 if (cpu == smp_processor_id()) 1171 if (cpu == smp_processor_id())
1172 return; 1172 return;
1173 1173
1174 /* NEED_RESCHED must be visible before we test polling */ 1174 /* NEED_RESCHED must be visible before we test polling */
1175 smp_mb(); 1175 smp_mb();
1176 if (!tsk_is_polling(p)) 1176 if (!tsk_is_polling(p))
1177 smp_send_reschedule(cpu); 1177 smp_send_reschedule(cpu);
1178 } 1178 }
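The ordering in resched_task() matters: the NEED_RESCHED flag must be globally visible before the polling test, otherwise the IPI could be skipped while the remote CPU never notices the flag. A minimal user-space sketch of that "publish, fence, then test" pattern using C11 atomics (all names here are illustrative stand-ins, not kernel API):

/* Minimal user-space sketch of the "set the flag, full barrier, then test
 * the peer's polling state" ordering used by resched_task() above. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool need_resched;   /* stands in for TIF_NEED_RESCHED */
static atomic_bool peer_polling;   /* stands in for TIF_POLLING_NRFLAG */

static void kick_remote(void)
{
        printf("send reschedule IPI\n");
}

static void resched_remote(void)
{
        /* Publish the request first ... */
        atomic_store_explicit(&need_resched, true, memory_order_relaxed);

        /* ... then order the store before the following load; this is the
         * user-space counterpart of the smp_mb() in resched_task(). */
        atomic_thread_fence(memory_order_seq_cst);

        /* Only a CPU that is not polling the flag needs an explicit kick. */
        if (!atomic_load_explicit(&peer_polling, memory_order_relaxed))
                kick_remote();
}

int main(void)
{
        atomic_store(&peer_polling, false);
        resched_remote();
        return 0;
}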
1179 1179
1180 static void resched_cpu(int cpu) 1180 static void resched_cpu(int cpu)
1181 { 1181 {
1182 struct rq *rq = cpu_rq(cpu); 1182 struct rq *rq = cpu_rq(cpu);
1183 unsigned long flags; 1183 unsigned long flags;
1184 1184
1185 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 1185 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1186 return; 1186 return;
1187 resched_task(cpu_curr(cpu)); 1187 resched_task(cpu_curr(cpu));
1188 raw_spin_unlock_irqrestore(&rq->lock, flags); 1188 raw_spin_unlock_irqrestore(&rq->lock, flags);
1189 } 1189 }
1190 1190
1191 #ifdef CONFIG_NO_HZ 1191 #ifdef CONFIG_NO_HZ
1192 /* 1192 /*
1193 * When add_timer_on() enqueues a timer into the timer wheel of an 1193 * When add_timer_on() enqueues a timer into the timer wheel of an
1194 * idle CPU then this timer might expire before the next timer event 1194 * idle CPU then this timer might expire before the next timer event
1195 * which is scheduled to wake up that CPU. In case of a completely 1195 * which is scheduled to wake up that CPU. In case of a completely
1196 * idle system the next event might even be infinite time into the 1196 * idle system the next event might even be infinite time into the
1197 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 1197 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1198 * leaves the inner idle loop so the newly added timer is taken into 1198 * leaves the inner idle loop so the newly added timer is taken into
1199 * account when the CPU goes back to idle and evaluates the timer 1199 * account when the CPU goes back to idle and evaluates the timer
1200 * wheel for the next timer event. 1200 * wheel for the next timer event.
1201 */ 1201 */
1202 void wake_up_idle_cpu(int cpu) 1202 void wake_up_idle_cpu(int cpu)
1203 { 1203 {
1204 struct rq *rq = cpu_rq(cpu); 1204 struct rq *rq = cpu_rq(cpu);
1205 1205
1206 if (cpu == smp_processor_id()) 1206 if (cpu == smp_processor_id())
1207 return; 1207 return;
1208 1208
1209 /* 1209 /*
1210 * This is safe, as this function is called with the timer 1210 * This is safe, as this function is called with the timer
1211 * wheel base lock of (cpu) held. When the CPU is on the way 1211 * wheel base lock of (cpu) held. When the CPU is on the way
1212 * to idle and has not yet set rq->curr to idle then it will 1212 * to idle and has not yet set rq->curr to idle then it will
1213 * be serialized on the timer wheel base lock and take the new 1213 * be serialized on the timer wheel base lock and take the new
1214 * timer into account automatically. 1214 * timer into account automatically.
1215 */ 1215 */
1216 if (rq->curr != rq->idle) 1216 if (rq->curr != rq->idle)
1217 return; 1217 return;
1218 1218
1219 /* 1219 /*
1220 * We can set TIF_RESCHED on the idle task of the other CPU 1220 * We can set TIF_RESCHED on the idle task of the other CPU
1221 * locklessly. The worst case is that the other CPU runs the 1221 * locklessly. The worst case is that the other CPU runs the
1222 * idle task through an additional NOOP schedule() 1222 * idle task through an additional NOOP schedule()
1223 */ 1223 */
1224 set_tsk_need_resched(rq->idle); 1224 set_tsk_need_resched(rq->idle);
1225 1225
1226 /* NEED_RESCHED must be visible before we test polling */ 1226 /* NEED_RESCHED must be visible before we test polling */
1227 smp_mb(); 1227 smp_mb();
1228 if (!tsk_is_polling(rq->idle)) 1228 if (!tsk_is_polling(rq->idle))
1229 smp_send_reschedule(cpu); 1229 smp_send_reschedule(cpu);
1230 } 1230 }
1231 #endif /* CONFIG_NO_HZ */ 1231 #endif /* CONFIG_NO_HZ */
1232 1232
1233 static u64 sched_avg_period(void) 1233 static u64 sched_avg_period(void)
1234 { 1234 {
1235 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 1235 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1236 } 1236 }
1237 1237
1238 static void sched_avg_update(struct rq *rq) 1238 static void sched_avg_update(struct rq *rq)
1239 { 1239 {
1240 s64 period = sched_avg_period(); 1240 s64 period = sched_avg_period();
1241 1241
1242 while ((s64)(rq->clock - rq->age_stamp) > period) { 1242 while ((s64)(rq->clock - rq->age_stamp) > period) {
1243 rq->age_stamp += period; 1243 rq->age_stamp += period;
1244 rq->rt_avg /= 2; 1244 rq->rt_avg /= 2;
1245 } 1245 }
1246 } 1246 }
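sched_avg_update() ages rq->rt_avg geometrically: for every elapsed half-period of sysctl_sched_time_avg the accumulated RT time is halved. A standalone sketch of that decay loop, with invented sample values:

/* Stand-alone sketch of the rt_avg aging loop above; the constants and the
 * sample values are invented for illustration. */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

int main(void)
{
        uint64_t sysctl_sched_time_avg = 1000;          /* ms, assumed default */
        uint64_t period = sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
        uint64_t clock = 3 * period + 1234;             /* "now" */
        uint64_t age_stamp = 0;
        uint64_t rt_avg = 800000;                       /* accumulated RT ns */

        /* Same loop shape as sched_avg_update(): halve once per period. */
        while ((int64_t)(clock - age_stamp) > (int64_t)period) {
                age_stamp += period;
                rt_avg /= 2;
        }
        printf("aged rt_avg = %llu\n", (unsigned long long)rt_avg);
        return 0;
}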
1247 1247
1248 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1248 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1249 { 1249 {
1250 rq->rt_avg += rt_delta; 1250 rq->rt_avg += rt_delta;
1251 sched_avg_update(rq); 1251 sched_avg_update(rq);
1252 } 1252 }
1253 1253
1254 #else /* !CONFIG_SMP */ 1254 #else /* !CONFIG_SMP */
1255 static void resched_task(struct task_struct *p) 1255 static void resched_task(struct task_struct *p)
1256 { 1256 {
1257 assert_raw_spin_locked(&task_rq(p)->lock); 1257 assert_raw_spin_locked(&task_rq(p)->lock);
1258 set_tsk_need_resched(p); 1258 set_tsk_need_resched(p);
1259 } 1259 }
1260 1260
1261 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1261 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1262 { 1262 {
1263 } 1263 }
1264 #endif /* CONFIG_SMP */ 1264 #endif /* CONFIG_SMP */
1265 1265
1266 #if BITS_PER_LONG == 32 1266 #if BITS_PER_LONG == 32
1267 # define WMULT_CONST (~0UL) 1267 # define WMULT_CONST (~0UL)
1268 #else 1268 #else
1269 # define WMULT_CONST (1UL << 32) 1269 # define WMULT_CONST (1UL << 32)
1270 #endif 1270 #endif
1271 1271
1272 #define WMULT_SHIFT 32 1272 #define WMULT_SHIFT 32
1273 1273
1274 /* 1274 /*
1275 * Shift right and round: 1275 * Shift right and round:
1276 */ 1276 */
1277 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1277 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1278 1278
1279 /* 1279 /*
1280 * delta *= weight / lw 1280 * delta *= weight / lw
1281 */ 1281 */
1282 static unsigned long 1282 static unsigned long
1283 calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1283 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1284 struct load_weight *lw) 1284 struct load_weight *lw)
1285 { 1285 {
1286 u64 tmp; 1286 u64 tmp;
1287 1287
1288 if (!lw->inv_weight) { 1288 if (!lw->inv_weight) {
1289 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1289 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1290 lw->inv_weight = 1; 1290 lw->inv_weight = 1;
1291 else 1291 else
1292 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1292 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1293 / (lw->weight+1); 1293 / (lw->weight+1);
1294 } 1294 }
1295 1295
1296 tmp = (u64)delta_exec * weight; 1296 tmp = (u64)delta_exec * weight;
1297 /* 1297 /*
1298 * Check whether we'd overflow the 64-bit multiplication: 1298 * Check whether we'd overflow the 64-bit multiplication:
1299 */ 1299 */
1300 if (unlikely(tmp > WMULT_CONST)) 1300 if (unlikely(tmp > WMULT_CONST))
1301 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 1301 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1302 WMULT_SHIFT/2); 1302 WMULT_SHIFT/2);
1303 else 1303 else
1304 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); 1304 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1305 1305
1306 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1306 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1307 } 1307 }
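calc_delta_mine() computes delta_exec * weight / lw->weight in fixed point: instead of dividing by lw->weight it multiplies by the precomputed 2^32/weight inverse and shifts back with the rounding SRR macro. A hedged user-space replica with small numbers, say a nice-0 and a nice-5 task sharing one runqueue (weights taken from the tables further down):

/* User-space replica of the fixed-point scaling above; the kernel version
 * additionally guards the multiply against 64-bit overflow, which the small
 * numbers here never reach. */
#include <stdint.h>
#include <stdio.h>

#define WMULT_SHIFT 32
#define SRR(x, y) (((x) + (1ULL << ((y) - 1))) >> (y))

static uint64_t scale_delta(uint64_t delta_exec, uint64_t weight,
                            uint64_t lw_inv_weight)
{
        uint64_t tmp = delta_exec * weight;

        return SRR(tmp * lw_inv_weight, WMULT_SHIFT);
}

int main(void)
{
        /* nice 0: weight 1024, nice 5: weight 335 => runqueue weight 1359 */
        uint64_t lw_weight = 1024 + 335;
        uint64_t lw_inv = 1 + ((1ULL << 32) - lw_weight / 2) / (lw_weight + 1);
        uint64_t slice = 10000000;      /* 10ms of wall time, in ns */

        printf("nice 0 share: %llu ns\n",
               (unsigned long long)scale_delta(slice, 1024, lw_weight ? lw_inv : 0));
        printf("nice 5 share: %llu ns\n",
               (unsigned long long)scale_delta(slice, 335, lw_inv));
        return 0;
}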
1308 1308
1309 static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1309 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1310 { 1310 {
1311 lw->weight += inc; 1311 lw->weight += inc;
1312 lw->inv_weight = 0; 1312 lw->inv_weight = 0;
1313 } 1313 }
1314 1314
1315 static inline void update_load_sub(struct load_weight *lw, unsigned long dec) 1315 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1316 { 1316 {
1317 lw->weight -= dec; 1317 lw->weight -= dec;
1318 lw->inv_weight = 0; 1318 lw->inv_weight = 0;
1319 } 1319 }
1320 1320
1321 /* 1321 /*
1322 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1322 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1323 * of tasks with abnormal "nice" values across CPUs, the contribution that 1323 * of tasks with abnormal "nice" values across CPUs, the contribution that
1324 * each task makes to its run queue's load is weighted according to its 1324 * each task makes to its run queue's load is weighted according to its
1325 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a 1325 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1326 * scaled version of the new time slice allocation that they receive on time 1326 * scaled version of the new time slice allocation that they receive on time
1327 * slice expiry etc. 1327 * slice expiry etc.
1328 */ 1328 */
1329 1329
1330 #define WEIGHT_IDLEPRIO 3 1330 #define WEIGHT_IDLEPRIO 3
1331 #define WMULT_IDLEPRIO 1431655765 1331 #define WMULT_IDLEPRIO 1431655765
1332 1332
1333 /* 1333 /*
1334 * Nice levels are multiplicative, with a gentle 10% change for every 1334 * Nice levels are multiplicative, with a gentle 10% change for every
1335 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to 1335 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1336 * nice 1, it will get ~10% less CPU time than another CPU-bound task 1336 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1337 * that remained on nice 0. 1337 * that remained on nice 0.
1338 * 1338 *
1339 * The "10% effect" is relative and cumulative: from _any_ nice level, 1339 * The "10% effect" is relative and cumulative: from _any_ nice level,
1340 * if you go up 1 level, it's -10% CPU usage; if you go down 1 level, 1340 * if you go up 1 level, it's -10% CPU usage; if you go down 1 level,
1341 * it's +10% CPU usage. (To achieve that we use a multiplier of 1.25. 1341 * it's +10% CPU usage. (To achieve that we use a multiplier of 1.25.
1342 * If a task goes up by ~10% and another task goes down by ~10% then 1342 * If a task goes up by ~10% and another task goes down by ~10% then
1343 * the relative distance between them is ~25%.) 1343 * the relative distance between them is ~25%.)
1344 */ 1344 */
1345 static const int prio_to_weight[40] = { 1345 static const int prio_to_weight[40] = {
1346 /* -20 */ 88761, 71755, 56483, 46273, 36291, 1346 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1347 /* -15 */ 29154, 23254, 18705, 14949, 11916, 1347 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1348 /* -10 */ 9548, 7620, 6100, 4904, 3906, 1348 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1349 /* -5 */ 3121, 2501, 1991, 1586, 1277, 1349 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1350 /* 0 */ 1024, 820, 655, 526, 423, 1350 /* 0 */ 1024, 820, 655, 526, 423,
1351 /* 5 */ 335, 272, 215, 172, 137, 1351 /* 5 */ 335, 272, 215, 172, 137,
1352 /* 10 */ 110, 87, 70, 56, 45, 1352 /* 10 */ 110, 87, 70, 56, 45,
1353 /* 15 */ 36, 29, 23, 18, 15, 1353 /* 15 */ 36, 29, 23, 18, 15,
1354 }; 1354 };
1355 1355
1356 /* 1356 /*
1357 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 1357 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1358 * 1358 *
1359 * In cases where the weight does not change often, we can use the 1359 * In cases where the weight does not change often, we can use the
1360 * precalculated inverse to speed up arithmetic by turning divisions 1360 * precalculated inverse to speed up arithmetic by turning divisions
1361 * into multiplications: 1361 * into multiplications:
1362 */ 1362 */
1363 static const u32 prio_to_wmult[40] = { 1363 static const u32 prio_to_wmult[40] = {
1364 /* -20 */ 48388, 59856, 76040, 92818, 118348, 1364 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1365 /* -15 */ 147320, 184698, 229616, 287308, 360437, 1365 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1366 /* -10 */ 449829, 563644, 704093, 875809, 1099582, 1366 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1367 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, 1367 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1368 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, 1368 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1369 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, 1369 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1370 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 1370 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1371 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1371 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1372 }; 1372 };
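The two tables are tied together: each step down in nice level multiplies the weight by roughly 1.25, and each wmult entry is approximately 2^32 divided by the corresponding weight. A quick self-check, with a few entries copied from the tables above:

/* Quick self-check of the tables above: neighbouring weights differ by
 * roughly 1.25x and wmult ~= 2^32 / weight. Entries are nice -1..3. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const int weight[] = { 1277, 1024, 820, 655, 526 };
        const uint32_t wmult[] = { 3363326, 4194304, 5237765,
                                   6557202, 8165337 };

        for (int i = 1; i < 5; i++)
                printf("weight ratio %d: %.3f\n", i,
                       (double)weight[i - 1] / weight[i]);

        for (int i = 0; i < 5; i++)
                printf("2^32 / weight = %.0f (table says %u)\n",
                       4294967296.0 / weight[i], wmult[i]);
        return 0;
}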
1373 1373
1374 /* Time spent by the tasks of the cpu accounting group executing in ... */ 1374 /* Time spent by the tasks of the cpu accounting group executing in ... */
1375 enum cpuacct_stat_index { 1375 enum cpuacct_stat_index {
1376 CPUACCT_STAT_USER, /* ... user mode */ 1376 CPUACCT_STAT_USER, /* ... user mode */
1377 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 1377 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1378 1378
1379 CPUACCT_STAT_NSTATS, 1379 CPUACCT_STAT_NSTATS,
1380 }; 1380 };
1381 1381
1382 #ifdef CONFIG_CGROUP_CPUACCT 1382 #ifdef CONFIG_CGROUP_CPUACCT
1383 static void cpuacct_charge(struct task_struct *tsk, u64 cputime); 1383 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1384 static void cpuacct_update_stats(struct task_struct *tsk, 1384 static void cpuacct_update_stats(struct task_struct *tsk,
1385 enum cpuacct_stat_index idx, cputime_t val); 1385 enum cpuacct_stat_index idx, cputime_t val);
1386 #else 1386 #else
1387 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1387 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1388 static inline void cpuacct_update_stats(struct task_struct *tsk, 1388 static inline void cpuacct_update_stats(struct task_struct *tsk,
1389 enum cpuacct_stat_index idx, cputime_t val) {} 1389 enum cpuacct_stat_index idx, cputime_t val) {}
1390 #endif 1390 #endif
1391 1391
1392 static inline void inc_cpu_load(struct rq *rq, unsigned long load) 1392 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1393 { 1393 {
1394 update_load_add(&rq->load, load); 1394 update_load_add(&rq->load, load);
1395 } 1395 }
1396 1396
1397 static inline void dec_cpu_load(struct rq *rq, unsigned long load) 1397 static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1398 { 1398 {
1399 update_load_sub(&rq->load, load); 1399 update_load_sub(&rq->load, load);
1400 } 1400 }
1401 1401
1402 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) 1402 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1403 typedef int (*tg_visitor)(struct task_group *, void *); 1403 typedef int (*tg_visitor)(struct task_group *, void *);
1404 1404
1405 /* 1405 /*
1406 * Iterate the full tree, calling @down when first entering a node and @up when 1406 * Iterate the full tree, calling @down when first entering a node and @up when
1407 * leaving it for the final time. 1407 * leaving it for the final time.
1408 */ 1408 */
1409 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 1409 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1410 { 1410 {
1411 struct task_group *parent, *child; 1411 struct task_group *parent, *child;
1412 int ret; 1412 int ret;
1413 1413
1414 rcu_read_lock(); 1414 rcu_read_lock();
1415 parent = &root_task_group; 1415 parent = &root_task_group;
1416 down: 1416 down:
1417 ret = (*down)(parent, data); 1417 ret = (*down)(parent, data);
1418 if (ret) 1418 if (ret)
1419 goto out_unlock; 1419 goto out_unlock;
1420 list_for_each_entry_rcu(child, &parent->children, siblings) { 1420 list_for_each_entry_rcu(child, &parent->children, siblings) {
1421 parent = child; 1421 parent = child;
1422 goto down; 1422 goto down;
1423 1423
1424 up: 1424 up:
1425 continue; 1425 continue;
1426 } 1426 }
1427 ret = (*up)(parent, data); 1427 ret = (*up)(parent, data);
1428 if (ret) 1428 if (ret)
1429 goto out_unlock; 1429 goto out_unlock;
1430 1430
1431 child = parent; 1431 child = parent;
1432 parent = parent->parent; 1432 parent = parent->parent;
1433 if (parent) 1433 if (parent)
1434 goto up; 1434 goto up;
1435 out_unlock: 1435 out_unlock:
1436 rcu_read_unlock(); 1436 rcu_read_unlock();
1437 1437
1438 return ret; 1438 return ret;
1439 } 1439 }
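walk_tg_tree() is an iterative depth-first traversal written with gotos so it needs no recursion stack: @down runs when a group is first entered, @up runs once all of its children have been visited, and a non-zero return from either aborts the walk. A recursive equivalent over a toy tree (purely illustrative, not kernel types) may make the ordering easier to see:

/* Recursive equivalent of the goto-based traversal above, over a toy tree. */
#include <stdio.h>

struct node {
        const char *name;
        struct node *child[4];          /* NULL-terminated list of children */
};

typedef int (*visitor)(struct node *n, void *data);

static int walk(struct node *n, visitor down, visitor up, void *data)
{
        int ret = down(n, data);        /* first entry: like (*down)(parent) */

        if (ret)
                return ret;
        for (int i = 0; n->child[i]; i++) {
                ret = walk(n->child[i], down, up, data);
                if (ret)
                        return ret;
        }
        return up(n, data);             /* final exit: like (*up)(parent) */
}

static int print_down(struct node *n, void *d) { printf("down %s\n", n->name); return 0; }
static int print_up(struct node *n, void *d)   { printf("up   %s\n", n->name); return 0; }

int main(void)
{
        struct node leaf1 = { "A/1", { 0 } }, leaf2 = { "A/2", { 0 } };
        struct node a = { "A", { &leaf1, &leaf2, 0 } };
        struct node root = { "root", { &a, 0 } };

        return walk(&root, print_down, print_up, NULL);
}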
1440 1440
1441 static int tg_nop(struct task_group *tg, void *data) 1441 static int tg_nop(struct task_group *tg, void *data)
1442 { 1442 {
1443 return 0; 1443 return 0;
1444 } 1444 }
1445 #endif 1445 #endif
1446 1446
1447 #ifdef CONFIG_SMP 1447 #ifdef CONFIG_SMP
1448 /* Used instead of source_load when we know the type == 0 */ 1448 /* Used instead of source_load when we know the type == 0 */
1449 static unsigned long weighted_cpuload(const int cpu) 1449 static unsigned long weighted_cpuload(const int cpu)
1450 { 1450 {
1451 return cpu_rq(cpu)->load.weight; 1451 return cpu_rq(cpu)->load.weight;
1452 } 1452 }
1453 1453
1454 /* 1454 /*
1455 * Return a low guess at the load of a migration-source cpu weighted 1455 * Return a low guess at the load of a migration-source cpu weighted
1456 * according to the scheduling class and "nice" value. 1456 * according to the scheduling class and "nice" value.
1457 * 1457 *
1458 * We want to under-estimate the load of migration sources, to 1458 * We want to under-estimate the load of migration sources, to
1459 * balance conservatively. 1459 * balance conservatively.
1460 */ 1460 */
1461 static unsigned long source_load(int cpu, int type) 1461 static unsigned long source_load(int cpu, int type)
1462 { 1462 {
1463 struct rq *rq = cpu_rq(cpu); 1463 struct rq *rq = cpu_rq(cpu);
1464 unsigned long total = weighted_cpuload(cpu); 1464 unsigned long total = weighted_cpuload(cpu);
1465 1465
1466 if (type == 0 || !sched_feat(LB_BIAS)) 1466 if (type == 0 || !sched_feat(LB_BIAS))
1467 return total; 1467 return total;
1468 1468
1469 return min(rq->cpu_load[type-1], total); 1469 return min(rq->cpu_load[type-1], total);
1470 } 1470 }
1471 1471
1472 /* 1472 /*
1473 * Return a high guess at the load of a migration-target cpu weighted 1473 * Return a high guess at the load of a migration-target cpu weighted
1474 * according to the scheduling class and "nice" value. 1474 * according to the scheduling class and "nice" value.
1475 */ 1475 */
1476 static unsigned long target_load(int cpu, int type) 1476 static unsigned long target_load(int cpu, int type)
1477 { 1477 {
1478 struct rq *rq = cpu_rq(cpu); 1478 struct rq *rq = cpu_rq(cpu);
1479 unsigned long total = weighted_cpuload(cpu); 1479 unsigned long total = weighted_cpuload(cpu);
1480 1480
1481 if (type == 0 || !sched_feat(LB_BIAS)) 1481 if (type == 0 || !sched_feat(LB_BIAS))
1482 return total; 1482 return total;
1483 1483
1484 return max(rq->cpu_load[type-1], total); 1484 return max(rq->cpu_load[type-1], total);
1485 } 1485 }
1486 1486
1487 static struct sched_group *group_of(int cpu) 1487 static struct sched_group *group_of(int cpu)
1488 { 1488 {
1489 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd); 1489 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1490 1490
1491 if (!sd) 1491 if (!sd)
1492 return NULL; 1492 return NULL;
1493 1493
1494 return sd->groups; 1494 return sd->groups;
1495 } 1495 }
1496 1496
1497 static unsigned long power_of(int cpu) 1497 static unsigned long power_of(int cpu)
1498 { 1498 {
1499 struct sched_group *group = group_of(cpu); 1499 struct sched_group *group = group_of(cpu);
1500 1500
1501 if (!group) 1501 if (!group)
1502 return SCHED_LOAD_SCALE; 1502 return SCHED_LOAD_SCALE;
1503 1503
1504 return group->cpu_power; 1504 return group->cpu_power;
1505 } 1505 }
1506 1506
1507 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1507 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1508 1508
1509 static unsigned long cpu_avg_load_per_task(int cpu) 1509 static unsigned long cpu_avg_load_per_task(int cpu)
1510 { 1510 {
1511 struct rq *rq = cpu_rq(cpu); 1511 struct rq *rq = cpu_rq(cpu);
1512 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 1512 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1513 1513
1514 if (nr_running) 1514 if (nr_running)
1515 rq->avg_load_per_task = rq->load.weight / nr_running; 1515 rq->avg_load_per_task = rq->load.weight / nr_running;
1516 else 1516 else
1517 rq->avg_load_per_task = 0; 1517 rq->avg_load_per_task = 0;
1518 1518
1519 return rq->avg_load_per_task; 1519 return rq->avg_load_per_task;
1520 } 1520 }
1521 1521
1522 #ifdef CONFIG_FAIR_GROUP_SCHED 1522 #ifdef CONFIG_FAIR_GROUP_SCHED
1523 1523
1524 static __read_mostly unsigned long __percpu *update_shares_data; 1524 static __read_mostly unsigned long __percpu *update_shares_data;
1525 1525
1526 static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1526 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1527 1527
1528 /* 1528 /*
1529 * Calculate and set the cpu's group shares. 1529 * Calculate and set the cpu's group shares.
1530 */ 1530 */
1531 static void update_group_shares_cpu(struct task_group *tg, int cpu, 1531 static void update_group_shares_cpu(struct task_group *tg, int cpu,
1532 unsigned long sd_shares, 1532 unsigned long sd_shares,
1533 unsigned long sd_rq_weight, 1533 unsigned long sd_rq_weight,
1534 unsigned long *usd_rq_weight) 1534 unsigned long *usd_rq_weight)
1535 { 1535 {
1536 unsigned long shares, rq_weight; 1536 unsigned long shares, rq_weight;
1537 int boost = 0; 1537 int boost = 0;
1538 1538
1539 rq_weight = usd_rq_weight[cpu]; 1539 rq_weight = usd_rq_weight[cpu];
1540 if (!rq_weight) { 1540 if (!rq_weight) {
1541 boost = 1; 1541 boost = 1;
1542 rq_weight = NICE_0_LOAD; 1542 rq_weight = NICE_0_LOAD;
1543 } 1543 }
1544 1544
1545 /* 1545 /*
1546 * \Sum_j shares_j * rq_weight_i 1546 * \Sum_j shares_j * rq_weight_i
1547 * shares_i = ----------------------------- 1547 * shares_i = -----------------------------
1548 * \Sum_j rq_weight_j 1548 * \Sum_j rq_weight_j
1549 */ 1549 */
1550 shares = (sd_shares * rq_weight) / sd_rq_weight; 1550 shares = (sd_shares * rq_weight) / sd_rq_weight;
1551 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1551 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1552 1552
1553 if (abs(shares - tg->se[cpu]->load.weight) > 1553 if (abs(shares - tg->se[cpu]->load.weight) >
1554 sysctl_sched_shares_thresh) { 1554 sysctl_sched_shares_thresh) {
1555 struct rq *rq = cpu_rq(cpu); 1555 struct rq *rq = cpu_rq(cpu);
1556 unsigned long flags; 1556 unsigned long flags;
1557 1557
1558 raw_spin_lock_irqsave(&rq->lock, flags); 1558 raw_spin_lock_irqsave(&rq->lock, flags);
1559 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; 1559 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1560 tg->cfs_rq[cpu]->shares = boost ? 0 : shares; 1560 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1561 __set_se_shares(tg->se[cpu], shares); 1561 __set_se_shares(tg->se[cpu], shares);
1562 raw_spin_unlock_irqrestore(&rq->lock, flags); 1562 raw_spin_unlock_irqrestore(&rq->lock, flags);
1563 } 1563 }
1564 } 1564 }
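The comment above spells out the formula: a cpu's slice of the group is the group's total shares scaled by that cpu's fraction of the group's runqueue weight, then clamped. A worked example with invented numbers (the clamp bounds below only stand in for the kernel's MIN_SHARES/MAX_SHARES):

/* Worked example of the shares_i formula in the comment above. */
#include <stdio.h>

#define MIN_SHARES 2
#define MAX_SHARES (1UL << 18)

static unsigned long clampul(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        unsigned long sd_shares = 1024;                 /* tg->shares */
        unsigned long rq_weight[2] = { 3072, 1024 };    /* per-cpu cfs weight */
        unsigned long sd_rq_weight = rq_weight[0] + rq_weight[1];

        for (int i = 0; i < 2; i++) {
                unsigned long shares = sd_shares * rq_weight[i] / sd_rq_weight;

                shares = clampul(shares, MIN_SHARES, MAX_SHARES);
                printf("cpu%d gets %lu of %lu shares\n", i, shares, sd_shares);
        }
        return 0;
}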
1565 1565
1566 /* 1566 /*
1567 * Re-compute the task group's per-cpu shares over the given domain. 1567 * Re-compute the task group's per-cpu shares over the given domain.
1568 * This needs to be done in a bottom-up fashion because the rq weight of a 1568 * This needs to be done in a bottom-up fashion because the rq weight of a
1569 * parent group depends on the shares of its child groups. 1569 * parent group depends on the shares of its child groups.
1570 */ 1570 */
1571 static int tg_shares_up(struct task_group *tg, void *data) 1571 static int tg_shares_up(struct task_group *tg, void *data)
1572 { 1572 {
1573 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; 1573 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1574 unsigned long *usd_rq_weight; 1574 unsigned long *usd_rq_weight;
1575 struct sched_domain *sd = data; 1575 struct sched_domain *sd = data;
1576 unsigned long flags; 1576 unsigned long flags;
1577 int i; 1577 int i;
1578 1578
1579 if (!tg->se[0]) 1579 if (!tg->se[0])
1580 return 0; 1580 return 0;
1581 1581
1582 local_irq_save(flags); 1582 local_irq_save(flags);
1583 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); 1583 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1584 1584
1585 for_each_cpu(i, sched_domain_span(sd)) { 1585 for_each_cpu(i, sched_domain_span(sd)) {
1586 weight = tg->cfs_rq[i]->load.weight; 1586 weight = tg->cfs_rq[i]->load.weight;
1587 usd_rq_weight[i] = weight; 1587 usd_rq_weight[i] = weight;
1588 1588
1589 rq_weight += weight; 1589 rq_weight += weight;
1590 /* 1590 /*
1591 * If there are currently no tasks on the cpu, pretend there 1591 * If there are currently no tasks on the cpu, pretend there
1592 * is one of average load so that when a new task gets to 1592 * is one of average load so that when a new task gets to
1593 * run here it will not get delayed by group starvation. 1593 * run here it will not get delayed by group starvation.
1594 */ 1594 */
1595 if (!weight) 1595 if (!weight)
1596 weight = NICE_0_LOAD; 1596 weight = NICE_0_LOAD;
1597 1597
1598 sum_weight += weight; 1598 sum_weight += weight;
1599 shares += tg->cfs_rq[i]->shares; 1599 shares += tg->cfs_rq[i]->shares;
1600 } 1600 }
1601 1601
1602 if (!rq_weight) 1602 if (!rq_weight)
1603 rq_weight = sum_weight; 1603 rq_weight = sum_weight;
1604 1604
1605 if ((!shares && rq_weight) || shares > tg->shares) 1605 if ((!shares && rq_weight) || shares > tg->shares)
1606 shares = tg->shares; 1606 shares = tg->shares;
1607 1607
1608 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1608 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1609 shares = tg->shares; 1609 shares = tg->shares;
1610 1610
1611 for_each_cpu(i, sched_domain_span(sd)) 1611 for_each_cpu(i, sched_domain_span(sd))
1612 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); 1612 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1613 1613
1614 local_irq_restore(flags); 1614 local_irq_restore(flags);
1615 1615
1616 return 0; 1616 return 0;
1617 } 1617 }
1618 1618
1619 /* 1619 /*
1620 * Compute the cpu's hierarchical load factor for each task group. 1620 * Compute the cpu's hierarchical load factor for each task group.
1621 * This needs to be done in a top-down fashion because the load of a child 1621 * This needs to be done in a top-down fashion because the load of a child
1622 * group is a fraction of its parent's load. 1622 * group is a fraction of its parent's load.
1623 */ 1623 */
1624 static int tg_load_down(struct task_group *tg, void *data) 1624 static int tg_load_down(struct task_group *tg, void *data)
1625 { 1625 {
1626 unsigned long load; 1626 unsigned long load;
1627 long cpu = (long)data; 1627 long cpu = (long)data;
1628 1628
1629 if (!tg->parent) { 1629 if (!tg->parent) {
1630 load = cpu_rq(cpu)->load.weight; 1630 load = cpu_rq(cpu)->load.weight;
1631 } else { 1631 } else {
1632 load = tg->parent->cfs_rq[cpu]->h_load; 1632 load = tg->parent->cfs_rq[cpu]->h_load;
1633 load *= tg->cfs_rq[cpu]->shares; 1633 load *= tg->cfs_rq[cpu]->shares;
1634 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1634 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1635 } 1635 }
1636 1636
1637 tg->cfs_rq[cpu]->h_load = load; 1637 tg->cfs_rq[cpu]->h_load = load;
1638 1638
1639 return 0; 1639 return 0;
1640 } 1640 }
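h_load propagates top-down: a child group's hierarchical load is its parent's h_load scaled by the child's share of the parent's runqueue weight, with the "+ 1" only guarding against a divide by zero. A small numeric walk-through with invented weights and shares:

/* Numeric walk-through of the top-down h_load computation above. */
#include <stdio.h>

int main(void)
{
        unsigned long root_load = 4096;   /* cpu_rq(cpu)->load.weight */

        /* Child group: owns 1024 of the parent's 4096 units of weight. */
        unsigned long child_shares = 1024;
        unsigned long parent_cfs_weight = 4096;
        unsigned long child_h_load =
                root_load * child_shares / (parent_cfs_weight + 1);

        /* Grandchild group: owns half of the child's runqueue weight. */
        unsigned long gc_shares = 512;
        unsigned long child_cfs_weight = 1024;
        unsigned long gc_h_load =
                child_h_load * gc_shares / (child_cfs_weight + 1);

        printf("child h_load = %lu, grandchild h_load = %lu\n",
               child_h_load, gc_h_load);
        return 0;
}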
1641 1641
1642 static void update_shares(struct sched_domain *sd) 1642 static void update_shares(struct sched_domain *sd)
1643 { 1643 {
1644 s64 elapsed; 1644 s64 elapsed;
1645 u64 now; 1645 u64 now;
1646 1646
1647 if (root_task_group_empty()) 1647 if (root_task_group_empty())
1648 return; 1648 return;
1649 1649
1650 now = cpu_clock(raw_smp_processor_id()); 1650 now = cpu_clock(raw_smp_processor_id());
1651 elapsed = now - sd->last_update; 1651 elapsed = now - sd->last_update;
1652 1652
1653 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1653 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1654 sd->last_update = now; 1654 sd->last_update = now;
1655 walk_tg_tree(tg_nop, tg_shares_up, sd); 1655 walk_tg_tree(tg_nop, tg_shares_up, sd);
1656 } 1656 }
1657 } 1657 }
1658 1658
1659 static void update_h_load(long cpu) 1659 static void update_h_load(long cpu)
1660 { 1660 {
1661 if (root_task_group_empty()) 1661 if (root_task_group_empty())
1662 return; 1662 return;
1663 1663
1664 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1664 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1665 } 1665 }
1666 1666
1667 #else 1667 #else
1668 1668
1669 static inline void update_shares(struct sched_domain *sd) 1669 static inline void update_shares(struct sched_domain *sd)
1670 { 1670 {
1671 } 1671 }
1672 1672
1673 #endif 1673 #endif
1674 1674
1675 #ifdef CONFIG_PREEMPT 1675 #ifdef CONFIG_PREEMPT
1676 1676
1677 static void double_rq_lock(struct rq *rq1, struct rq *rq2); 1677 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1678 1678
1679 /* 1679 /*
1680 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1680 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1681 * way at the expense of forcing extra atomic operations in all 1681 * way at the expense of forcing extra atomic operations in all
1682 * invocations. This assures that the double_lock is acquired using the 1682 * invocations. This assures that the double_lock is acquired using the
1683 * same underlying policy as the spinlock_t on this architecture, which 1683 * same underlying policy as the spinlock_t on this architecture, which
1684 * reduces latency compared to the unfair variant below. However, it 1684 * reduces latency compared to the unfair variant below. However, it
1685 * also adds more overhead and therefore may reduce throughput. 1685 * also adds more overhead and therefore may reduce throughput.
1686 */ 1686 */
1687 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1687 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1688 __releases(this_rq->lock) 1688 __releases(this_rq->lock)
1689 __acquires(busiest->lock) 1689 __acquires(busiest->lock)
1690 __acquires(this_rq->lock) 1690 __acquires(this_rq->lock)
1691 { 1691 {
1692 raw_spin_unlock(&this_rq->lock); 1692 raw_spin_unlock(&this_rq->lock);
1693 double_rq_lock(this_rq, busiest); 1693 double_rq_lock(this_rq, busiest);
1694 1694
1695 return 1; 1695 return 1;
1696 } 1696 }
1697 1697
1698 #else 1698 #else
1699 /* 1699 /*
1700 * Unfair double_lock_balance: Optimizes throughput at the expense of 1700 * Unfair double_lock_balance: Optimizes throughput at the expense of
1701 * latency by eliminating extra atomic operations when the locks are 1701 * latency by eliminating extra atomic operations when the locks are
1702 * already in proper order on entry. This favors lower cpu-ids and will 1702 * already in proper order on entry. This favors lower cpu-ids and will
1703 * grant the double lock to lower cpus over higher ids under contention, 1703 * grant the double lock to lower cpus over higher ids under contention,
1704 * regardless of entry order into the function. 1704 * regardless of entry order into the function.
1705 */ 1705 */
1706 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1706 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1707 __releases(this_rq->lock) 1707 __releases(this_rq->lock)
1708 __acquires(busiest->lock) 1708 __acquires(busiest->lock)
1709 __acquires(this_rq->lock) 1709 __acquires(this_rq->lock)
1710 { 1710 {
1711 int ret = 0; 1711 int ret = 0;
1712 1712
1713 if (unlikely(!raw_spin_trylock(&busiest->lock))) { 1713 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1714 if (busiest < this_rq) { 1714 if (busiest < this_rq) {
1715 raw_spin_unlock(&this_rq->lock); 1715 raw_spin_unlock(&this_rq->lock);
1716 raw_spin_lock(&busiest->lock); 1716 raw_spin_lock(&busiest->lock);
1717 raw_spin_lock_nested(&this_rq->lock, 1717 raw_spin_lock_nested(&this_rq->lock,
1718 SINGLE_DEPTH_NESTING); 1718 SINGLE_DEPTH_NESTING);
1719 ret = 1; 1719 ret = 1;
1720 } else 1720 } else
1721 raw_spin_lock_nested(&busiest->lock, 1721 raw_spin_lock_nested(&busiest->lock,
1722 SINGLE_DEPTH_NESTING); 1722 SINGLE_DEPTH_NESTING);
1723 } 1723 }
1724 return ret; 1724 return ret;
1725 } 1725 }
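Both variants above ultimately rely on the same rule: when two runqueue locks must be held, the one at the lower address is taken first, so two CPUs contending for the same pair can never wait on each other. A user-space sketch of that ordering rule with pthread mutexes standing in for rq->lock (illustrative only):

/* User-space sketch of the "lower address first" lock ordering used by
 * _double_lock_balance() and double_rq_lock() below. */
#include <pthread.h>
#include <stdio.h>

static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {                   /* same queue: one lock is enough */
                pthread_mutex_lock(a);
                return;
        }
        if (a < b) {                    /* total order by address */
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        if (a != b)
                pthread_mutex_unlock(b);
}

int main(void)
{
        pthread_mutex_t rq1 = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t rq2 = PTHREAD_MUTEX_INITIALIZER;

        /* Whichever order callers name the locks in, the acquisition order
         * is the same, so no two callers can deadlock on this pair. */
        lock_pair(&rq1, &rq2);
        puts("both locks held");
        unlock_pair(&rq1, &rq2);
        return 0;
}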
1726 1726
1727 #endif /* CONFIG_PREEMPT */ 1727 #endif /* CONFIG_PREEMPT */
1728 1728
1729 /* 1729 /*
1730 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1730 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1731 */ 1731 */
1732 static int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1732 static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1733 { 1733 {
1734 if (unlikely(!irqs_disabled())) { 1734 if (unlikely(!irqs_disabled())) {
1735 /* printk() doesn't work well under rq->lock */ 1735 /* printk() doesn't work well under rq->lock */
1736 raw_spin_unlock(&this_rq->lock); 1736 raw_spin_unlock(&this_rq->lock);
1737 BUG_ON(1); 1737 BUG_ON(1);
1738 } 1738 }
1739 1739
1740 return _double_lock_balance(this_rq, busiest); 1740 return _double_lock_balance(this_rq, busiest);
1741 } 1741 }
1742 1742
1743 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1743 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1744 __releases(busiest->lock) 1744 __releases(busiest->lock)
1745 { 1745 {
1746 raw_spin_unlock(&busiest->lock); 1746 raw_spin_unlock(&busiest->lock);
1747 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1747 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1748 } 1748 }
1749 1749
1750 /* 1750 /*
1751 * double_rq_lock - safely lock two runqueues 1751 * double_rq_lock - safely lock two runqueues
1752 * 1752 *
1753 * Note this does not disable interrupts like task_rq_lock, 1753 * Note this does not disable interrupts like task_rq_lock,
1754 * you need to do so manually before calling. 1754 * you need to do so manually before calling.
1755 */ 1755 */
1756 static void double_rq_lock(struct rq *rq1, struct rq *rq2) 1756 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1757 __acquires(rq1->lock) 1757 __acquires(rq1->lock)
1758 __acquires(rq2->lock) 1758 __acquires(rq2->lock)
1759 { 1759 {
1760 BUG_ON(!irqs_disabled()); 1760 BUG_ON(!irqs_disabled());
1761 if (rq1 == rq2) { 1761 if (rq1 == rq2) {
1762 raw_spin_lock(&rq1->lock); 1762 raw_spin_lock(&rq1->lock);
1763 __acquire(rq2->lock); /* Fake it out ;) */ 1763 __acquire(rq2->lock); /* Fake it out ;) */
1764 } else { 1764 } else {
1765 if (rq1 < rq2) { 1765 if (rq1 < rq2) {
1766 raw_spin_lock(&rq1->lock); 1766 raw_spin_lock(&rq1->lock);
1767 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); 1767 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1768 } else { 1768 } else {
1769 raw_spin_lock(&rq2->lock); 1769 raw_spin_lock(&rq2->lock);
1770 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 1770 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1771 } 1771 }
1772 } 1772 }
1773 update_rq_clock(rq1); 1773 update_rq_clock(rq1);
1774 update_rq_clock(rq2); 1774 update_rq_clock(rq2);
1775 } 1775 }
1776 1776
1777 /* 1777 /*
1778 * double_rq_unlock - safely unlock two runqueues 1778 * double_rq_unlock - safely unlock two runqueues
1779 * 1779 *
1780 * Note this does not restore interrupts like task_rq_unlock, 1780 * Note this does not restore interrupts like task_rq_unlock,
1781 * you need to do so manually after calling. 1781 * you need to do so manually after calling.
1782 */ 1782 */
1783 static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 1783 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1784 __releases(rq1->lock) 1784 __releases(rq1->lock)
1785 __releases(rq2->lock) 1785 __releases(rq2->lock)
1786 { 1786 {
1787 raw_spin_unlock(&rq1->lock); 1787 raw_spin_unlock(&rq1->lock);
1788 if (rq1 != rq2) 1788 if (rq1 != rq2)
1789 raw_spin_unlock(&rq2->lock); 1789 raw_spin_unlock(&rq2->lock);
1790 else 1790 else
1791 __release(rq2->lock); 1791 __release(rq2->lock);
1792 } 1792 }
1793 1793
1794 #endif 1794 #endif
1795 1795
1796 #ifdef CONFIG_FAIR_GROUP_SCHED 1796 #ifdef CONFIG_FAIR_GROUP_SCHED
1797 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1797 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1798 { 1798 {
1799 #ifdef CONFIG_SMP 1799 #ifdef CONFIG_SMP
1800 cfs_rq->shares = shares; 1800 cfs_rq->shares = shares;
1801 #endif 1801 #endif
1802 } 1802 }
1803 #endif 1803 #endif
1804 1804
1805 static void calc_load_account_active(struct rq *this_rq); 1805 static void calc_load_account_active(struct rq *this_rq);
1806 static void update_sysctl(void); 1806 static void update_sysctl(void);
1807 static int get_update_sysctl_factor(void); 1807 static int get_update_sysctl_factor(void);
1808 1808
1809 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1809 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1810 { 1810 {
1811 set_task_rq(p, cpu); 1811 set_task_rq(p, cpu);
1812 #ifdef CONFIG_SMP 1812 #ifdef CONFIG_SMP
1813 /* 1813 /*
1814 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 1814 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1815 * successfully executed on another CPU. We must ensure that updates of 1815 * successfully executed on another CPU. We must ensure that updates of
1816 * per-task data have been completed by this moment. 1816 * per-task data have been completed by this moment.
1817 */ 1817 */
1818 smp_wmb(); 1818 smp_wmb();
1819 task_thread_info(p)->cpu = cpu; 1819 task_thread_info(p)->cpu = cpu;
1820 #endif 1820 #endif
1821 } 1821 }
1822 1822
1823 static const struct sched_class rt_sched_class; 1823 static const struct sched_class rt_sched_class;
1824 1824
1825 #define sched_class_highest (&rt_sched_class) 1825 #define sched_class_highest (&rt_sched_class)
1826 #define for_each_class(class) \ 1826 #define for_each_class(class) \
1827 for (class = sched_class_highest; class; class = class->next) 1827 for (class = sched_class_highest; class; class = class->next)
1828 1828
1829 #include "sched_stats.h" 1829 #include "sched_stats.h"
1830 1830
1831 static void inc_nr_running(struct rq *rq) 1831 static void inc_nr_running(struct rq *rq)
1832 { 1832 {
1833 rq->nr_running++; 1833 rq->nr_running++;
1834 } 1834 }
1835 1835
1836 static void dec_nr_running(struct rq *rq) 1836 static void dec_nr_running(struct rq *rq)
1837 { 1837 {
1838 rq->nr_running--; 1838 rq->nr_running--;
1839 } 1839 }
1840 1840
1841 static void set_load_weight(struct task_struct *p) 1841 static void set_load_weight(struct task_struct *p)
1842 { 1842 {
1843 if (task_has_rt_policy(p)) { 1843 if (task_has_rt_policy(p)) {
1844 p->se.load.weight = prio_to_weight[0] * 2; 1844 p->se.load.weight = prio_to_weight[0] * 2;
1845 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 1845 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1846 return; 1846 return;
1847 } 1847 }
1848 1848
1849 /* 1849 /*
1850 * SCHED_IDLE tasks get minimal weight: 1850 * SCHED_IDLE tasks get minimal weight:
1851 */ 1851 */
1852 if (p->policy == SCHED_IDLE) { 1852 if (p->policy == SCHED_IDLE) {
1853 p->se.load.weight = WEIGHT_IDLEPRIO; 1853 p->se.load.weight = WEIGHT_IDLEPRIO;
1854 p->se.load.inv_weight = WMULT_IDLEPRIO; 1854 p->se.load.inv_weight = WMULT_IDLEPRIO;
1855 return; 1855 return;
1856 } 1856 }
1857 1857
1858 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 1858 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1859 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1859 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1860 } 1860 }
1861 1861
1862 static void update_avg(u64 *avg, u64 sample) 1862 static void update_avg(u64 *avg, u64 sample)
1863 { 1863 {
1864 s64 diff = sample - *avg; 1864 s64 diff = sample - *avg;
1865 *avg += diff >> 3; 1865 *avg += diff >> 3;
1866 } 1866 }
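update_avg() is an exponential moving average with a 1/8 weight: each sample pulls the average one eighth of the way toward itself, and the division is just a shift. A tiny demonstration with an invented sample stream:

/* Tiny demonstration of the 1/8-weight moving average used by update_avg(). */
#include <stdint.h>
#include <stdio.h>

static void update_avg(uint64_t *avg, uint64_t sample)
{
        int64_t diff = sample - *avg;

        *avg += diff >> 3;      /* move 1/8 of the way toward the sample */
}

int main(void)
{
        uint64_t avg = 0;
        uint64_t samples[] = { 800, 800, 800, 800, 100, 100, 100, 100 };

        for (unsigned i = 0; i < 8; i++) {
                update_avg(&avg, samples[i]);
                printf("sample %llu -> avg %llu\n",
                       (unsigned long long)samples[i],
                       (unsigned long long)avg);
        }
        return 0;
}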
1867 1867
1868 static void 1868 static void
1869 enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) 1869 enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1870 { 1870 {
1871 if (wakeup) 1871 if (wakeup)
1872 p->se.start_runtime = p->se.sum_exec_runtime; 1872 p->se.start_runtime = p->se.sum_exec_runtime;
1873 1873
1874 sched_info_queued(p); 1874 sched_info_queued(p);
1875 p->sched_class->enqueue_task(rq, p, wakeup, head); 1875 p->sched_class->enqueue_task(rq, p, wakeup, head);
1876 p->se.on_rq = 1; 1876 p->se.on_rq = 1;
1877 } 1877 }
1878 1878
1879 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1879 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1880 { 1880 {
1881 if (sleep) { 1881 if (sleep) {
1882 if (p->se.last_wakeup) { 1882 if (p->se.last_wakeup) {
1883 update_avg(&p->se.avg_overlap, 1883 update_avg(&p->se.avg_overlap,
1884 p->se.sum_exec_runtime - p->se.last_wakeup); 1884 p->se.sum_exec_runtime - p->se.last_wakeup);
1885 p->se.last_wakeup = 0; 1885 p->se.last_wakeup = 0;
1886 } else { 1886 } else {
1887 update_avg(&p->se.avg_wakeup, 1887 update_avg(&p->se.avg_wakeup,
1888 sysctl_sched_wakeup_granularity); 1888 sysctl_sched_wakeup_granularity);
1889 } 1889 }
1890 } 1890 }
1891 1891
1892 sched_info_dequeued(p); 1892 sched_info_dequeued(p);
1893 p->sched_class->dequeue_task(rq, p, sleep); 1893 p->sched_class->dequeue_task(rq, p, sleep);
1894 p->se.on_rq = 0; 1894 p->se.on_rq = 0;
1895 } 1895 }
1896 1896
1897 /* 1897 /*
1898 * activate_task - move a task to the runqueue. 1898 * activate_task - move a task to the runqueue.
1899 */ 1899 */
1900 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 1900 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1901 { 1901 {
1902 if (task_contributes_to_load(p)) 1902 if (task_contributes_to_load(p))
1903 rq->nr_uninterruptible--; 1903 rq->nr_uninterruptible--;
1904 1904
1905 enqueue_task(rq, p, wakeup, false); 1905 enqueue_task(rq, p, wakeup, false);
1906 inc_nr_running(rq); 1906 inc_nr_running(rq);
1907 } 1907 }
1908 1908
1909 /* 1909 /*
1910 * deactivate_task - remove a task from the runqueue. 1910 * deactivate_task - remove a task from the runqueue.
1911 */ 1911 */
1912 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 1912 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1913 { 1913 {
1914 if (task_contributes_to_load(p)) 1914 if (task_contributes_to_load(p))
1915 rq->nr_uninterruptible++; 1915 rq->nr_uninterruptible++;
1916 1916
1917 dequeue_task(rq, p, sleep); 1917 dequeue_task(rq, p, sleep);
1918 dec_nr_running(rq); 1918 dec_nr_running(rq);
1919 } 1919 }
1920 1920
1921 #include "sched_idletask.c" 1921 #include "sched_idletask.c"
1922 #include "sched_fair.c" 1922 #include "sched_fair.c"
1923 #include "sched_rt.c" 1923 #include "sched_rt.c"
1924 #ifdef CONFIG_SCHED_DEBUG 1924 #ifdef CONFIG_SCHED_DEBUG
1925 # include "sched_debug.c" 1925 # include "sched_debug.c"
1926 #endif 1926 #endif
1927 1927
1928 /* 1928 /*
1929 * __normal_prio - return the priority that is based on the static prio 1929 * __normal_prio - return the priority that is based on the static prio
1930 */ 1930 */
1931 static inline int __normal_prio(struct task_struct *p) 1931 static inline int __normal_prio(struct task_struct *p)
1932 { 1932 {
1933 return p->static_prio; 1933 return p->static_prio;
1934 } 1934 }
1935 1935
1936 /* 1936 /*
1937 * Calculate the expected normal priority: i.e. priority 1937 * Calculate the expected normal priority: i.e. priority
1938 * without taking RT-inheritance into account. Might be 1938 * without taking RT-inheritance into account. Might be
1939 * boosted by interactivity modifiers. Changes upon fork, 1939 * boosted by interactivity modifiers. Changes upon fork,
1940 * setprio syscalls, and whenever the interactivity 1940 * setprio syscalls, and whenever the interactivity
1941 * estimator recalculates. 1941 * estimator recalculates.
1942 */ 1942 */
1943 static inline int normal_prio(struct task_struct *p) 1943 static inline int normal_prio(struct task_struct *p)
1944 { 1944 {
1945 int prio; 1945 int prio;
1946 1946
1947 if (task_has_rt_policy(p)) 1947 if (task_has_rt_policy(p))
1948 prio = MAX_RT_PRIO-1 - p->rt_priority; 1948 prio = MAX_RT_PRIO-1 - p->rt_priority;
1949 else 1949 else
1950 prio = __normal_prio(p); 1950 prio = __normal_prio(p);
1951 return prio; 1951 return prio;
1952 } 1952 }
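normal_prio() folds the two priority scales into one: realtime tasks map rt_priority 1..99 onto prio 98..0 (lower prio means more important), while normal tasks keep their static_prio. A quick illustration, assuming the usual MAX_RT_PRIO value of 100 and a nice-0 static_prio of 120:

/* Illustration of the priority folding done by normal_prio(); MAX_RT_PRIO
 * and the nice-0 static_prio are assumed values, not taken from this file. */
#include <stdio.h>

#define MAX_RT_PRIO 100

int main(void)
{
        int rt_priority;

        for (rt_priority = 1; rt_priority <= 99; rt_priority += 49)
                printf("SCHED_FIFO rt_priority %2d -> prio %2d\n",
                       rt_priority, MAX_RT_PRIO - 1 - rt_priority);

        /* A nice-0 SCHED_NORMAL task sits at static_prio 120. */
        printf("SCHED_NORMAL nice 0      -> prio %d\n", 120);
        return 0;
}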
1953 1953
1954 /* 1954 /*
1955 * Calculate the current priority, i.e. the priority 1955 * Calculate the current priority, i.e. the priority
1956 * taken into account by the scheduler. This value might 1956 * taken into account by the scheduler. This value might
1957 * be boosted by RT tasks, or might be boosted by 1957 * be boosted by RT tasks, or might be boosted by
1958 * interactivity modifiers. Will be RT if the task got 1958 * interactivity modifiers. Will be RT if the task got
1959 * RT-boosted. If not then it returns p->normal_prio. 1959 * RT-boosted. If not then it returns p->normal_prio.
1960 */ 1960 */
1961 static int effective_prio(struct task_struct *p) 1961 static int effective_prio(struct task_struct *p)
1962 { 1962 {
1963 p->normal_prio = normal_prio(p); 1963 p->normal_prio = normal_prio(p);
1964 /* 1964 /*
1965 * If we are RT tasks or we were boosted to RT priority, 1965 * If we are RT tasks or we were boosted to RT priority,
1966 * keep the priority unchanged. Otherwise, update priority 1966 * keep the priority unchanged. Otherwise, update priority
1967 * to the normal priority: 1967 * to the normal priority:
1968 */ 1968 */
1969 if (!rt_prio(p->prio)) 1969 if (!rt_prio(p->prio))
1970 return p->normal_prio; 1970 return p->normal_prio;
1971 return p->prio; 1971 return p->prio;
1972 } 1972 }
1973 1973
1974 /** 1974 /**
1975 * task_curr - is this task currently executing on a CPU? 1975 * task_curr - is this task currently executing on a CPU?
1976 * @p: the task in question. 1976 * @p: the task in question.
1977 */ 1977 */
1978 inline int task_curr(const struct task_struct *p) 1978 inline int task_curr(const struct task_struct *p)
1979 { 1979 {
1980 return cpu_curr(task_cpu(p)) == p; 1980 return cpu_curr(task_cpu(p)) == p;
1981 } 1981 }
1982 1982
1983 static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1983 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1984 const struct sched_class *prev_class, 1984 const struct sched_class *prev_class,
1985 int oldprio, int running) 1985 int oldprio, int running)
1986 { 1986 {
1987 if (prev_class != p->sched_class) { 1987 if (prev_class != p->sched_class) {
1988 if (prev_class->switched_from) 1988 if (prev_class->switched_from)
1989 prev_class->switched_from(rq, p, running); 1989 prev_class->switched_from(rq, p, running);
1990 p->sched_class->switched_to(rq, p, running); 1990 p->sched_class->switched_to(rq, p, running);
1991 } else 1991 } else
1992 p->sched_class->prio_changed(rq, p, oldprio, running); 1992 p->sched_class->prio_changed(rq, p, oldprio, running);
1993 } 1993 }
1994 1994
1995 #ifdef CONFIG_SMP 1995 #ifdef CONFIG_SMP
1996 /* 1996 /*
1997 * Is this task likely cache-hot: 1997 * Is this task likely cache-hot:
1998 */ 1998 */
1999 static int 1999 static int
2000 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 2000 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2001 { 2001 {
2002 s64 delta; 2002 s64 delta;
2003 2003
2004 if (p->sched_class != &fair_sched_class) 2004 if (p->sched_class != &fair_sched_class)
2005 return 0; 2005 return 0;
2006 2006
2007 /* 2007 /*
2008 * Buddy candidates are cache hot: 2008 * Buddy candidates are cache hot:
2009 */ 2009 */
2010 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && 2010 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2011 (&p->se == cfs_rq_of(&p->se)->next || 2011 (&p->se == cfs_rq_of(&p->se)->next ||
2012 &p->se == cfs_rq_of(&p->se)->last)) 2012 &p->se == cfs_rq_of(&p->se)->last))
2013 return 1; 2013 return 1;
2014 2014
2015 if (sysctl_sched_migration_cost == -1) 2015 if (sysctl_sched_migration_cost == -1)
2016 return 1; 2016 return 1;
2017 if (sysctl_sched_migration_cost == 0) 2017 if (sysctl_sched_migration_cost == 0)
2018 return 0; 2018 return 0;
2019 2019
2020 delta = now - p->se.exec_start; 2020 delta = now - p->se.exec_start;
2021 2021
2022 return delta < (s64)sysctl_sched_migration_cost; 2022 return delta < (s64)sysctl_sched_migration_cost;
2023 } 2023 }
2024 2024
2025 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2025 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2026 { 2026 {
2027 #ifdef CONFIG_SCHED_DEBUG 2027 #ifdef CONFIG_SCHED_DEBUG
2028 /* 2028 /*
2029 * We should never call set_task_cpu() on a blocked task, 2029 * We should never call set_task_cpu() on a blocked task,
2030 * ttwu() will sort out the placement. 2030 * ttwu() will sort out the placement.
2031 */ 2031 */
2032 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2032 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2033 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2033 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2034 #endif 2034 #endif
2035 2035
2036 trace_sched_migrate_task(p, new_cpu); 2036 trace_sched_migrate_task(p, new_cpu);
2037 2037
2038 if (task_cpu(p) != new_cpu) { 2038 if (task_cpu(p) != new_cpu) {
2039 p->se.nr_migrations++; 2039 p->se.nr_migrations++;
2040 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); 2040 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2041 } 2041 }
2042 2042
2043 __set_task_cpu(p, new_cpu); 2043 __set_task_cpu(p, new_cpu);
2044 } 2044 }
2045 2045
2046 struct migration_req { 2046 struct migration_req {
2047 struct list_head list; 2047 struct list_head list;
2048 2048
2049 struct task_struct *task; 2049 struct task_struct *task;
2050 int dest_cpu; 2050 int dest_cpu;
2051 2051
2052 struct completion done; 2052 struct completion done;
2053 }; 2053 };
2054 2054
2055 /* 2055 /*
2056 * The task's runqueue lock must be held. 2056 * The task's runqueue lock must be held.
2057 * Returns true if you have to wait for the migration thread. 2057 * Returns true if you have to wait for the migration thread.
2058 */ 2058 */
2059 static int 2059 static int
2060 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) 2060 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2061 { 2061 {
2062 struct rq *rq = task_rq(p); 2062 struct rq *rq = task_rq(p);
2063 2063
2064 /* 2064 /*
2065 * If the task is not on a runqueue (and not running), then 2065 * If the task is not on a runqueue (and not running), then
2066 * the next wake-up will properly place the task. 2066 * the next wake-up will properly place the task.
2067 */ 2067 */
2068 if (!p->se.on_rq && !task_running(rq, p)) 2068 if (!p->se.on_rq && !task_running(rq, p))
2069 return 0; 2069 return 0;
2070 2070
2071 init_completion(&req->done); 2071 init_completion(&req->done);
2072 req->task = p; 2072 req->task = p;
2073 req->dest_cpu = dest_cpu; 2073 req->dest_cpu = dest_cpu;
2074 list_add(&req->list, &rq->migration_queue); 2074 list_add(&req->list, &rq->migration_queue);
2075 2075
2076 return 1; 2076 return 1;
2077 } 2077 }
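/*
 * Editor's sketch (not part of this file): the comment above says a
 * non-zero return from migrate_task() means the caller must hand the
 * request to the per-CPU migration thread and wait for it.  A minimal,
 * assumed caller shape, loosely modeled on set_cpus_allowed_ptr() --
 * the exact ordering of the get/put of the thread reference and of
 * wake_up_process()/wait_for_completion() is illustrative, not verbatim:
 */
	struct migration_req req;
	unsigned long flags;
	struct rq *rq = task_rq_lock(p, &flags);

	if (migrate_task(p, dest_cpu, &req)) {
		/* cache the thread pointer before dropping rq->lock */
		struct task_struct *mt = rq->migration_thread;

		get_task_struct(mt);
		task_rq_unlock(rq, &flags);
		wake_up_process(mt);
		put_task_struct(mt);
		wait_for_completion(&req.done);
	} else {
		task_rq_unlock(rq, &flags);
	}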
2078 2078
2079 /* 2079 /*
2080 * wait_task_context_switch - wait for a thread to complete at least one 2080 * wait_task_context_switch - wait for a thread to complete at least one
2081 * context switch. 2081 * context switch.
2082 * 2082 *
2083 * @p must not be current. 2083 * @p must not be current.
2084 */ 2084 */
2085 void wait_task_context_switch(struct task_struct *p) 2085 void wait_task_context_switch(struct task_struct *p)
2086 { 2086 {
2087 unsigned long nvcsw, nivcsw, flags; 2087 unsigned long nvcsw, nivcsw, flags;
2088 int running; 2088 int running;
2089 struct rq *rq; 2089 struct rq *rq;
2090 2090
2091 nvcsw = p->nvcsw; 2091 nvcsw = p->nvcsw;
2092 nivcsw = p->nivcsw; 2092 nivcsw = p->nivcsw;
2093 for (;;) { 2093 for (;;) {
2094 /* 2094 /*
2095 * The runqueue is assigned before the actual context 2095 * The runqueue is assigned before the actual context
2096 * switch. We need to take the runqueue lock. 2096 * switch. We need to take the runqueue lock.
2097 * 2097 *
2098 * We could check initially without the lock but it is 2098 * We could check initially without the lock but it is
2099 * very likely that we need to take the lock in every 2099 * very likely that we need to take the lock in every
2100 * iteration. 2100 * iteration.
2101 */ 2101 */
2102 rq = task_rq_lock(p, &flags); 2102 rq = task_rq_lock(p, &flags);
2103 running = task_running(rq, p); 2103 running = task_running(rq, p);
2104 task_rq_unlock(rq, &flags); 2104 task_rq_unlock(rq, &flags);
2105 2105
2106 if (likely(!running)) 2106 if (likely(!running))
2107 break; 2107 break;
2108 /* 2108 /*
2109 * The switch count is incremented before the actual 2109 * The switch count is incremented before the actual
2110 * context switch. We thus wait for two switches to be 2110 * context switch. We thus wait for two switches to be
2111 * sure at least one completed. 2111 * sure at least one completed.
2112 */ 2112 */
2113 if ((p->nvcsw - nvcsw) > 1) 2113 if ((p->nvcsw - nvcsw) > 1)
2114 break; 2114 break;
2115 if ((p->nivcsw - nivcsw) > 1) 2115 if ((p->nivcsw - nivcsw) > 1)
2116 break; 2116 break;
2117 2117
2118 cpu_relax(); 2118 cpu_relax();
2119 } 2119 }
2120 } 2120 }
2121 2121
2122 /* 2122 /*
2123 * wait_task_inactive - wait for a thread to unschedule. 2123 * wait_task_inactive - wait for a thread to unschedule.
2124 * 2124 *
2125 * If @match_state is nonzero, it's the @p->state value just checked and 2125 * If @match_state is nonzero, it's the @p->state value just checked and
2126 * not expected to change. If it changes, i.e. @p might have woken up, 2126 * not expected to change. If it changes, i.e. @p might have woken up,
2127 * then return zero. When we succeed in waiting for @p to be off its CPU, 2127 * then return zero. When we succeed in waiting for @p to be off its CPU,
2128 * we return a positive number (its total switch count). If a second call 2128 * we return a positive number (its total switch count). If a second call
2129 * a short while later returns the same number, the caller can be sure that 2129 * a short while later returns the same number, the caller can be sure that
2130 * @p has remained unscheduled the whole time. 2130 * @p has remained unscheduled the whole time.
2131 * 2131 *
2132 * The caller must ensure that the task *will* unschedule sometime soon, 2132 * The caller must ensure that the task *will* unschedule sometime soon,
2133 * else this function might spin for a *long* time. This function can't 2133 * else this function might spin for a *long* time. This function can't
2134 * be called with interrupts off, or it may introduce deadlock with 2134 * be called with interrupts off, or it may introduce deadlock with
2135 * smp_call_function() if an IPI is sent by the same process we are 2135 * smp_call_function() if an IPI is sent by the same process we are
2136 * waiting to become inactive. 2136 * waiting to become inactive.
2137 */ 2137 */
2138 unsigned long wait_task_inactive(struct task_struct *p, long match_state) 2138 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2139 { 2139 {
2140 unsigned long flags; 2140 unsigned long flags;
2141 int running, on_rq; 2141 int running, on_rq;
2142 unsigned long ncsw; 2142 unsigned long ncsw;
2143 struct rq *rq; 2143 struct rq *rq;
2144 2144
2145 for (;;) { 2145 for (;;) {
2146 /* 2146 /*
2147 * We do the initial early heuristics without holding 2147 * We do the initial early heuristics without holding
2148 * any task-queue locks at all. We'll only try to get 2148 * any task-queue locks at all. We'll only try to get
2149 * the runqueue lock when things look like they will 2149 * the runqueue lock when things look like they will
2150 * work out! 2150 * work out!
2151 */ 2151 */
2152 rq = task_rq(p); 2152 rq = task_rq(p);
2153 2153
2154 /* 2154 /*
2155 * If the task is actively running on another CPU 2155 * If the task is actively running on another CPU
2156 * still, just relax and busy-wait without holding 2156 * still, just relax and busy-wait without holding
2157 * any locks. 2157 * any locks.
2158 * 2158 *
2159 * NOTE! Since we don't hold any locks, it's not 2159 * NOTE! Since we don't hold any locks, it's not
2160 * even certain that "rq" remains the right runqueue! 2160 * even certain that "rq" remains the right runqueue!
2161 * But we don't care, since "task_running()" will 2161 * But we don't care, since "task_running()" will
2162 * return false if the runqueue has changed and p 2162 * return false if the runqueue has changed and p
2163 * is actually now running somewhere else! 2163 * is actually now running somewhere else!
2164 */ 2164 */
2165 while (task_running(rq, p)) { 2165 while (task_running(rq, p)) {
2166 if (match_state && unlikely(p->state != match_state)) 2166 if (match_state && unlikely(p->state != match_state))
2167 return 0; 2167 return 0;
2168 cpu_relax(); 2168 cpu_relax();
2169 } 2169 }
2170 2170
2171 /* 2171 /*
2172 * Ok, time to look more closely! We need the rq 2172 * Ok, time to look more closely! We need the rq
2173 * lock now, to be *sure*. If we're wrong, we'll 2173 * lock now, to be *sure*. If we're wrong, we'll
2174 * just go back and repeat. 2174 * just go back and repeat.
2175 */ 2175 */
2176 rq = task_rq_lock(p, &flags); 2176 rq = task_rq_lock(p, &flags);
2177 trace_sched_wait_task(rq, p); 2177 trace_sched_wait_task(rq, p);
2178 running = task_running(rq, p); 2178 running = task_running(rq, p);
2179 on_rq = p->se.on_rq; 2179 on_rq = p->se.on_rq;
2180 ncsw = 0; 2180 ncsw = 0;
2181 if (!match_state || p->state == match_state) 2181 if (!match_state || p->state == match_state)
2182 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2182 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2183 task_rq_unlock(rq, &flags); 2183 task_rq_unlock(rq, &flags);
2184 2184
2185 /* 2185 /*
2186 * If it changed from the expected state, bail out now. 2186 * If it changed from the expected state, bail out now.
2187 */ 2187 */
2188 if (unlikely(!ncsw)) 2188 if (unlikely(!ncsw))
2189 break; 2189 break;
2190 2190
2191 /* 2191 /*
2192 * Was it really running after all now that we 2192 * Was it really running after all now that we
2193 * checked with the proper locks actually held? 2193 * checked with the proper locks actually held?
2194 * 2194 *
2195 * Oops. Go back and try again. 2195 * Oops. Go back and try again.
2196 */ 2196 */
2197 if (unlikely(running)) { 2197 if (unlikely(running)) {
2198 cpu_relax(); 2198 cpu_relax();
2199 continue; 2199 continue;
2200 } 2200 }
2201 2201
2202 /* 2202 /*
2203 * It's not enough that it's not actively running, 2203 * It's not enough that it's not actively running,
2204 * it must be off the runqueue _entirely_, and not 2204 * it must be off the runqueue _entirely_, and not
2205 * preempted! 2205 * preempted!
2206 * 2206 *
2207 * So if it was still runnable (but just not actively 2207 * So if it was still runnable (but just not actively
2208 * running right now), it's preempted, and we should 2208 * running right now), it's preempted, and we should
2209 * yield - it could be a while. 2209 * yield - it could be a while.
2210 */ 2210 */
2211 if (unlikely(on_rq)) { 2211 if (unlikely(on_rq)) {
2212 schedule_timeout_uninterruptible(1); 2212 schedule_timeout_uninterruptible(1);
2213 continue; 2213 continue;
2214 } 2214 }
2215 2215
2216 /* 2216 /*
2217 * Ahh, all good. It wasn't running, and it wasn't 2217 * Ahh, all good. It wasn't running, and it wasn't
2218 * runnable, which means that it will never become 2218 * runnable, which means that it will never become
2219 * running in the future either. We're all done! 2219 * running in the future either. We're all done!
2220 */ 2220 */
2221 break; 2221 break;
2222 } 2222 }
2223 2223
2224 return ncsw; 2224 return ncsw;
2225 } 2225 }
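/*
 * Editor's sketch (not part of this file): the "call it twice and compare
 * the switch counts" idiom described in the comment above.  The
 * TASK_TRACED state and the -ESRCH returns are illustrative; ptrace-style
 * attach code is the typical real caller:
 */
	unsigned long ncsw;

	ncsw = wait_task_inactive(child, TASK_TRACED);
	if (!ncsw)
		return -ESRCH;		/* state changed, it may have woken */
	/* inspect or modify 'child' while it is off its CPU */
	if (wait_task_inactive(child, TASK_TRACED) != ncsw)
		return -ESRCH;		/* it ran in between, start over */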
2226 2226
2227 /*** 2227 /***
2228 * kick_process - kick a running thread to enter/exit the kernel 2228 * kick_process - kick a running thread to enter/exit the kernel
2229 * @p: the to-be-kicked thread 2229 * @p: the to-be-kicked thread
2230 * 2230 *
2231 * Cause a process which is running on another CPU to enter 2231 * Cause a process which is running on another CPU to enter
2232 * kernel-mode, without any delay. (to get signals handled.) 2232 * kernel-mode, without any delay. (to get signals handled.)
2233 * 2233 *
2234 * NOTE: this function doesn't have to take the runqueue lock, 2234 * NOTE: this function doesn't have to take the runqueue lock,
2235 * because all it wants to ensure is that the remote task enters 2235 * because all it wants to ensure is that the remote task enters
2236 * the kernel. If the IPI races and the task has been migrated 2236 * the kernel. If the IPI races and the task has been migrated
2237 * to another CPU then no harm is done and the purpose has been 2237 * to another CPU then no harm is done and the purpose has been
2238 * achieved as well. 2238 * achieved as well.
2239 */ 2239 */
2240 void kick_process(struct task_struct *p) 2240 void kick_process(struct task_struct *p)
2241 { 2241 {
2242 int cpu; 2242 int cpu;
2243 2243
2244 preempt_disable(); 2244 preempt_disable();
2245 cpu = task_cpu(p); 2245 cpu = task_cpu(p);
2246 if ((cpu != smp_processor_id()) && task_curr(p)) 2246 if ((cpu != smp_processor_id()) && task_curr(p))
2247 smp_send_reschedule(cpu); 2247 smp_send_reschedule(cpu);
2248 preempt_enable(); 2248 preempt_enable();
2249 } 2249 }
2250 EXPORT_SYMBOL_GPL(kick_process); 2250 EXPORT_SYMBOL_GPL(kick_process);
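/*
 * Editor's sketch (not part of this file): kick_process() is typically
 * called right after changing some per-task state that the task only
 * notices on its next entry to the kernel.  Simplified, signal-delivery
 * style usage (the flag manipulation is illustrative; real signal code
 * does more than this):
 */
	set_tsk_thread_flag(p, TIF_SIGPENDING);
	kick_process(p);	/* force p into the kernel so it re-checks
				 * TIF_SIGPENDING before returning to user space */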
2251 #endif /* CONFIG_SMP */ 2251 #endif /* CONFIG_SMP */
2252 2252
2253 /** 2253 /**
2254 * task_oncpu_function_call - call a function on the cpu on which a task runs 2254 * task_oncpu_function_call - call a function on the cpu on which a task runs
2255 * @p: the task to evaluate 2255 * @p: the task to evaluate
2256 * @func: the function to be called 2256 * @func: the function to be called
2257 * @info: the function call argument 2257 * @info: the function call argument
2258 * 2258 *
2259 * Calls the function @func when the task is currently running. This might 2259 * Calls the function @func when the task is currently running. This might
2260 * be on the current CPU, which just calls the function directly 2260 * be on the current CPU, which just calls the function directly
2261 */ 2261 */
2262 void task_oncpu_function_call(struct task_struct *p, 2262 void task_oncpu_function_call(struct task_struct *p,
2263 void (*func) (void *info), void *info) 2263 void (*func) (void *info), void *info)
2264 { 2264 {
2265 int cpu; 2265 int cpu;
2266 2266
2267 preempt_disable(); 2267 preempt_disable();
2268 cpu = task_cpu(p); 2268 cpu = task_cpu(p);
2269 if (task_curr(p)) 2269 if (task_curr(p))
2270 smp_call_function_single(cpu, func, info, 1); 2270 smp_call_function_single(cpu, func, info, 1);
2271 preempt_enable(); 2271 preempt_enable();
2272 } 2272 }
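/*
 * Editor's sketch (not part of this file): a minimal, hypothetical use of
 * task_oncpu_function_call().  The struct and helpers below are invented
 * for illustration; perf's event enable/disable paths are the real
 * in-tree callers:
 */
	struct remote_info {
		int cpu;			/* filled in on the task's CPU */
	};

	static void record_cpu(void *info)
	{
		struct remote_info *ri = info;

		ri->cpu = smp_processor_id();	/* runs in IPI context on p's CPU */
	}

	static int where_is_it_running(struct task_struct *p)
	{
		struct remote_info ri = { .cpu = -1 };

		task_oncpu_function_call(p, record_cpu, &ri);
		return ri.cpu;		/* -1 if p was not running just then */
	}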
2273 2273
2274 #ifdef CONFIG_SMP 2274 #ifdef CONFIG_SMP
2275 static int select_fallback_rq(int cpu, struct task_struct *p) 2275 static int select_fallback_rq(int cpu, struct task_struct *p)
2276 { 2276 {
2277 int dest_cpu; 2277 int dest_cpu;
2278 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 2278 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2279 2279
2280 /* Look for allowed, online CPU in same node. */ 2280 /* Look for allowed, online CPU in same node. */
2281 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 2281 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2282 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 2282 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2283 return dest_cpu; 2283 return dest_cpu;
2284 2284
2285 /* Any allowed, online CPU? */ 2285 /* Any allowed, online CPU? */
2286 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); 2286 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2287 if (dest_cpu < nr_cpu_ids) 2287 if (dest_cpu < nr_cpu_ids)
2288 return dest_cpu; 2288 return dest_cpu;
2289 2289
2290 /* No more Mr. Nice Guy. */ 2290 /* No more Mr. Nice Guy. */
2291 if (dest_cpu >= nr_cpu_ids) { 2291 if (dest_cpu >= nr_cpu_ids) {
2292 rcu_read_lock(); 2292 rcu_read_lock();
2293 cpuset_cpus_allowed_locked(p, &p->cpus_allowed); 2293 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2294 rcu_read_unlock(); 2294 rcu_read_unlock();
2295 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); 2295 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2296 2296
2297 /* 2297 /*
2298 * Don't tell them about moving exiting tasks or 2298 * Don't tell them about moving exiting tasks or
2299 * kernel threads (both mm NULL), since they never 2299 * kernel threads (both mm NULL), since they never
2300 * leave kernel. 2300 * leave kernel.
2301 */ 2301 */
2302 if (p->mm && printk_ratelimit()) { 2302 if (p->mm && printk_ratelimit()) {
2303 printk(KERN_INFO "process %d (%s) no " 2303 printk(KERN_INFO "process %d (%s) no "
2304 "longer affine to cpu%d\n", 2304 "longer affine to cpu%d\n",
2305 task_pid_nr(p), p->comm, cpu); 2305 task_pid_nr(p), p->comm, cpu);
2306 } 2306 }
2307 } 2307 }
2308 2308
2309 return dest_cpu; 2309 return dest_cpu;
2310 } 2310 }
2311 2311
2312 /* 2312 /*
2313 * Gets called from 3 sites (exec, fork, wakeup). Since it is called without 2313 * Gets called from 3 sites (exec, fork, wakeup). Since it is called without
2314 * holding rq->lock, we need to ensure ->cpus_allowed is stable; this is done 2314 * holding rq->lock, we need to ensure ->cpus_allowed is stable; this is done
2315 * by: 2315 * by:
2316 * 2316 *
2317 * exec: is unstable, retry loop 2317 * exec: is unstable, retry loop
2318 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING 2318 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2319 */ 2319 */
2320 static inline 2320 static inline
2321 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2321 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2322 { 2322 {
2323 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2323 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2324 2324
2325 /* 2325 /*
2326 * In order not to call set_task_cpu() on a blocking task we need 2326 * In order not to call set_task_cpu() on a blocking task we need
2327 * to rely on ttwu() to place the task on a valid ->cpus_allowed 2327 * to rely on ttwu() to place the task on a valid ->cpus_allowed
2328 * cpu. 2328 * cpu.
2329 * 2329 *
2330 * Since this is common to all placement strategies, this lives here. 2330 * Since this is common to all placement strategies, this lives here.
2331 * 2331 *
2332 * [ this allows ->select_task() to simply return task_cpu(p) and 2332 * [ this allows ->select_task() to simply return task_cpu(p) and
2333 * not worry about this generic constraint ] 2333 * not worry about this generic constraint ]
2334 */ 2334 */
2335 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || 2335 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2336 !cpu_online(cpu))) 2336 !cpu_online(cpu)))
2337 cpu = select_fallback_rq(task_cpu(p), p); 2337 cpu = select_fallback_rq(task_cpu(p), p);
2338 2338
2339 return cpu; 2339 return cpu;
2340 } 2340 }
2341 #endif 2341 #endif
2342 2342
2343 /*** 2343 /***
2344 * try_to_wake_up - wake up a thread 2344 * try_to_wake_up - wake up a thread
2345 * @p: the to-be-woken-up thread 2345 * @p: the to-be-woken-up thread
2346 * @state: the mask of task states that can be woken 2346 * @state: the mask of task states that can be woken
2347 * @sync: do a synchronous wakeup? 2347 * @sync: do a synchronous wakeup?
2348 * 2348 *
2349 * Put it on the run-queue if it's not already there. The "current" 2349 * Put it on the run-queue if it's not already there. The "current"
2350 * thread is always on the run-queue (except when the actual 2350 * thread is always on the run-queue (except when the actual
2351 * re-schedule is in progress), and as such you're allowed to do 2351 * re-schedule is in progress), and as such you're allowed to do
2352 * the simpler "current->state = TASK_RUNNING" to mark yourself 2352 * the simpler "current->state = TASK_RUNNING" to mark yourself
2353 * runnable without the overhead of this. 2353 * runnable without the overhead of this.
2354 * 2354 *
2355 * Returns failure only if the task is already active. 2355 * Returns failure only if the task is already active.
2356 */ 2356 */
2357 static int try_to_wake_up(struct task_struct *p, unsigned int state, 2357 static int try_to_wake_up(struct task_struct *p, unsigned int state,
2358 int wake_flags) 2358 int wake_flags)
2359 { 2359 {
2360 int cpu, orig_cpu, this_cpu, success = 0; 2360 int cpu, orig_cpu, this_cpu, success = 0;
2361 unsigned long flags; 2361 unsigned long flags;
2362 struct rq *rq; 2362 struct rq *rq;
2363 2363
2364 if (!sched_feat(SYNC_WAKEUPS)) 2364 if (!sched_feat(SYNC_WAKEUPS))
2365 wake_flags &= ~WF_SYNC; 2365 wake_flags &= ~WF_SYNC;
2366 2366
2367 this_cpu = get_cpu(); 2367 this_cpu = get_cpu();
2368 2368
2369 smp_wmb(); 2369 smp_wmb();
2370 rq = task_rq_lock(p, &flags); 2370 rq = task_rq_lock(p, &flags);
2371 update_rq_clock(rq); 2371 update_rq_clock(rq);
2372 if (!(p->state & state)) 2372 if (!(p->state & state))
2373 goto out; 2373 goto out;
2374 2374
2375 if (p->se.on_rq) 2375 if (p->se.on_rq)
2376 goto out_running; 2376 goto out_running;
2377 2377
2378 cpu = task_cpu(p); 2378 cpu = task_cpu(p);
2379 orig_cpu = cpu; 2379 orig_cpu = cpu;
2380 2380
2381 #ifdef CONFIG_SMP 2381 #ifdef CONFIG_SMP
2382 if (unlikely(task_running(rq, p))) 2382 if (unlikely(task_running(rq, p)))
2383 goto out_activate; 2383 goto out_activate;
2384 2384
2385 /* 2385 /*
2386 * In order to handle concurrent wakeups and release the rq->lock 2386 * In order to handle concurrent wakeups and release the rq->lock
2387 * we put the task in TASK_WAKING state. 2387 * we put the task in TASK_WAKING state.
2388 * 2388 *
2389 * First fix up the nr_uninterruptible count: 2389 * First fix up the nr_uninterruptible count:
2390 */ 2390 */
2391 if (task_contributes_to_load(p)) 2391 if (task_contributes_to_load(p))
2392 rq->nr_uninterruptible--; 2392 rq->nr_uninterruptible--;
2393 p->state = TASK_WAKING; 2393 p->state = TASK_WAKING;
2394 2394
2395 if (p->sched_class->task_waking) 2395 if (p->sched_class->task_waking)
2396 p->sched_class->task_waking(rq, p); 2396 p->sched_class->task_waking(rq, p);
2397 2397
2398 __task_rq_unlock(rq); 2398 __task_rq_unlock(rq);
2399 2399
2400 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2400 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2401 if (cpu != orig_cpu) { 2401 if (cpu != orig_cpu) {
2402 /* 2402 /*
2403 * Since we migrate the task without holding any rq->lock, 2403 * Since we migrate the task without holding any rq->lock,
2404 * we need to be careful with task_rq_lock(), since that 2404 * we need to be careful with task_rq_lock(), since that
2405 * might end up locking an invalid rq. 2405 * might end up locking an invalid rq.
2406 */ 2406 */
2407 set_task_cpu(p, cpu); 2407 set_task_cpu(p, cpu);
2408 } 2408 }
2409 2409
2410 rq = cpu_rq(cpu); 2410 rq = cpu_rq(cpu);
2411 raw_spin_lock(&rq->lock); 2411 raw_spin_lock(&rq->lock);
2412 update_rq_clock(rq); 2412 update_rq_clock(rq);
2413 2413
2414 /* 2414 /*
2415 * We migrated the task without holding either rq->lock, however 2415 * We migrated the task without holding either rq->lock, however
2416 * since the task is not on the task list itself, nobody else 2416 * since the task is not on the task list itself, nobody else
2417 * will try and migrate the task, hence the rq should match the 2417 * will try and migrate the task, hence the rq should match the
2418 * cpu we just moved it to. 2418 * cpu we just moved it to.
2419 */ 2419 */
2420 WARN_ON(task_cpu(p) != cpu); 2420 WARN_ON(task_cpu(p) != cpu);
2421 WARN_ON(p->state != TASK_WAKING); 2421 WARN_ON(p->state != TASK_WAKING);
2422 2422
2423 #ifdef CONFIG_SCHEDSTATS 2423 #ifdef CONFIG_SCHEDSTATS
2424 schedstat_inc(rq, ttwu_count); 2424 schedstat_inc(rq, ttwu_count);
2425 if (cpu == this_cpu) 2425 if (cpu == this_cpu)
2426 schedstat_inc(rq, ttwu_local); 2426 schedstat_inc(rq, ttwu_local);
2427 else { 2427 else {
2428 struct sched_domain *sd; 2428 struct sched_domain *sd;
2429 for_each_domain(this_cpu, sd) { 2429 for_each_domain(this_cpu, sd) {
2430 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 2430 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2431 schedstat_inc(sd, ttwu_wake_remote); 2431 schedstat_inc(sd, ttwu_wake_remote);
2432 break; 2432 break;
2433 } 2433 }
2434 } 2434 }
2435 } 2435 }
2436 #endif /* CONFIG_SCHEDSTATS */ 2436 #endif /* CONFIG_SCHEDSTATS */
2437 2437
2438 out_activate: 2438 out_activate:
2439 #endif /* CONFIG_SMP */ 2439 #endif /* CONFIG_SMP */
2440 schedstat_inc(p, se.nr_wakeups); 2440 schedstat_inc(p, se.nr_wakeups);
2441 if (wake_flags & WF_SYNC) 2441 if (wake_flags & WF_SYNC)
2442 schedstat_inc(p, se.nr_wakeups_sync); 2442 schedstat_inc(p, se.nr_wakeups_sync);
2443 if (orig_cpu != cpu) 2443 if (orig_cpu != cpu)
2444 schedstat_inc(p, se.nr_wakeups_migrate); 2444 schedstat_inc(p, se.nr_wakeups_migrate);
2445 if (cpu == this_cpu) 2445 if (cpu == this_cpu)
2446 schedstat_inc(p, se.nr_wakeups_local); 2446 schedstat_inc(p, se.nr_wakeups_local);
2447 else 2447 else
2448 schedstat_inc(p, se.nr_wakeups_remote); 2448 schedstat_inc(p, se.nr_wakeups_remote);
2449 activate_task(rq, p, 1); 2449 activate_task(rq, p, 1);
2450 success = 1; 2450 success = 1;
2451 2451
2452 /* 2452 /*
2453 * Only attribute actual wakeups done by this task. 2453 * Only attribute actual wakeups done by this task.
2454 */ 2454 */
2455 if (!in_interrupt()) { 2455 if (!in_interrupt()) {
2456 struct sched_entity *se = &current->se; 2456 struct sched_entity *se = &current->se;
2457 u64 sample = se->sum_exec_runtime; 2457 u64 sample = se->sum_exec_runtime;
2458 2458
2459 if (se->last_wakeup) 2459 if (se->last_wakeup)
2460 sample -= se->last_wakeup; 2460 sample -= se->last_wakeup;
2461 else 2461 else
2462 sample -= se->start_runtime; 2462 sample -= se->start_runtime;
2463 update_avg(&se->avg_wakeup, sample); 2463 update_avg(&se->avg_wakeup, sample);
2464 2464
2465 se->last_wakeup = se->sum_exec_runtime; 2465 se->last_wakeup = se->sum_exec_runtime;
2466 } 2466 }
2467 2467
2468 out_running: 2468 out_running:
2469 trace_sched_wakeup(rq, p, success); 2469 trace_sched_wakeup(rq, p, success);
2470 check_preempt_curr(rq, p, wake_flags); 2470 check_preempt_curr(rq, p, wake_flags);
2471 2471
2472 p->state = TASK_RUNNING; 2472 p->state = TASK_RUNNING;
2473 #ifdef CONFIG_SMP 2473 #ifdef CONFIG_SMP
2474 if (p->sched_class->task_woken) 2474 if (p->sched_class->task_woken)
2475 p->sched_class->task_woken(rq, p); 2475 p->sched_class->task_woken(rq, p);
2476 2476
2477 if (unlikely(rq->idle_stamp)) { 2477 if (unlikely(rq->idle_stamp)) {
2478 u64 delta = rq->clock - rq->idle_stamp; 2478 u64 delta = rq->clock - rq->idle_stamp;
2479 u64 max = 2*sysctl_sched_migration_cost; 2479 u64 max = 2*sysctl_sched_migration_cost;
2480 2480
2481 if (delta > max) 2481 if (delta > max)
2482 rq->avg_idle = max; 2482 rq->avg_idle = max;
2483 else 2483 else
2484 update_avg(&rq->avg_idle, delta); 2484 update_avg(&rq->avg_idle, delta);
2485 rq->idle_stamp = 0; 2485 rq->idle_stamp = 0;
2486 } 2486 }
2487 #endif 2487 #endif
2488 out: 2488 out:
2489 task_rq_unlock(rq, &flags); 2489 task_rq_unlock(rq, &flags);
2490 put_cpu(); 2490 put_cpu();
2491 2491
2492 return success; 2492 return success;
2493 } 2493 }
2494 2494
2495 /** 2495 /**
2496 * wake_up_process - Wake up a specific process 2496 * wake_up_process - Wake up a specific process
2497 * @p: The process to be woken up. 2497 * @p: The process to be woken up.
2498 * 2498 *
2499 * Attempt to wake up the nominated process and move it to the set of runnable 2499 * Attempt to wake up the nominated process and move it to the set of runnable
2500 * processes. Returns 1 if the process was woken up, 0 if it was already 2500 * processes. Returns 1 if the process was woken up, 0 if it was already
2501 * running. 2501 * running.
2502 * 2502 *
2503 * It may be assumed that this function implies a write memory barrier before 2503 * It may be assumed that this function implies a write memory barrier before
2504 * changing the task state if and only if any tasks are woken up. 2504 * changing the task state if and only if any tasks are woken up.
2505 */ 2505 */
2506 int wake_up_process(struct task_struct *p) 2506 int wake_up_process(struct task_struct *p)
2507 { 2507 {
2508 return try_to_wake_up(p, TASK_ALL, 0); 2508 return try_to_wake_up(p, TASK_ALL, 0);
2509 } 2509 }
2510 EXPORT_SYMBOL(wake_up_process); 2510 EXPORT_SYMBOL(wake_up_process);
2511 2511
2512 int wake_up_state(struct task_struct *p, unsigned int state) 2512 int wake_up_state(struct task_struct *p, unsigned int state)
2513 { 2513 {
2514 return try_to_wake_up(p, state, 0); 2514 return try_to_wake_up(p, state, 0);
2515 } 2515 }
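/*
 * Editor's sketch (not part of this file): wake_up_process() /
 * wake_up_state() pair with the standard prepare-to-sleep pattern on the
 * other side.  'condition' and 'sleeper' are illustrative names:
 */
	/* sleeper */
	set_current_state(TASK_INTERRUPTIBLE);
	while (!condition) {
		schedule();
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);

	/* waker */
	condition = 1;
	smp_mb();		/* order the store against the sleeper's check */
	wake_up_process(sleeper);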
2516 2516
2517 /* 2517 /*
2518 * Perform scheduler related setup for a newly forked process p. 2518 * Perform scheduler related setup for a newly forked process p.
2519 * p is forked by current. 2519 * p is forked by current.
2520 * 2520 *
2521 * __sched_fork() is basic setup used by init_idle() too: 2521 * __sched_fork() is basic setup used by init_idle() too:
2522 */ 2522 */
2523 static void __sched_fork(struct task_struct *p) 2523 static void __sched_fork(struct task_struct *p)
2524 { 2524 {
2525 p->se.exec_start = 0; 2525 p->se.exec_start = 0;
2526 p->se.sum_exec_runtime = 0; 2526 p->se.sum_exec_runtime = 0;
2527 p->se.prev_sum_exec_runtime = 0; 2527 p->se.prev_sum_exec_runtime = 0;
2528 p->se.nr_migrations = 0; 2528 p->se.nr_migrations = 0;
2529 p->se.last_wakeup = 0; 2529 p->se.last_wakeup = 0;
2530 p->se.avg_overlap = 0; 2530 p->se.avg_overlap = 0;
2531 p->se.start_runtime = 0; 2531 p->se.start_runtime = 0;
2532 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2532 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2533 2533
2534 #ifdef CONFIG_SCHEDSTATS 2534 #ifdef CONFIG_SCHEDSTATS
2535 p->se.wait_start = 0; 2535 p->se.wait_start = 0;
2536 p->se.wait_max = 0; 2536 p->se.wait_max = 0;
2537 p->se.wait_count = 0; 2537 p->se.wait_count = 0;
2538 p->se.wait_sum = 0; 2538 p->se.wait_sum = 0;
2539 2539
2540 p->se.sleep_start = 0; 2540 p->se.sleep_start = 0;
2541 p->se.sleep_max = 0; 2541 p->se.sleep_max = 0;
2542 p->se.sum_sleep_runtime = 0; 2542 p->se.sum_sleep_runtime = 0;
2543 2543
2544 p->se.block_start = 0; 2544 p->se.block_start = 0;
2545 p->se.block_max = 0; 2545 p->se.block_max = 0;
2546 p->se.exec_max = 0; 2546 p->se.exec_max = 0;
2547 p->se.slice_max = 0; 2547 p->se.slice_max = 0;
2548 2548
2549 p->se.nr_migrations_cold = 0; 2549 p->se.nr_migrations_cold = 0;
2550 p->se.nr_failed_migrations_affine = 0; 2550 p->se.nr_failed_migrations_affine = 0;
2551 p->se.nr_failed_migrations_running = 0; 2551 p->se.nr_failed_migrations_running = 0;
2552 p->se.nr_failed_migrations_hot = 0; 2552 p->se.nr_failed_migrations_hot = 0;
2553 p->se.nr_forced_migrations = 0; 2553 p->se.nr_forced_migrations = 0;
2554 2554
2555 p->se.nr_wakeups = 0; 2555 p->se.nr_wakeups = 0;
2556 p->se.nr_wakeups_sync = 0; 2556 p->se.nr_wakeups_sync = 0;
2557 p->se.nr_wakeups_migrate = 0; 2557 p->se.nr_wakeups_migrate = 0;
2558 p->se.nr_wakeups_local = 0; 2558 p->se.nr_wakeups_local = 0;
2559 p->se.nr_wakeups_remote = 0; 2559 p->se.nr_wakeups_remote = 0;
2560 p->se.nr_wakeups_affine = 0; 2560 p->se.nr_wakeups_affine = 0;
2561 p->se.nr_wakeups_affine_attempts = 0; 2561 p->se.nr_wakeups_affine_attempts = 0;
2562 p->se.nr_wakeups_passive = 0; 2562 p->se.nr_wakeups_passive = 0;
2563 p->se.nr_wakeups_idle = 0; 2563 p->se.nr_wakeups_idle = 0;
2564 2564
2565 #endif 2565 #endif
2566 2566
2567 INIT_LIST_HEAD(&p->rt.run_list); 2567 INIT_LIST_HEAD(&p->rt.run_list);
2568 p->se.on_rq = 0; 2568 p->se.on_rq = 0;
2569 INIT_LIST_HEAD(&p->se.group_node); 2569 INIT_LIST_HEAD(&p->se.group_node);
2570 2570
2571 #ifdef CONFIG_PREEMPT_NOTIFIERS 2571 #ifdef CONFIG_PREEMPT_NOTIFIERS
2572 INIT_HLIST_HEAD(&p->preempt_notifiers); 2572 INIT_HLIST_HEAD(&p->preempt_notifiers);
2573 #endif 2573 #endif
2574 } 2574 }
2575 2575
2576 /* 2576 /*
2577 * fork()/clone()-time setup: 2577 * fork()/clone()-time setup:
2578 */ 2578 */
2579 void sched_fork(struct task_struct *p, int clone_flags) 2579 void sched_fork(struct task_struct *p, int clone_flags)
2580 { 2580 {
2581 int cpu = get_cpu(); 2581 int cpu = get_cpu();
2582 2582
2583 __sched_fork(p); 2583 __sched_fork(p);
2584 /* 2584 /*
2585 * We mark the process as waking here. This guarantees that 2585 * We mark the process as waking here. This guarantees that
2586 * nobody will actually run it, and a signal or other external 2586 * nobody will actually run it, and a signal or other external
2587 * event cannot wake it up and insert it on the runqueue either. 2587 * event cannot wake it up and insert it on the runqueue either.
2588 */ 2588 */
2589 p->state = TASK_WAKING; 2589 p->state = TASK_WAKING;
2590 2590
2591 /* 2591 /*
2592 * Revert to default priority/policy on fork if requested. 2592 * Revert to default priority/policy on fork if requested.
2593 */ 2593 */
2594 if (unlikely(p->sched_reset_on_fork)) { 2594 if (unlikely(p->sched_reset_on_fork)) {
2595 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 2595 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2596 p->policy = SCHED_NORMAL; 2596 p->policy = SCHED_NORMAL;
2597 p->normal_prio = p->static_prio; 2597 p->normal_prio = p->static_prio;
2598 } 2598 }
2599 2599
2600 if (PRIO_TO_NICE(p->static_prio) < 0) { 2600 if (PRIO_TO_NICE(p->static_prio) < 0) {
2601 p->static_prio = NICE_TO_PRIO(0); 2601 p->static_prio = NICE_TO_PRIO(0);
2602 p->normal_prio = p->static_prio; 2602 p->normal_prio = p->static_prio;
2603 set_load_weight(p); 2603 set_load_weight(p);
2604 } 2604 }
2605 2605
2606 /* 2606 /*
2607 * We don't need the reset flag anymore after the fork. It has 2607 * We don't need the reset flag anymore after the fork. It has
2608 * fulfilled its duty: 2608 * fulfilled its duty:
2609 */ 2609 */
2610 p->sched_reset_on_fork = 0; 2610 p->sched_reset_on_fork = 0;
2611 } 2611 }
2612 2612
2613 /* 2613 /*
2614 * Make sure we do not leak PI boosting priority to the child. 2614 * Make sure we do not leak PI boosting priority to the child.
2615 */ 2615 */
2616 p->prio = current->normal_prio; 2616 p->prio = current->normal_prio;
2617 2617
2618 if (!rt_prio(p->prio)) 2618 if (!rt_prio(p->prio))
2619 p->sched_class = &fair_sched_class; 2619 p->sched_class = &fair_sched_class;
2620 2620
2621 if (p->sched_class->task_fork) 2621 if (p->sched_class->task_fork)
2622 p->sched_class->task_fork(p); 2622 p->sched_class->task_fork(p);
2623 2623
2624 set_task_cpu(p, cpu); 2624 set_task_cpu(p, cpu);
2625 2625
2626 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2626 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2627 if (likely(sched_info_on())) 2627 if (likely(sched_info_on()))
2628 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2628 memset(&p->sched_info, 0, sizeof(p->sched_info));
2629 #endif 2629 #endif
2630 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2630 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2631 p->oncpu = 0; 2631 p->oncpu = 0;
2632 #endif 2632 #endif
2633 #ifdef CONFIG_PREEMPT 2633 #ifdef CONFIG_PREEMPT
2634 /* Want to start with kernel preemption disabled. */ 2634 /* Want to start with kernel preemption disabled. */
2635 task_thread_info(p)->preempt_count = 1; 2635 task_thread_info(p)->preempt_count = 1;
2636 #endif 2636 #endif
2637 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2637 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2638 2638
2639 put_cpu(); 2639 put_cpu();
2640 } 2640 }
2641 2641
2642 /* 2642 /*
2643 * wake_up_new_task - wake up a newly created task for the first time. 2643 * wake_up_new_task - wake up a newly created task for the first time.
2644 * 2644 *
2645 * This function will do some initial scheduler statistics housekeeping 2645 * This function will do some initial scheduler statistics housekeeping
2646 * that must be done for every newly created context, then puts the task 2646 * that must be done for every newly created context, then puts the task
2647 * on the runqueue and wakes it. 2647 * on the runqueue and wakes it.
2648 */ 2648 */
2649 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2649 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2650 { 2650 {
2651 unsigned long flags; 2651 unsigned long flags;
2652 struct rq *rq; 2652 struct rq *rq;
2653 int cpu __maybe_unused = get_cpu(); 2653 int cpu __maybe_unused = get_cpu();
2654 2654
2655 #ifdef CONFIG_SMP 2655 #ifdef CONFIG_SMP
2656 /* 2656 /*
2657 * Fork balancing, do it here and not earlier because: 2657 * Fork balancing, do it here and not earlier because:
2658 * - cpus_allowed can change in the fork path 2658 * - cpus_allowed can change in the fork path
2659 * - any previously selected cpu might disappear through hotplug 2659 * - any previously selected cpu might disappear through hotplug
2660 * 2660 *
2661 * We still have TASK_WAKING but PF_STARTING is gone now, meaning 2661 * We still have TASK_WAKING but PF_STARTING is gone now, meaning
2662 * ->cpus_allowed is stable, we have preemption disabled, meaning 2662 * ->cpus_allowed is stable, we have preemption disabled, meaning
2663 * cpu_online_mask is stable. 2663 * cpu_online_mask is stable.
2664 */ 2664 */
2665 cpu = select_task_rq(p, SD_BALANCE_FORK, 0); 2665 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2666 set_task_cpu(p, cpu); 2666 set_task_cpu(p, cpu);
2667 #endif 2667 #endif
2668 2668
2669 /* 2669 /*
2670 * Since the task is not on the rq and we still have TASK_WAKING set 2670 * Since the task is not on the rq and we still have TASK_WAKING set
2671 * nobody else will migrate this task. 2671 * nobody else will migrate this task.
2672 */ 2672 */
2673 rq = cpu_rq(cpu); 2673 rq = cpu_rq(cpu);
2674 raw_spin_lock_irqsave(&rq->lock, flags); 2674 raw_spin_lock_irqsave(&rq->lock, flags);
2675 2675
2676 BUG_ON(p->state != TASK_WAKING); 2676 BUG_ON(p->state != TASK_WAKING);
2677 p->state = TASK_RUNNING; 2677 p->state = TASK_RUNNING;
2678 update_rq_clock(rq); 2678 update_rq_clock(rq);
2679 activate_task(rq, p, 0); 2679 activate_task(rq, p, 0);
2680 trace_sched_wakeup_new(rq, p, 1); 2680 trace_sched_wakeup_new(rq, p, 1);
2681 check_preempt_curr(rq, p, WF_FORK); 2681 check_preempt_curr(rq, p, WF_FORK);
2682 #ifdef CONFIG_SMP 2682 #ifdef CONFIG_SMP
2683 if (p->sched_class->task_woken) 2683 if (p->sched_class->task_woken)
2684 p->sched_class->task_woken(rq, p); 2684 p->sched_class->task_woken(rq, p);
2685 #endif 2685 #endif
2686 task_rq_unlock(rq, &flags); 2686 task_rq_unlock(rq, &flags);
2687 put_cpu(); 2687 put_cpu();
2688 } 2688 }
2689 2689
2690 #ifdef CONFIG_PREEMPT_NOTIFIERS 2690 #ifdef CONFIG_PREEMPT_NOTIFIERS
2691 2691
2692 /** 2692 /**
2693 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2693 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2694 * @notifier: notifier struct to register 2694 * @notifier: notifier struct to register
2695 */ 2695 */
2696 void preempt_notifier_register(struct preempt_notifier *notifier) 2696 void preempt_notifier_register(struct preempt_notifier *notifier)
2697 { 2697 {
2698 hlist_add_head(&notifier->link, &current->preempt_notifiers); 2698 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2699 } 2699 }
2700 EXPORT_SYMBOL_GPL(preempt_notifier_register); 2700 EXPORT_SYMBOL_GPL(preempt_notifier_register);
2701 2701
2702 /** 2702 /**
2703 * preempt_notifier_unregister - no longer interested in preemption notifications 2703 * preempt_notifier_unregister - no longer interested in preemption notifications
2704 * @notifier: notifier struct to unregister 2704 * @notifier: notifier struct to unregister
2705 * 2705 *
2706 * This is safe to call from within a preemption notifier. 2706 * This is safe to call from within a preemption notifier.
2707 */ 2707 */
2708 void preempt_notifier_unregister(struct preempt_notifier *notifier) 2708 void preempt_notifier_unregister(struct preempt_notifier *notifier)
2709 { 2709 {
2710 hlist_del(&notifier->link); 2710 hlist_del(&notifier->link);
2711 } 2711 }
2712 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2712 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
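/*
 * Editor's sketch (not part of this file): registering a preempt
 * notifier, assuming the preempt_ops / preempt_notifier_init() helpers
 * from <linux/preempt.h>.  KVM's vcpu load/put hooks are the canonical
 * in-tree user; the callbacks below are empty placeholders:
 */
	static void my_sched_in(struct preempt_notifier *pn, int cpu)
	{
		/* current is being scheduled back in on 'cpu' */
	}

	static void my_sched_out(struct preempt_notifier *pn,
				 struct task_struct *next)
	{
		/* current is being preempted in favour of 'next' */
	}

	static struct preempt_ops my_preempt_ops = {
		.sched_in	= my_sched_in,
		.sched_out	= my_sched_out,
	};

	static struct preempt_notifier my_notifier;

	/* from the task that wants the callbacks (CONFIG_PREEMPT_NOTIFIERS=y) */
	preempt_notifier_init(&my_notifier, &my_preempt_ops);
	preempt_notifier_register(&my_notifier);
	/* ... and later ... */
	preempt_notifier_unregister(&my_notifier);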
2713 2713
2714 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2714 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2715 { 2715 {
2716 struct preempt_notifier *notifier; 2716 struct preempt_notifier *notifier;
2717 struct hlist_node *node; 2717 struct hlist_node *node;
2718 2718
2719 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2719 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2720 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2720 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2721 } 2721 }
2722 2722
2723 static void 2723 static void
2724 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2724 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2725 struct task_struct *next) 2725 struct task_struct *next)
2726 { 2726 {
2727 struct preempt_notifier *notifier; 2727 struct preempt_notifier *notifier;
2728 struct hlist_node *node; 2728 struct hlist_node *node;
2729 2729
2730 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2730 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2731 notifier->ops->sched_out(notifier, next); 2731 notifier->ops->sched_out(notifier, next);
2732 } 2732 }
2733 2733
2734 #else /* !CONFIG_PREEMPT_NOTIFIERS */ 2734 #else /* !CONFIG_PREEMPT_NOTIFIERS */
2735 2735
2736 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2736 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2737 { 2737 {
2738 } 2738 }
2739 2739
2740 static void 2740 static void
2741 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2741 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2742 struct task_struct *next) 2742 struct task_struct *next)
2743 { 2743 {
2744 } 2744 }
2745 2745
2746 #endif /* CONFIG_PREEMPT_NOTIFIERS */ 2746 #endif /* CONFIG_PREEMPT_NOTIFIERS */
2747 2747
2748 /** 2748 /**
2749 * prepare_task_switch - prepare to switch tasks 2749 * prepare_task_switch - prepare to switch tasks
2750 * @rq: the runqueue preparing to switch 2750 * @rq: the runqueue preparing to switch
2751 * @prev: the current task that is being switched out 2751 * @prev: the current task that is being switched out
2752 * @next: the task we are going to switch to. 2752 * @next: the task we are going to switch to.
2753 * 2753 *
2754 * This is called with the rq lock held and interrupts off. It must 2754 * This is called with the rq lock held and interrupts off. It must
2755 * be paired with a subsequent finish_task_switch after the context 2755 * be paired with a subsequent finish_task_switch after the context
2756 * switch. 2756 * switch.
2757 * 2757 *
2758 * prepare_task_switch sets up locking and calls architecture specific 2758 * prepare_task_switch sets up locking and calls architecture specific
2759 * hooks. 2759 * hooks.
2760 */ 2760 */
2761 static inline void 2761 static inline void
2762 prepare_task_switch(struct rq *rq, struct task_struct *prev, 2762 prepare_task_switch(struct rq *rq, struct task_struct *prev,
2763 struct task_struct *next) 2763 struct task_struct *next)
2764 { 2764 {
2765 fire_sched_out_preempt_notifiers(prev, next); 2765 fire_sched_out_preempt_notifiers(prev, next);
2766 prepare_lock_switch(rq, next); 2766 prepare_lock_switch(rq, next);
2767 prepare_arch_switch(next); 2767 prepare_arch_switch(next);
2768 } 2768 }
2769 2769
2770 /** 2770 /**
2771 * finish_task_switch - clean up after a task-switch 2771 * finish_task_switch - clean up after a task-switch
2772 * @rq: runqueue associated with task-switch 2772 * @rq: runqueue associated with task-switch
2773 * @prev: the thread we just switched away from. 2773 * @prev: the thread we just switched away from.
2774 * 2774 *
2775 * finish_task_switch must be called after the context switch, paired 2775 * finish_task_switch must be called after the context switch, paired
2776 * with a prepare_task_switch call before the context switch. 2776 * with a prepare_task_switch call before the context switch.
2777 * finish_task_switch will reconcile locking set up by prepare_task_switch, 2777 * finish_task_switch will reconcile locking set up by prepare_task_switch,
2778 * and do any other architecture-specific cleanup actions. 2778 * and do any other architecture-specific cleanup actions.
2779 * 2779 *
2780 * Note that we may have delayed dropping an mm in context_switch(). If 2780 * Note that we may have delayed dropping an mm in context_switch(). If
2781 * so, we finish that here outside of the runqueue lock. (Doing it 2781 * so, we finish that here outside of the runqueue lock. (Doing it
2782 * with the lock held can cause deadlocks; see schedule() for 2782 * with the lock held can cause deadlocks; see schedule() for
2783 * details.) 2783 * details.)
2784 */ 2784 */
2785 static void finish_task_switch(struct rq *rq, struct task_struct *prev) 2785 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2786 __releases(rq->lock) 2786 __releases(rq->lock)
2787 { 2787 {
2788 struct mm_struct *mm = rq->prev_mm; 2788 struct mm_struct *mm = rq->prev_mm;
2789 long prev_state; 2789 long prev_state;
2790 2790
2791 rq->prev_mm = NULL; 2791 rq->prev_mm = NULL;
2792 2792
2793 /* 2793 /*
2794 * A task struct has one reference for the use as "current". 2794 * A task struct has one reference for the use as "current".
2795 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 2795 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2796 * schedule one last time. The schedule call will never return, and 2796 * schedule one last time. The schedule call will never return, and
2797 * the scheduled task must drop that reference. 2797 * the scheduled task must drop that reference.
2798 * The test for TASK_DEAD must occur while the runqueue locks are 2798 * The test for TASK_DEAD must occur while the runqueue locks are
2799 * still held, otherwise prev could be scheduled on another cpu, die 2799 * still held, otherwise prev could be scheduled on another cpu, die
2800 * there before we look at prev->state, and then the reference would 2800 * there before we look at prev->state, and then the reference would
2801 * be dropped twice. 2801 * be dropped twice.
2802 * Manfred Spraul <manfred@colorfullife.com> 2802 * Manfred Spraul <manfred@colorfullife.com>
2803 */ 2803 */
2804 prev_state = prev->state; 2804 prev_state = prev->state;
2805 finish_arch_switch(prev); 2805 finish_arch_switch(prev);
2806 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2806 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2807 local_irq_disable(); 2807 local_irq_disable();
2808 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 2808 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2809 perf_event_task_sched_in(current); 2809 perf_event_task_sched_in(current);
2810 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2810 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2811 local_irq_enable(); 2811 local_irq_enable();
2812 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 2812 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2813 finish_lock_switch(rq, prev); 2813 finish_lock_switch(rq, prev);
2814 2814
2815 fire_sched_in_preempt_notifiers(current); 2815 fire_sched_in_preempt_notifiers(current);
2816 if (mm) 2816 if (mm)
2817 mmdrop(mm); 2817 mmdrop(mm);
2818 if (unlikely(prev_state == TASK_DEAD)) { 2818 if (unlikely(prev_state == TASK_DEAD)) {
2819 /* 2819 /*
2820 * Remove function-return probe instances associated with this 2820 * Remove function-return probe instances associated with this
2821 * task and put them back on the free list. 2821 * task and put them back on the free list.
2822 */ 2822 */
2823 kprobe_flush_task(prev); 2823 kprobe_flush_task(prev);
2824 put_task_struct(prev); 2824 put_task_struct(prev);
2825 } 2825 }
2826 } 2826 }
2827 2827
2828 #ifdef CONFIG_SMP 2828 #ifdef CONFIG_SMP
2829 2829
2830 /* assumes rq->lock is held */ 2830 /* assumes rq->lock is held */
2831 static inline void pre_schedule(struct rq *rq, struct task_struct *prev) 2831 static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2832 { 2832 {
2833 if (prev->sched_class->pre_schedule) 2833 if (prev->sched_class->pre_schedule)
2834 prev->sched_class->pre_schedule(rq, prev); 2834 prev->sched_class->pre_schedule(rq, prev);
2835 } 2835 }
2836 2836
2837 /* rq->lock is NOT held, but preemption is disabled */ 2837 /* rq->lock is NOT held, but preemption is disabled */
2838 static inline void post_schedule(struct rq *rq) 2838 static inline void post_schedule(struct rq *rq)
2839 { 2839 {
2840 if (rq->post_schedule) { 2840 if (rq->post_schedule) {
2841 unsigned long flags; 2841 unsigned long flags;
2842 2842
2843 raw_spin_lock_irqsave(&rq->lock, flags); 2843 raw_spin_lock_irqsave(&rq->lock, flags);
2844 if (rq->curr->sched_class->post_schedule) 2844 if (rq->curr->sched_class->post_schedule)
2845 rq->curr->sched_class->post_schedule(rq); 2845 rq->curr->sched_class->post_schedule(rq);
2846 raw_spin_unlock_irqrestore(&rq->lock, flags); 2846 raw_spin_unlock_irqrestore(&rq->lock, flags);
2847 2847
2848 rq->post_schedule = 0; 2848 rq->post_schedule = 0;
2849 } 2849 }
2850 } 2850 }
2851 2851
2852 #else 2852 #else
2853 2853
2854 static inline void pre_schedule(struct rq *rq, struct task_struct *p) 2854 static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2855 { 2855 {
2856 } 2856 }
2857 2857
2858 static inline void post_schedule(struct rq *rq) 2858 static inline void post_schedule(struct rq *rq)
2859 { 2859 {
2860 } 2860 }
2861 2861
2862 #endif 2862 #endif
2863 2863
2864 /** 2864 /**
2865 * schedule_tail - first thing a freshly forked thread must call. 2865 * schedule_tail - first thing a freshly forked thread must call.
2866 * @prev: the thread we just switched away from. 2866 * @prev: the thread we just switched away from.
2867 */ 2867 */
2868 asmlinkage void schedule_tail(struct task_struct *prev) 2868 asmlinkage void schedule_tail(struct task_struct *prev)
2869 __releases(rq->lock) 2869 __releases(rq->lock)
2870 { 2870 {
2871 struct rq *rq = this_rq(); 2871 struct rq *rq = this_rq();
2872 2872
2873 finish_task_switch(rq, prev); 2873 finish_task_switch(rq, prev);
2874 2874
2875 /* 2875 /*
2876 * FIXME: do we need to worry about rq being invalidated by the 2876 * FIXME: do we need to worry about rq being invalidated by the
2877 * task_switch? 2877 * task_switch?
2878 */ 2878 */
2879 post_schedule(rq); 2879 post_schedule(rq);
2880 2880
2881 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 2881 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
2882 /* In this case, finish_task_switch does not reenable preemption */ 2882 /* In this case, finish_task_switch does not reenable preemption */
2883 preempt_enable(); 2883 preempt_enable();
2884 #endif 2884 #endif
2885 if (current->set_child_tid) 2885 if (current->set_child_tid)
2886 put_user(task_pid_vnr(current), current->set_child_tid); 2886 put_user(task_pid_vnr(current), current->set_child_tid);
2887 } 2887 }
2888 2888
2889 /* 2889 /*
2890 * context_switch - switch to the new MM and the new 2890 * context_switch - switch to the new MM and the new
2891 * thread's register state. 2891 * thread's register state.
2892 */ 2892 */
2893 static inline void 2893 static inline void
2894 context_switch(struct rq *rq, struct task_struct *prev, 2894 context_switch(struct rq *rq, struct task_struct *prev,
2895 struct task_struct *next) 2895 struct task_struct *next)
2896 { 2896 {
2897 struct mm_struct *mm, *oldmm; 2897 struct mm_struct *mm, *oldmm;
2898 2898
2899 prepare_task_switch(rq, prev, next); 2899 prepare_task_switch(rq, prev, next);
2900 trace_sched_switch(rq, prev, next); 2900 trace_sched_switch(rq, prev, next);
2901 mm = next->mm; 2901 mm = next->mm;
2902 oldmm = prev->active_mm; 2902 oldmm = prev->active_mm;
2903 /* 2903 /*
2904 * For paravirt, this is coupled with an exit in switch_to to 2904 * For paravirt, this is coupled with an exit in switch_to to
2905 * combine the page table reload and the switch backend into 2905 * combine the page table reload and the switch backend into
2906 * one hypercall. 2906 * one hypercall.
2907 */ 2907 */
2908 arch_start_context_switch(prev); 2908 arch_start_context_switch(prev);
2909 2909
2910 if (likely(!mm)) { 2910 if (likely(!mm)) {
2911 next->active_mm = oldmm; 2911 next->active_mm = oldmm;
2912 atomic_inc(&oldmm->mm_count); 2912 atomic_inc(&oldmm->mm_count);
2913 enter_lazy_tlb(oldmm, next); 2913 enter_lazy_tlb(oldmm, next);
2914 } else 2914 } else
2915 switch_mm(oldmm, mm, next); 2915 switch_mm(oldmm, mm, next);
2916 2916
2917 if (likely(!prev->mm)) { 2917 if (likely(!prev->mm)) {
2918 prev->active_mm = NULL; 2918 prev->active_mm = NULL;
2919 rq->prev_mm = oldmm; 2919 rq->prev_mm = oldmm;
2920 } 2920 }
2921 /* 2921 /*
2922 * The runqueue lock will be released by the next 2922 * The runqueue lock will be released by the next
2923 * task (which is an invalid locking op but in the case 2923 * task (which is an invalid locking op but in the case
2924 * of the scheduler it's an obvious special-case), so we 2924 * of the scheduler it's an obvious special-case), so we
2925 * do an early lockdep release here: 2925 * do an early lockdep release here:
2926 */ 2926 */
2927 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 2927 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
2928 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2928 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2929 #endif 2929 #endif
2930 2930
2931 /* Here we just switch the register state and the stack. */ 2931 /* Here we just switch the register state and the stack. */
2932 switch_to(prev, next, prev); 2932 switch_to(prev, next, prev);
2933 2933
2934 barrier(); 2934 barrier();
2935 /* 2935 /*
2936 * this_rq must be evaluated again because prev may have moved 2936 * this_rq must be evaluated again because prev may have moved
2937 * CPUs since it called schedule(), thus the 'rq' on its stack 2937 * CPUs since it called schedule(), thus the 'rq' on its stack
2938 * frame will be invalid. 2938 * frame will be invalid.
2939 */ 2939 */
2940 finish_task_switch(this_rq(), prev); 2940 finish_task_switch(this_rq(), prev);
2941 } 2941 }
2942 2942
2943 /* 2943 /*
2944 * nr_running, nr_uninterruptible and nr_context_switches: 2944 * nr_running, nr_uninterruptible and nr_context_switches:
2945 * 2945 *
2946 * externally visible scheduler statistics: current number of runnable 2946 * externally visible scheduler statistics: current number of runnable
2947 * threads, current number of uninterruptible-sleeping threads, total 2947 * threads, current number of uninterruptible-sleeping threads, total
2948 * number of context switches performed since bootup. 2948 * number of context switches performed since bootup.
2949 */ 2949 */
2950 unsigned long nr_running(void) 2950 unsigned long nr_running(void)
2951 { 2951 {
2952 unsigned long i, sum = 0; 2952 unsigned long i, sum = 0;
2953 2953
2954 for_each_online_cpu(i) 2954 for_each_online_cpu(i)
2955 sum += cpu_rq(i)->nr_running; 2955 sum += cpu_rq(i)->nr_running;
2956 2956
2957 return sum; 2957 return sum;
2958 } 2958 }
2959 2959
2960 unsigned long nr_uninterruptible(void) 2960 unsigned long nr_uninterruptible(void)
2961 { 2961 {
2962 unsigned long i, sum = 0; 2962 unsigned long i, sum = 0;
2963 2963
2964 for_each_possible_cpu(i) 2964 for_each_possible_cpu(i)
2965 sum += cpu_rq(i)->nr_uninterruptible; 2965 sum += cpu_rq(i)->nr_uninterruptible;
2966 2966
2967 /* 2967 /*
2968 * Since we read the counters lockless, it might be slightly 2968 * Since we read the counters lockless, it might be slightly
2969 * inaccurate. Do not allow it to go below zero though: 2969 * inaccurate. Do not allow it to go below zero though:
2970 */ 2970 */
2971 if (unlikely((long)sum < 0)) 2971 if (unlikely((long)sum < 0))
2972 sum = 0; 2972 sum = 0;
2973 2973
2974 return sum; 2974 return sum;
2975 } 2975 }
2976 2976
2977 unsigned long long nr_context_switches(void) 2977 unsigned long long nr_context_switches(void)
2978 { 2978 {
2979 int i; 2979 int i;
2980 unsigned long long sum = 0; 2980 unsigned long long sum = 0;
2981 2981
2982 for_each_possible_cpu(i) 2982 for_each_possible_cpu(i)
2983 sum += cpu_rq(i)->nr_switches; 2983 sum += cpu_rq(i)->nr_switches;
2984 2984
2985 return sum; 2985 return sum;
2986 } 2986 }
2987 2987
2988 unsigned long nr_iowait(void) 2988 unsigned long nr_iowait(void)
2989 { 2989 {
2990 unsigned long i, sum = 0; 2990 unsigned long i, sum = 0;
2991 2991
2992 for_each_possible_cpu(i) 2992 for_each_possible_cpu(i)
2993 sum += atomic_read(&cpu_rq(i)->nr_iowait); 2993 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2994 2994
2995 return sum; 2995 return sum;
2996 } 2996 }
2997 2997
2998 unsigned long nr_iowait_cpu(void) 2998 unsigned long nr_iowait_cpu(void)
2999 { 2999 {
3000 struct rq *this = this_rq(); 3000 struct rq *this = this_rq();
3001 return atomic_read(&this->nr_iowait); 3001 return atomic_read(&this->nr_iowait);
3002 } 3002 }
3003 3003
3004 unsigned long this_cpu_load(void) 3004 unsigned long this_cpu_load(void)
3005 { 3005 {
3006 struct rq *this = this_rq(); 3006 struct rq *this = this_rq();
3007 return this->cpu_load[0]; 3007 return this->cpu_load[0];
3008 } 3008 }
3009 3009
3010 3010
3011 /* Variables and functions for calc_load */ 3011 /* Variables and functions for calc_load */
3012 static atomic_long_t calc_load_tasks; 3012 static atomic_long_t calc_load_tasks;
3013 static unsigned long calc_load_update; 3013 static unsigned long calc_load_update;
3014 unsigned long avenrun[3]; 3014 unsigned long avenrun[3];
3015 EXPORT_SYMBOL(avenrun); 3015 EXPORT_SYMBOL(avenrun);
3016 3016
3017 /** 3017 /**
3018 * get_avenrun - get the load average array 3018 * get_avenrun - get the load average array
3019 * @loads: pointer to dest load array 3019 * @loads: pointer to dest load array
3020 * @offset: offset to add 3020 * @offset: offset to add
3021 * @shift: shift count to shift the result left 3021 * @shift: shift count to shift the result left
3022 * 3022 *
3023 * These values are estimates at best, so no need for locking. 3023 * These values are estimates at best, so no need for locking.
3024 */ 3024 */
3025 void get_avenrun(unsigned long *loads, unsigned long offset, int shift) 3025 void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3026 { 3026 {
3027 loads[0] = (avenrun[0] + offset) << shift; 3027 loads[0] = (avenrun[0] + offset) << shift;
3028 loads[1] = (avenrun[1] + offset) << shift; 3028 loads[1] = (avenrun[1] + offset) << shift;
3029 loads[2] = (avenrun[2] + offset) << shift; 3029 loads[2] = (avenrun[2] + offset) << shift;
3030 } 3030 }
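/*
 * Editor's sketch (not part of this file): turning the fixed-point values
 * from get_avenrun() into the familiar "1.05" style output, the way
 * /proc/loadavg does it.  Assumes the FIXED_1, LOAD_INT() and LOAD_FRAC()
 * definitions from <linux/sched.h>:
 */
	unsigned long avnrun[3];

	get_avenrun(avnrun, FIXED_1/200, 0);	/* +1/200 rounds, no shift */

	printk(KERN_INFO "load: %lu.%02lu %lu.%02lu %lu.%02lu\n",
		LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));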
3031 3031
3032 static unsigned long 3032 static unsigned long
3033 calc_load(unsigned long load, unsigned long exp, unsigned long active) 3033 calc_load(unsigned long load, unsigned long exp, unsigned long active)
3034 { 3034 {
3035 load *= exp; 3035 load *= exp;
3036 load += active * (FIXED_1 - exp); 3036 load += active * (FIXED_1 - exp);
3037 return load >> FSHIFT; 3037 return load >> FSHIFT;
3038 } 3038 }
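/*
 * Editor's note (not part of this file): calc_load() is a fixed-point
 * exponential moving average.  A worked example with the constants from
 * <linux/sched.h> (FSHIFT = 11, so FIXED_1 = 2048; EXP_1 = 1884 for the
 * 1-minute window at a 5-second LOAD_FREQ) -- values are approximate and
 * purely illustrative:
 *
 *	old avenrun[0] = 1.00 * FIXED_1          = 2048
 *	active         = 2 tasks * FIXED_1       = 4096
 *
 *	load = 2048 * 1884 + 4096 * (2048 - 1884)
 *	     = 3858432 + 671744 = 4530176
 *	load >> FSHIFT = 4530176 / 2048 ~= 2212  ->  about 1.08
 *
 * i.e. each 5-second update moves the 1-minute average only about 8%
 * (164/2048) of the way towards the instantaneous value.
 */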
3039 3039
3040 /* 3040 /*
3041 * calc_global_load - update the avenrun load estimates 10 ticks after the 3041 * calc_global_load - update the avenrun load estimates 10 ticks after the
3042 * CPUs have updated calc_load_tasks. 3042 * CPUs have updated calc_load_tasks.
3043 */ 3043 */
3044 void calc_global_load(void) 3044 void calc_global_load(void)
3045 { 3045 {
3046 unsigned long upd = calc_load_update + 10; 3046 unsigned long upd = calc_load_update + 10;
3047 long active; 3047 long active;
3048 3048
3049 if (time_before(jiffies, upd)) 3049 if (time_before(jiffies, upd))
3050 return; 3050 return;
3051 3051
3052 active = atomic_long_read(&calc_load_tasks); 3052 active = atomic_long_read(&calc_load_tasks);
3053 active = active > 0 ? active * FIXED_1 : 0; 3053 active = active > 0 ? active * FIXED_1 : 0;
3054 3054
3055 avenrun[0] = calc_load(avenrun[0], EXP_1, active); 3055 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
3056 avenrun[1] = calc_load(avenrun[1], EXP_5, active); 3056 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
3057 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 3057 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
3058 3058
3059 calc_load_update += LOAD_FREQ; 3059 calc_load_update += LOAD_FREQ;
3060 } 3060 }
3061 3061
3062 /* 3062 /*
3063 * Either called from update_cpu_load() or from a cpu going idle 3063 * Either called from update_cpu_load() or from a cpu going idle
3064 */ 3064 */
3065 static void calc_load_account_active(struct rq *this_rq) 3065 static void calc_load_account_active(struct rq *this_rq)
3066 { 3066 {
3067 long nr_active, delta; 3067 long nr_active, delta;
3068 3068
3069 nr_active = this_rq->nr_running; 3069 nr_active = this_rq->nr_running;
3070 nr_active += (long) this_rq->nr_uninterruptible; 3070 nr_active += (long) this_rq->nr_uninterruptible;
3071 3071
3072 if (nr_active != this_rq->calc_load_active) { 3072 if (nr_active != this_rq->calc_load_active) {
3073 delta = nr_active - this_rq->calc_load_active; 3073 delta = nr_active - this_rq->calc_load_active;
3074 this_rq->calc_load_active = nr_active; 3074 this_rq->calc_load_active = nr_active;
3075 atomic_long_add(delta, &calc_load_tasks); 3075 atomic_long_add(delta, &calc_load_tasks);
3076 } 3076 }
3077 } 3077 }
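
Note that the fold above only publishes the change since the last fold, so the shared atomic is written at most once per LOAD_FREQ interval per CPU. A hedged userspace analogue of that delta pattern follows (all names invented, C11 atomics standing in for atomic_long_t).

#include <stdatomic.h>
#include <stdio.h>

static atomic_long global_active;	/* analogue of calc_load_tasks      */

struct cpu_state {
	long last_reported;		/* analogue of rq->calc_load_active */
};

static void fold_active(struct cpu_state *cs, long now_active)
{
	long delta = now_active - cs->last_reported;

	if (delta) {			/* publish only the change */
		cs->last_reported = now_active;
		atomic_fetch_add(&global_active, delta);
	}
}

int main(void)
{
	struct cpu_state cpu0 = { 0 };

	fold_active(&cpu0, 3);		/* +3                       */
	fold_active(&cpu0, 3);		/* no change, no atomic op  */
	fold_active(&cpu0, 1);		/* -2                       */
	printf("global_active = %ld\n", atomic_load(&global_active));
	return 0;
}
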
3078 3078
3079 /* 3079 /*
3080 * Update rq->cpu_load[] statistics. This function is usually called every 3080 * Update rq->cpu_load[] statistics. This function is usually called every
3081 * scheduler tick (TICK_NSEC). 3081 * scheduler tick (TICK_NSEC).
3082 */ 3082 */
3083 static void update_cpu_load(struct rq *this_rq) 3083 static void update_cpu_load(struct rq *this_rq)
3084 { 3084 {
3085 unsigned long this_load = this_rq->load.weight; 3085 unsigned long this_load = this_rq->load.weight;
3086 int i, scale; 3086 int i, scale;
3087 3087
3088 this_rq->nr_load_updates++; 3088 this_rq->nr_load_updates++;
3089 3089
3090 /* Update our load: */ 3090 /* Update our load: */
3091 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3091 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3092 unsigned long old_load, new_load; 3092 unsigned long old_load, new_load;
3093 3093
3094 /* scale is effectively 1 << i now, and >> i divides by scale */ 3094 /* scale is effectively 1 << i now, and >> i divides by scale */
3095 3095
3096 old_load = this_rq->cpu_load[i]; 3096 old_load = this_rq->cpu_load[i];
3097 new_load = this_load; 3097 new_load = this_load;
3098 /* 3098 /*
3099 * Round up the averaging division if load is increasing. This 3099 * Round up the averaging division if load is increasing. This
3100 * prevents us from getting stuck on 9 if the load is 10, for 3100 * prevents us from getting stuck on 9 if the load is 10, for
3101 * example. 3101 * example.
3102 */ 3102 */
3103 if (new_load > old_load) 3103 if (new_load > old_load)
3104 new_load += scale-1; 3104 new_load += scale-1;
3105 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3105 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3106 } 3106 }
3107 3107
3108 if (time_after_eq(jiffies, this_rq->calc_load_update)) { 3108 if (time_after_eq(jiffies, this_rq->calc_load_update)) {
3109 this_rq->calc_load_update += LOAD_FREQ; 3109 this_rq->calc_load_update += LOAD_FREQ;
3110 calc_load_account_active(this_rq); 3110 calc_load_account_active(this_rq);
3111 } 3111 }
3112 } 3112 }
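
cpu_load[i] above is a moving average that reacts more slowly as i grows: each tick it keeps (2^i - 1)/2^i of the old value, and the scale-1 bump rounds a rising load up so it cannot stall one step below its target. A standalone sketch of that decay (CPU_LOAD_IDX_MAX = 5 and the constant weight of 10 are assumptions for illustration):

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

int main(void)
{
	unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 0 };
	unsigned long this_load = 10;	/* pretend instantaneous weight */
	int tick, i;

	for (tick = 0; tick < 8; tick++) {
		unsigned long scale = 1;

		for (i = 0; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
			unsigned long old_load = cpu_load[i];
			unsigned long new_load = this_load;

			if (new_load > old_load)	/* round up when rising */
				new_load += scale - 1;
			cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
		}
		printf("tick %d:", tick);
		for (i = 0; i < CPU_LOAD_IDX_MAX; i++)
			printf(" %lu", cpu_load[i]);
		printf("\n");
	}
	return 0;
}
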
3113 3113
3114 #ifdef CONFIG_SMP 3114 #ifdef CONFIG_SMP
3115 3115
3116 /* 3116 /*
3117 * sched_exec - execve() is a valuable balancing opportunity, because at 3117 * sched_exec - execve() is a valuable balancing opportunity, because at
3118 * this point the task has the smallest effective memory and cache footprint. 3118 * this point the task has the smallest effective memory and cache footprint.
3119 */ 3119 */
3120 void sched_exec(void) 3120 void sched_exec(void)
3121 { 3121 {
3122 struct task_struct *p = current; 3122 struct task_struct *p = current;
3123 struct migration_req req; 3123 struct migration_req req;
3124 int dest_cpu, this_cpu; 3124 int dest_cpu, this_cpu;
3125 unsigned long flags; 3125 unsigned long flags;
3126 struct rq *rq; 3126 struct rq *rq;
3127 3127
3128 again: 3128 again:
3129 this_cpu = get_cpu(); 3129 this_cpu = get_cpu();
3130 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0); 3130 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3131 if (dest_cpu == this_cpu) { 3131 if (dest_cpu == this_cpu) {
3132 put_cpu(); 3132 put_cpu();
3133 return; 3133 return;
3134 } 3134 }
3135 3135
3136 rq = task_rq_lock(p, &flags); 3136 rq = task_rq_lock(p, &flags);
3137 put_cpu(); 3137 put_cpu();
3138 3138
3139 /* 3139 /*
3140 * select_task_rq() can race against ->cpus_allowed 3140 * select_task_rq() can race against ->cpus_allowed
3141 */ 3141 */
3142 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3142 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
3143 || unlikely(!cpu_active(dest_cpu))) { 3143 || unlikely(!cpu_active(dest_cpu))) {
3144 task_rq_unlock(rq, &flags); 3144 task_rq_unlock(rq, &flags);
3145 goto again; 3145 goto again;
3146 } 3146 }
3147 3147
3148 /* force the process onto the specified CPU */ 3148 /* force the process onto the specified CPU */
3149 if (migrate_task(p, dest_cpu, &req)) { 3149 if (migrate_task(p, dest_cpu, &req)) {
3150 /* Need to wait for migration thread (might exit: take ref). */ 3150 /* Need to wait for migration thread (might exit: take ref). */
3151 struct task_struct *mt = rq->migration_thread; 3151 struct task_struct *mt = rq->migration_thread;
3152 3152
3153 get_task_struct(mt); 3153 get_task_struct(mt);
3154 task_rq_unlock(rq, &flags); 3154 task_rq_unlock(rq, &flags);
3155 wake_up_process(mt); 3155 wake_up_process(mt);
3156 put_task_struct(mt); 3156 put_task_struct(mt);
3157 wait_for_completion(&req.done); 3157 wait_for_completion(&req.done);
3158 3158
3159 return; 3159 return;
3160 } 3160 }
3161 task_rq_unlock(rq, &flags); 3161 task_rq_unlock(rq, &flags);
3162 } 3162 }
3163 3163
3164 #endif 3164 #endif
3165 3165
3166 DEFINE_PER_CPU(struct kernel_stat, kstat); 3166 DEFINE_PER_CPU(struct kernel_stat, kstat);
3167 3167
3168 EXPORT_PER_CPU_SYMBOL(kstat); 3168 EXPORT_PER_CPU_SYMBOL(kstat);
3169 3169
3170 /* 3170 /*
3171 * Return any ns on the sched_clock that have not yet been accounted in 3171 * Return any ns on the sched_clock that have not yet been accounted in
3172 * @p in case that task is currently running. 3172 * @p in case that task is currently running.
3173 * 3173 *
3174 * Called with task_rq_lock() held on @rq. 3174 * Called with task_rq_lock() held on @rq.
3175 */ 3175 */
3176 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) 3176 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3177 { 3177 {
3178 u64 ns = 0; 3178 u64 ns = 0;
3179 3179
3180 if (task_current(rq, p)) { 3180 if (task_current(rq, p)) {
3181 update_rq_clock(rq); 3181 update_rq_clock(rq);
3182 ns = rq->clock - p->se.exec_start; 3182 ns = rq->clock - p->se.exec_start;
3183 if ((s64)ns < 0) 3183 if ((s64)ns < 0)
3184 ns = 0; 3184 ns = 0;
3185 } 3185 }
3186 3186
3187 return ns; 3187 return ns;
3188 } 3188 }
3189 3189
3190 unsigned long long task_delta_exec(struct task_struct *p) 3190 unsigned long long task_delta_exec(struct task_struct *p)
3191 { 3191 {
3192 unsigned long flags; 3192 unsigned long flags;
3193 struct rq *rq; 3193 struct rq *rq;
3194 u64 ns = 0; 3194 u64 ns = 0;
3195 3195
3196 rq = task_rq_lock(p, &flags); 3196 rq = task_rq_lock(p, &flags);
3197 ns = do_task_delta_exec(p, rq); 3197 ns = do_task_delta_exec(p, rq);
3198 task_rq_unlock(rq, &flags); 3198 task_rq_unlock(rq, &flags);
3199 3199
3200 return ns; 3200 return ns;
3201 } 3201 }
3202 3202
3203 /* 3203 /*
3204 * Return accounted runtime for the task. 3204 * Return accounted runtime for the task.
3205 * In case the task is currently running, return the runtime plus current's 3205 * In case the task is currently running, return the runtime plus current's
3206 * pending runtime that has not been accounted yet. 3206 * pending runtime that has not been accounted yet.
3207 */ 3207 */
3208 unsigned long long task_sched_runtime(struct task_struct *p) 3208 unsigned long long task_sched_runtime(struct task_struct *p)
3209 { 3209 {
3210 unsigned long flags; 3210 unsigned long flags;
3211 struct rq *rq; 3211 struct rq *rq;
3212 u64 ns = 0; 3212 u64 ns = 0;
3213 3213
3214 rq = task_rq_lock(p, &flags); 3214 rq = task_rq_lock(p, &flags);
3215 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3215 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3216 task_rq_unlock(rq, &flags); 3216 task_rq_unlock(rq, &flags);
3217 3217
3218 return ns; 3218 return ns;
3219 } 3219 }
3220 3220
3221 /* 3221 /*
3222 * Return sum_exec_runtime for the thread group. 3222 * Return sum_exec_runtime for the thread group.
3223 * In case the task is currently running, return the sum plus current's 3223 * In case the task is currently running, return the sum plus current's
3224 * pending runtime that has not been accounted yet. 3224 * pending runtime that has not been accounted yet.
3225 * 3225 *
3226 * Note that the thread group might have other running tasks as well, 3226 * Note that the thread group might have other running tasks as well,
3227 * so the return value does not include other pending runtime that other 3227 * so the return value does not include other pending runtime that other
3228 * running tasks might have. 3228 * running tasks might have.
3229 */ 3229 */
3230 unsigned long long thread_group_sched_runtime(struct task_struct *p) 3230 unsigned long long thread_group_sched_runtime(struct task_struct *p)
3231 { 3231 {
3232 struct task_cputime totals; 3232 struct task_cputime totals;
3233 unsigned long flags; 3233 unsigned long flags;
3234 struct rq *rq; 3234 struct rq *rq;
3235 u64 ns; 3235 u64 ns;
3236 3236
3237 rq = task_rq_lock(p, &flags); 3237 rq = task_rq_lock(p, &flags);
3238 thread_group_cputime(p, &totals); 3238 thread_group_cputime(p, &totals);
3239 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3239 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3240 task_rq_unlock(rq, &flags); 3240 task_rq_unlock(rq, &flags);
3241 3241
3242 return ns; 3242 return ns;
3243 } 3243 }
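
From userspace, the per-thread and per-process runtimes kept here are most conveniently read through the POSIX CPU-time clocks, which on Linux are, to the best of my reading, backed by this accounting (task_sched_runtime() and thread_group_cputime() respectively). A small hedged demo:

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec self, group;
	volatile unsigned long spin;

	for (spin = 0; spin < 50000000UL; spin++)
		;	/* burn a little CPU so there is something to report */

	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &self);   /* ~ per-thread runtime  */
	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &group); /* ~ thread-group runtime */

	printf("thread : %ld.%09ld s\n", (long)self.tv_sec, self.tv_nsec);
	printf("process: %ld.%09ld s\n", (long)group.tv_sec, group.tv_nsec);
	return 0;
}
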
3244 3244
3245 /* 3245 /*
3246 * Account user cpu time to a process. 3246 * Account user cpu time to a process.
3247 * @p: the process that the cpu time gets accounted to 3247 * @p: the process that the cpu time gets accounted to
3248 * @cputime: the cpu time spent in user space since the last update 3248 * @cputime: the cpu time spent in user space since the last update
3249 * @cputime_scaled: cputime scaled by cpu frequency 3249 * @cputime_scaled: cputime scaled by cpu frequency
3250 */ 3250 */
3251 void account_user_time(struct task_struct *p, cputime_t cputime, 3251 void account_user_time(struct task_struct *p, cputime_t cputime,
3252 cputime_t cputime_scaled) 3252 cputime_t cputime_scaled)
3253 { 3253 {
3254 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3254 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3255 cputime64_t tmp; 3255 cputime64_t tmp;
3256 3256
3257 /* Add user time to process. */ 3257 /* Add user time to process. */
3258 p->utime = cputime_add(p->utime, cputime); 3258 p->utime = cputime_add(p->utime, cputime);
3259 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 3259 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3260 account_group_user_time(p, cputime); 3260 account_group_user_time(p, cputime);
3261 3261
3262 /* Add user time to cpustat. */ 3262 /* Add user time to cpustat. */
3263 tmp = cputime_to_cputime64(cputime); 3263 tmp = cputime_to_cputime64(cputime);
3264 if (TASK_NICE(p) > 0) 3264 if (TASK_NICE(p) > 0)
3265 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3265 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3266 else 3266 else
3267 cpustat->user = cputime64_add(cpustat->user, tmp); 3267 cpustat->user = cputime64_add(cpustat->user, tmp);
3268 3268
3269 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); 3269 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3270 /* Account for user time used */ 3270 /* Account for user time used */
3271 acct_update_integrals(p); 3271 acct_update_integrals(p);
3272 } 3272 }
3273 3273
3274 /* 3274 /*
3275 * Account guest cpu time to a process. 3275 * Account guest cpu time to a process.
3276 * @p: the process that the cpu time gets accounted to 3276 * @p: the process that the cpu time gets accounted to
3277 * @cputime: the cpu time spent in virtual machine since the last update 3277 * @cputime: the cpu time spent in virtual machine since the last update
3278 * @cputime_scaled: cputime scaled by cpu frequency 3278 * @cputime_scaled: cputime scaled by cpu frequency
3279 */ 3279 */
3280 static void account_guest_time(struct task_struct *p, cputime_t cputime, 3280 static void account_guest_time(struct task_struct *p, cputime_t cputime,
3281 cputime_t cputime_scaled) 3281 cputime_t cputime_scaled)
3282 { 3282 {
3283 cputime64_t tmp; 3283 cputime64_t tmp;
3284 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3284 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3285 3285
3286 tmp = cputime_to_cputime64(cputime); 3286 tmp = cputime_to_cputime64(cputime);
3287 3287
3288 /* Add guest time to process. */ 3288 /* Add guest time to process. */
3289 p->utime = cputime_add(p->utime, cputime); 3289 p->utime = cputime_add(p->utime, cputime);
3290 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 3290 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3291 account_group_user_time(p, cputime); 3291 account_group_user_time(p, cputime);
3292 p->gtime = cputime_add(p->gtime, cputime); 3292 p->gtime = cputime_add(p->gtime, cputime);
3293 3293
3294 /* Add guest time to cpustat. */ 3294 /* Add guest time to cpustat. */
3295 if (TASK_NICE(p) > 0) { 3295 if (TASK_NICE(p) > 0) {
3296 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3296 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3297 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); 3297 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
3298 } else { 3298 } else {
3299 cpustat->user = cputime64_add(cpustat->user, tmp); 3299 cpustat->user = cputime64_add(cpustat->user, tmp);
3300 cpustat->guest = cputime64_add(cpustat->guest, tmp); 3300 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3301 } 3301 }
3302 } 3302 }
3303 3303
3304 /* 3304 /*
3305 * Account system cpu time to a process. 3305 * Account system cpu time to a process.
3306 * @p: the process that the cpu time gets accounted to 3306 * @p: the process that the cpu time gets accounted to
3307 * @hardirq_offset: the offset to subtract from hardirq_count() 3307 * @hardirq_offset: the offset to subtract from hardirq_count()
3308 * @cputime: the cpu time spent in kernel space since the last update 3308 * @cputime: the cpu time spent in kernel space since the last update
3309 * @cputime_scaled: cputime scaled by cpu frequency 3309 * @cputime_scaled: cputime scaled by cpu frequency
3310 */ 3310 */
3311 void account_system_time(struct task_struct *p, int hardirq_offset, 3311 void account_system_time(struct task_struct *p, int hardirq_offset,
3312 cputime_t cputime, cputime_t cputime_scaled) 3312 cputime_t cputime, cputime_t cputime_scaled)
3313 { 3313 {
3314 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3314 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3315 cputime64_t tmp; 3315 cputime64_t tmp;
3316 3316
3317 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 3317 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3318 account_guest_time(p, cputime, cputime_scaled); 3318 account_guest_time(p, cputime, cputime_scaled);
3319 return; 3319 return;
3320 } 3320 }
3321 3321
3322 /* Add system time to process. */ 3322 /* Add system time to process. */
3323 p->stime = cputime_add(p->stime, cputime); 3323 p->stime = cputime_add(p->stime, cputime);
3324 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); 3324 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3325 account_group_system_time(p, cputime); 3325 account_group_system_time(p, cputime);
3326 3326
3327 /* Add system time to cpustat. */ 3327 /* Add system time to cpustat. */
3328 tmp = cputime_to_cputime64(cputime); 3328 tmp = cputime_to_cputime64(cputime);
3329 if (hardirq_count() - hardirq_offset) 3329 if (hardirq_count() - hardirq_offset)
3330 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3330 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3331 else if (softirq_count()) 3331 else if (softirq_count())
3332 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3332 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3333 else 3333 else
3334 cpustat->system = cputime64_add(cpustat->system, tmp); 3334 cpustat->system = cputime64_add(cpustat->system, tmp);
3335 3335
3336 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); 3336 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3337 3337
3338 /* Account for system time used */ 3338 /* Account for system time used */
3339 acct_update_integrals(p); 3339 acct_update_integrals(p);
3340 } 3340 }
3341 3341
3342 /* 3342 /*
3343 * Account for involuntary wait time. 3343 * Account for involuntary wait time.
3344 * @cputime: the cpu time spent in involuntary wait 3344 * @cputime: the cpu time spent in involuntary wait
3345 */ 3345 */
3346 void account_steal_time(cputime_t cputime) 3346 void account_steal_time(cputime_t cputime)
3347 { 3347 {
3348 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3348 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3349 cputime64_t cputime64 = cputime_to_cputime64(cputime); 3349 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3350 3350
3351 cpustat->steal = cputime64_add(cpustat->steal, cputime64); 3351 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
3352 } 3352 }
3353 3353
3354 /* 3354 /*
3355 * Account for idle time. 3355 * Account for idle time.
3356 * @cputime: the cpu time spent in idle wait 3356 * @cputime: the cpu time spent in idle wait
3357 */ 3357 */
3358 void account_idle_time(cputime_t cputime) 3358 void account_idle_time(cputime_t cputime)
3359 { 3359 {
3360 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3360 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3361 cputime64_t cputime64 = cputime_to_cputime64(cputime); 3361 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3362 struct rq *rq = this_rq(); 3362 struct rq *rq = this_rq();
3363 3363
3364 if (atomic_read(&rq->nr_iowait) > 0) 3364 if (atomic_read(&rq->nr_iowait) > 0)
3365 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); 3365 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
3366 else 3366 else
3367 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 3367 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
3368 } 3368 }
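
The cpustat buckets filled in above (user, nice, system, idle, iowait, irq, softirq, steal, guest, guest_nice) are what the aggregate "cpu" line of /proc/stat reports, in USER_HZ ticks. A minimal hedged reader, assuming the conventional field order:

#include <stdio.h>

int main(void)
{
	unsigned long long user, nice, system, idle, iowait, irq, softirq, steal;
	FILE *f = fopen("/proc/stat", "r");

	if (!f)
		return 1;

	/* first line aggregates the per-cpu cpustat counters updated above */
	if (fscanf(f, "cpu %llu %llu %llu %llu %llu %llu %llu %llu",
		   &user, &nice, &system, &idle, &iowait, &irq,
		   &softirq, &steal) == 8)
		printf("user=%llu nice=%llu system=%llu idle=%llu "
		       "iowait=%llu irq=%llu softirq=%llu steal=%llu\n",
		       user, nice, system, idle, iowait, irq, softirq, steal);
	fclose(f);
	return 0;
}
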
3369 3369
3370 #ifndef CONFIG_VIRT_CPU_ACCOUNTING 3370 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
3371 3371
3372 /* 3372 /*
3373 * Account a single tick of cpu time. 3373 * Account a single tick of cpu time.
3374 * @p: the process that the cpu time gets accounted to 3374 * @p: the process that the cpu time gets accounted to
3375 * @user_tick: indicates if the tick is a user or a system tick 3375 * @user_tick: indicates if the tick is a user or a system tick
3376 */ 3376 */
3377 void account_process_tick(struct task_struct *p, int user_tick) 3377 void account_process_tick(struct task_struct *p, int user_tick)
3378 { 3378 {
3379 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3379 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3380 struct rq *rq = this_rq(); 3380 struct rq *rq = this_rq();
3381 3381
3382 if (user_tick) 3382 if (user_tick)
3383 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 3383 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3384 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 3384 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
3385 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, 3385 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
3386 one_jiffy_scaled); 3386 one_jiffy_scaled);
3387 else 3387 else
3388 account_idle_time(cputime_one_jiffy); 3388 account_idle_time(cputime_one_jiffy);
3389 } 3389 }
3390 3390
3391 /* 3391 /*
3392 * Account multiple ticks of steal time. 3392 * Account multiple ticks of steal time.
3393 * @p: the process from which the cpu time has been stolen 3393 * @p: the process from which the cpu time has been stolen
3394 * @ticks: number of stolen ticks 3394 * @ticks: number of stolen ticks
3395 */ 3395 */
3396 void account_steal_ticks(unsigned long ticks) 3396 void account_steal_ticks(unsigned long ticks)
3397 { 3397 {
3398 account_steal_time(jiffies_to_cputime(ticks)); 3398 account_steal_time(jiffies_to_cputime(ticks));
3399 } 3399 }
3400 3400
3401 /* 3401 /*
3402 * Account multiple ticks of idle time. 3402 * Account multiple ticks of idle time.
3403 * @ticks: number of idle ticks 3403 * @ticks: number of idle ticks
3404 */ 3404 */
3405 void account_idle_ticks(unsigned long ticks) 3405 void account_idle_ticks(unsigned long ticks)
3406 { 3406 {
3407 account_idle_time(jiffies_to_cputime(ticks)); 3407 account_idle_time(jiffies_to_cputime(ticks));
3408 } 3408 }
3409 3409
3410 #endif 3410 #endif
3411 3411
3412 /* 3412 /*
3413 * Use precise platform statistics if available: 3413 * Use precise platform statistics if available:
3414 */ 3414 */
3415 #ifdef CONFIG_VIRT_CPU_ACCOUNTING 3415 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
3416 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3416 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3417 { 3417 {
3418 *ut = p->utime; 3418 *ut = p->utime;
3419 *st = p->stime; 3419 *st = p->stime;
3420 } 3420 }
3421 3421
3422 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3422 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3423 { 3423 {
3424 struct task_cputime cputime; 3424 struct task_cputime cputime;
3425 3425
3426 thread_group_cputime(p, &cputime); 3426 thread_group_cputime(p, &cputime);
3427 3427
3428 *ut = cputime.utime; 3428 *ut = cputime.utime;
3429 *st = cputime.stime; 3429 *st = cputime.stime;
3430 } 3430 }
3431 #else 3431 #else
3432 3432
3433 #ifndef nsecs_to_cputime 3433 #ifndef nsecs_to_cputime
3434 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) 3434 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
3435 #endif 3435 #endif
3436 3436
3437 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3437 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3438 { 3438 {
3439 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); 3439 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
3440 3440
3441 /* 3441 /*
3442 * Use CFS's precise accounting: 3442 * Use CFS's precise accounting:
3443 */ 3443 */
3444 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3444 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3445 3445
3446 if (total) { 3446 if (total) {
3447 u64 temp; 3447 u64 temp;
3448 3448
3449 temp = (u64)(rtime * utime); 3449 temp = (u64)(rtime * utime);
3450 do_div(temp, total); 3450 do_div(temp, total);
3451 utime = (cputime_t)temp; 3451 utime = (cputime_t)temp;
3452 } else 3452 } else
3453 utime = rtime; 3453 utime = rtime;
3454 3454
3455 /* 3455 /*
3456 * Compare with previous values, to keep monotonicity: 3456 * Compare with previous values, to keep monotonicity:
3457 */ 3457 */
3458 p->prev_utime = max(p->prev_utime, utime); 3458 p->prev_utime = max(p->prev_utime, utime);
3459 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); 3459 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
3460 3460
3461 *ut = p->prev_utime; 3461 *ut = p->prev_utime;
3462 *st = p->prev_stime; 3462 *st = p->prev_stime;
3463 } 3463 }
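
The split above distributes the precise CFS runtime in the utime:stime ratio of the tick-based samples, then uses prev_utime/prev_stime as high-water marks so successive calls never report a smaller value. A standalone sketch with made-up numbers (30 user ticks, 10 system ticks, 48 units of precise runtime; plain integers stand in for cputime_t):

#include <stdio.h>

int main(void)
{
	unsigned long long utime = 30, stime = 10;	/* tick-sampled, noisy   */
	unsigned long long rtime = 48;			/* precise exec runtime  */
	unsigned long long total = utime + stime;
	unsigned long long prev_utime = 0, prev_stime = 0;

	unsigned long long ut = total ? rtime * utime / total : rtime;

	if (ut > prev_utime)
		prev_utime = ut;			/* max(prev_utime, ut)   */
	if (rtime - prev_utime > prev_stime)
		prev_stime = rtime - prev_utime;	/* max(prev_stime, ...)  */

	printf("ut=%llu st=%llu\n", prev_utime, prev_stime);	/* ut=36 st=12 */
	return 0;
}
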
3464 3464
3465 /* 3465 /*
3466 * Must be called with siglock held. 3466 * Must be called with siglock held.
3467 */ 3467 */
3468 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3468 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3469 { 3469 {
3470 struct signal_struct *sig = p->signal; 3470 struct signal_struct *sig = p->signal;
3471 struct task_cputime cputime; 3471 struct task_cputime cputime;
3472 cputime_t rtime, utime, total; 3472 cputime_t rtime, utime, total;
3473 3473
3474 thread_group_cputime(p, &cputime); 3474 thread_group_cputime(p, &cputime);
3475 3475
3476 total = cputime_add(cputime.utime, cputime.stime); 3476 total = cputime_add(cputime.utime, cputime.stime);
3477 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 3477 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3478 3478
3479 if (total) { 3479 if (total) {
3480 u64 temp; 3480 u64 temp;
3481 3481
3482 temp = (u64)(rtime * cputime.utime); 3482 temp = (u64)(rtime * cputime.utime);
3483 do_div(temp, total); 3483 do_div(temp, total);
3484 utime = (cputime_t)temp; 3484 utime = (cputime_t)temp;
3485 } else 3485 } else
3486 utime = rtime; 3486 utime = rtime;
3487 3487
3488 sig->prev_utime = max(sig->prev_utime, utime); 3488 sig->prev_utime = max(sig->prev_utime, utime);
3489 sig->prev_stime = max(sig->prev_stime, 3489 sig->prev_stime = max(sig->prev_stime,
3490 cputime_sub(rtime, sig->prev_utime)); 3490 cputime_sub(rtime, sig->prev_utime));
3491 3491
3492 *ut = sig->prev_utime; 3492 *ut = sig->prev_utime;
3493 *st = sig->prev_stime; 3493 *st = sig->prev_stime;
3494 } 3494 }
3495 #endif 3495 #endif
3496 3496
3497 /* 3497 /*
3498 * This function gets called by the timer code, with HZ frequency. 3498 * This function gets called by the timer code, with HZ frequency.
3499 * We call it with interrupts disabled. 3499 * We call it with interrupts disabled.
3500 * 3500 *
3501 * It also gets called by the fork code, when changing the parent's 3501 * It also gets called by the fork code, when changing the parent's
3502 * timeslices. 3502 * timeslices.
3503 */ 3503 */
3504 void scheduler_tick(void) 3504 void scheduler_tick(void)
3505 { 3505 {
3506 int cpu = smp_processor_id(); 3506 int cpu = smp_processor_id();
3507 struct rq *rq = cpu_rq(cpu); 3507 struct rq *rq = cpu_rq(cpu);
3508 struct task_struct *curr = rq->curr; 3508 struct task_struct *curr = rq->curr;
3509 3509
3510 sched_clock_tick(); 3510 sched_clock_tick();
3511 3511
3512 raw_spin_lock(&rq->lock); 3512 raw_spin_lock(&rq->lock);
3513 update_rq_clock(rq); 3513 update_rq_clock(rq);
3514 update_cpu_load(rq); 3514 update_cpu_load(rq);
3515 curr->sched_class->task_tick(rq, curr, 0); 3515 curr->sched_class->task_tick(rq, curr, 0);
3516 raw_spin_unlock(&rq->lock); 3516 raw_spin_unlock(&rq->lock);
3517 3517
3518 perf_event_task_tick(curr); 3518 perf_event_task_tick(curr);
3519 3519
3520 #ifdef CONFIG_SMP 3520 #ifdef CONFIG_SMP
3521 rq->idle_at_tick = idle_cpu(cpu); 3521 rq->idle_at_tick = idle_cpu(cpu);
3522 trigger_load_balance(rq, cpu); 3522 trigger_load_balance(rq, cpu);
3523 #endif 3523 #endif
3524 } 3524 }
3525 3525
3526 notrace unsigned long get_parent_ip(unsigned long addr) 3526 notrace unsigned long get_parent_ip(unsigned long addr)
3527 { 3527 {
3528 if (in_lock_functions(addr)) { 3528 if (in_lock_functions(addr)) {
3529 addr = CALLER_ADDR2; 3529 addr = CALLER_ADDR2;
3530 if (in_lock_functions(addr)) 3530 if (in_lock_functions(addr))
3531 addr = CALLER_ADDR3; 3531 addr = CALLER_ADDR3;
3532 } 3532 }
3533 return addr; 3533 return addr;
3534 } 3534 }
3535 3535
3536 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 3536 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3537 defined(CONFIG_PREEMPT_TRACER)) 3537 defined(CONFIG_PREEMPT_TRACER))
3538 3538
3539 void __kprobes add_preempt_count(int val) 3539 void __kprobes add_preempt_count(int val)
3540 { 3540 {
3541 #ifdef CONFIG_DEBUG_PREEMPT 3541 #ifdef CONFIG_DEBUG_PREEMPT
3542 /* 3542 /*
3543 * Underflow? 3543 * Underflow?
3544 */ 3544 */
3545 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 3545 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3546 return; 3546 return;
3547 #endif 3547 #endif
3548 preempt_count() += val; 3548 preempt_count() += val;
3549 #ifdef CONFIG_DEBUG_PREEMPT 3549 #ifdef CONFIG_DEBUG_PREEMPT
3550 /* 3550 /*
3551 * Spinlock count overflowing soon? 3551 * Spinlock count overflowing soon?
3552 */ 3552 */
3553 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 3553 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3554 PREEMPT_MASK - 10); 3554 PREEMPT_MASK - 10);
3555 #endif 3555 #endif
3556 if (preempt_count() == val) 3556 if (preempt_count() == val)
3557 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 3557 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3558 } 3558 }
3559 EXPORT_SYMBOL(add_preempt_count); 3559 EXPORT_SYMBOL(add_preempt_count);
3560 3560
3561 void __kprobes sub_preempt_count(int val) 3561 void __kprobes sub_preempt_count(int val)
3562 { 3562 {
3563 #ifdef CONFIG_DEBUG_PREEMPT 3563 #ifdef CONFIG_DEBUG_PREEMPT
3564 /* 3564 /*
3565 * Underflow? 3565 * Underflow?
3566 */ 3566 */
3567 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 3567 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3568 return; 3568 return;
3569 /* 3569 /*
3570 * Is the spinlock portion underflowing? 3570 * Is the spinlock portion underflowing?
3571 */ 3571 */
3572 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 3572 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3573 !(preempt_count() & PREEMPT_MASK))) 3573 !(preempt_count() & PREEMPT_MASK)))
3574 return; 3574 return;
3575 #endif 3575 #endif
3576 3576
3577 if (preempt_count() == val) 3577 if (preempt_count() == val)
3578 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 3578 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3579 preempt_count() -= val; 3579 preempt_count() -= val;
3580 } 3580 }
3581 EXPORT_SYMBOL(sub_preempt_count); 3581 EXPORT_SYMBOL(sub_preempt_count);
3582 3582
3583 #endif 3583 #endif
3584 3584
3585 /* 3585 /*
3586 * Print scheduling while atomic bug: 3586 * Print scheduling while atomic bug:
3587 */ 3587 */
3588 static noinline void __schedule_bug(struct task_struct *prev) 3588 static noinline void __schedule_bug(struct task_struct *prev)
3589 { 3589 {
3590 struct pt_regs *regs = get_irq_regs(); 3590 struct pt_regs *regs = get_irq_regs();
3591 3591
3592 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 3592 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3593 prev->comm, prev->pid, preempt_count()); 3593 prev->comm, prev->pid, preempt_count());
3594 3594
3595 debug_show_held_locks(prev); 3595 debug_show_held_locks(prev);
3596 print_modules(); 3596 print_modules();
3597 if (irqs_disabled()) 3597 if (irqs_disabled())
3598 print_irqtrace_events(prev); 3598 print_irqtrace_events(prev);
3599 3599
3600 if (regs) 3600 if (regs)
3601 show_regs(regs); 3601 show_regs(regs);
3602 else 3602 else
3603 dump_stack(); 3603 dump_stack();
3604 } 3604 }
3605 3605
3606 /* 3606 /*
3607 * Various schedule()-time debugging checks and statistics: 3607 * Various schedule()-time debugging checks and statistics:
3608 */ 3608 */
3609 static inline void schedule_debug(struct task_struct *prev) 3609 static inline void schedule_debug(struct task_struct *prev)
3610 { 3610 {
3611 /* 3611 /*
3612 * Test if we are atomic. Since do_exit() needs to call into 3612 * Test if we are atomic. Since do_exit() needs to call into
3613 * schedule() atomically, we ignore that path for now. 3613 * schedule() atomically, we ignore that path for now.
3614 * Otherwise, whine if we are scheduling when we should not be. 3614 * Otherwise, whine if we are scheduling when we should not be.
3615 */ 3615 */
3616 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 3616 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
3617 __schedule_bug(prev); 3617 __schedule_bug(prev);
3618 3618
3619 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3619 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3620 3620
3621 schedstat_inc(this_rq(), sched_count); 3621 schedstat_inc(this_rq(), sched_count);
3622 #ifdef CONFIG_SCHEDSTATS 3622 #ifdef CONFIG_SCHEDSTATS
3623 if (unlikely(prev->lock_depth >= 0)) { 3623 if (unlikely(prev->lock_depth >= 0)) {
3624 schedstat_inc(this_rq(), bkl_count); 3624 schedstat_inc(this_rq(), bkl_count);
3625 schedstat_inc(prev, sched_info.bkl_count); 3625 schedstat_inc(prev, sched_info.bkl_count);
3626 } 3626 }
3627 #endif 3627 #endif
3628 } 3628 }
3629 3629
3630 static void put_prev_task(struct rq *rq, struct task_struct *prev) 3630 static void put_prev_task(struct rq *rq, struct task_struct *prev)
3631 { 3631 {
3632 if (prev->state == TASK_RUNNING) { 3632 if (prev->state == TASK_RUNNING) {
3633 u64 runtime = prev->se.sum_exec_runtime; 3633 u64 runtime = prev->se.sum_exec_runtime;
3634 3634
3635 runtime -= prev->se.prev_sum_exec_runtime; 3635 runtime -= prev->se.prev_sum_exec_runtime;
3636 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); 3636 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
3637 3637
3638 /* 3638 /*
3639 * In order to avoid avg_overlap growing stale when we are 3639 * In order to avoid avg_overlap growing stale when we are
3640 * indeed overlapping and hence not getting put to sleep, grow 3640 * indeed overlapping and hence not getting put to sleep, grow
3641 * the avg_overlap on preemption. 3641 * the avg_overlap on preemption.
3642 * 3642 *
3643 * We use the average preemption runtime because that 3643 * We use the average preemption runtime because that
3644 * correlates to the amount of cache footprint a task can 3644 * correlates to the amount of cache footprint a task can
3645 * build up. 3645 * build up.
3646 */ 3646 */
3647 update_avg(&prev->se.avg_overlap, runtime); 3647 update_avg(&prev->se.avg_overlap, runtime);
3648 } 3648 }
3649 prev->sched_class->put_prev_task(rq, prev); 3649 prev->sched_class->put_prev_task(rq, prev);
3650 } 3650 }
3651 3651
3652 /* 3652 /*
3653 * Pick up the highest-prio task: 3653 * Pick up the highest-prio task:
3654 */ 3654 */
3655 static inline struct task_struct * 3655 static inline struct task_struct *
3656 pick_next_task(struct rq *rq) 3656 pick_next_task(struct rq *rq)
3657 { 3657 {
3658 const struct sched_class *class; 3658 const struct sched_class *class;
3659 struct task_struct *p; 3659 struct task_struct *p;
3660 3660
3661 /* 3661 /*
3662 * Optimization: we know that if all tasks are in 3662 * Optimization: we know that if all tasks are in
3663 * the fair class we can call that function directly: 3663 * the fair class we can call that function directly:
3664 */ 3664 */
3665 if (likely(rq->nr_running == rq->cfs.nr_running)) { 3665 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3666 p = fair_sched_class.pick_next_task(rq); 3666 p = fair_sched_class.pick_next_task(rq);
3667 if (likely(p)) 3667 if (likely(p))
3668 return p; 3668 return p;
3669 } 3669 }
3670 3670
3671 class = sched_class_highest; 3671 class = sched_class_highest;
3672 for ( ; ; ) { 3672 for ( ; ; ) {
3673 p = class->pick_next_task(rq); 3673 p = class->pick_next_task(rq);
3674 if (p) 3674 if (p)
3675 return p; 3675 return p;
3676 /* 3676 /*
3677 * Will never be NULL as the idle class always 3677 * Will never be NULL as the idle class always
3678 * returns a non-NULL p: 3678 * returns a non-NULL p:
3679 */ 3679 */
3680 class = class->next; 3680 class = class->next;
3681 } 3681 }
3682 } 3682 }
3683 3683
3684 /* 3684 /*
3685 * schedule() is the main scheduler function. 3685 * schedule() is the main scheduler function.
3686 */ 3686 */
3687 asmlinkage void __sched schedule(void) 3687 asmlinkage void __sched schedule(void)
3688 { 3688 {
3689 struct task_struct *prev, *next; 3689 struct task_struct *prev, *next;
3690 unsigned long *switch_count; 3690 unsigned long *switch_count;
3691 struct rq *rq; 3691 struct rq *rq;
3692 int cpu; 3692 int cpu;
3693 3693
3694 need_resched: 3694 need_resched:
3695 preempt_disable(); 3695 preempt_disable();
3696 cpu = smp_processor_id(); 3696 cpu = smp_processor_id();
3697 rq = cpu_rq(cpu); 3697 rq = cpu_rq(cpu);
3698 rcu_sched_qs(cpu); 3698 rcu_sched_qs(cpu);
3699 prev = rq->curr; 3699 prev = rq->curr;
3700 switch_count = &prev->nivcsw; 3700 switch_count = &prev->nivcsw;
3701 3701
3702 release_kernel_lock(prev); 3702 release_kernel_lock(prev);
3703 need_resched_nonpreemptible: 3703 need_resched_nonpreemptible:
3704 3704
3705 schedule_debug(prev); 3705 schedule_debug(prev);
3706 3706
3707 if (sched_feat(HRTICK)) 3707 if (sched_feat(HRTICK))
3708 hrtick_clear(rq); 3708 hrtick_clear(rq);
3709 3709
3710 raw_spin_lock_irq(&rq->lock); 3710 raw_spin_lock_irq(&rq->lock);
3711 update_rq_clock(rq); 3711 update_rq_clock(rq);
3712 clear_tsk_need_resched(prev); 3712 clear_tsk_need_resched(prev);
3713 3713
3714 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3714 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3715 if (unlikely(signal_pending_state(prev->state, prev))) 3715 if (unlikely(signal_pending_state(prev->state, prev)))
3716 prev->state = TASK_RUNNING; 3716 prev->state = TASK_RUNNING;
3717 else 3717 else
3718 deactivate_task(rq, prev, 1); 3718 deactivate_task(rq, prev, 1);
3719 switch_count = &prev->nvcsw; 3719 switch_count = &prev->nvcsw;
3720 } 3720 }
3721 3721
3722 pre_schedule(rq, prev); 3722 pre_schedule(rq, prev);
3723 3723
3724 if (unlikely(!rq->nr_running)) 3724 if (unlikely(!rq->nr_running))
3725 idle_balance(cpu, rq); 3725 idle_balance(cpu, rq);
3726 3726
3727 put_prev_task(rq, prev); 3727 put_prev_task(rq, prev);
3728 next = pick_next_task(rq); 3728 next = pick_next_task(rq);
3729 3729
3730 if (likely(prev != next)) { 3730 if (likely(prev != next)) {
3731 sched_info_switch(prev, next); 3731 sched_info_switch(prev, next);
3732 perf_event_task_sched_out(prev, next); 3732 perf_event_task_sched_out(prev, next);
3733 3733
3734 rq->nr_switches++; 3734 rq->nr_switches++;
3735 rq->curr = next; 3735 rq->curr = next;
3736 ++*switch_count; 3736 ++*switch_count;
3737 3737
3738 context_switch(rq, prev, next); /* unlocks the rq */ 3738 context_switch(rq, prev, next); /* unlocks the rq */
3739 /* 3739 /*
3740 * the context switch might have flipped the stack from under 3740 * the context switch might have flipped the stack from under
3741 * us, hence refresh the local variables. 3741 * us, hence refresh the local variables.
3742 */ 3742 */
3743 cpu = smp_processor_id(); 3743 cpu = smp_processor_id();
3744 rq = cpu_rq(cpu); 3744 rq = cpu_rq(cpu);
3745 } else 3745 } else
3746 raw_spin_unlock_irq(&rq->lock); 3746 raw_spin_unlock_irq(&rq->lock);
3747 3747
3748 post_schedule(rq); 3748 post_schedule(rq);
3749 3749
3750 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3750 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3751 prev = rq->curr; 3751 prev = rq->curr;
3752 switch_count = &prev->nivcsw; 3752 switch_count = &prev->nivcsw;
3753 goto need_resched_nonpreemptible; 3753 goto need_resched_nonpreemptible;
3754 } 3754 }
3755 3755
3756 preempt_enable_no_resched(); 3756 preempt_enable_no_resched();
3757 if (need_resched()) 3757 if (need_resched())
3758 goto need_resched; 3758 goto need_resched;
3759 } 3759 }
3760 EXPORT_SYMBOL(schedule); 3760 EXPORT_SYMBOL(schedule);
3761 3761
3762 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER 3762 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
3763 /* 3763 /*
3764 * Look out! "owner" is an entirely speculative pointer 3764 * Look out! "owner" is an entirely speculative pointer
3765 * access and not reliable. 3765 * access and not reliable.
3766 */ 3766 */
3767 int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) 3767 int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3768 { 3768 {
3769 unsigned int cpu; 3769 unsigned int cpu;
3770 struct rq *rq; 3770 struct rq *rq;
3771 3771
3772 if (!sched_feat(OWNER_SPIN)) 3772 if (!sched_feat(OWNER_SPIN))
3773 return 0; 3773 return 0;
3774 3774
3775 #ifdef CONFIG_DEBUG_PAGEALLOC 3775 #ifdef CONFIG_DEBUG_PAGEALLOC
3776 /* 3776 /*
3777 * Need to access the cpu field knowing that 3777 * Need to access the cpu field knowing that
3778 * DEBUG_PAGEALLOC could have unmapped it if 3778 * DEBUG_PAGEALLOC could have unmapped it if
3779 * the mutex owner just released it and exited. 3779 * the mutex owner just released it and exited.
3780 */ 3780 */
3781 if (probe_kernel_address(&owner->cpu, cpu)) 3781 if (probe_kernel_address(&owner->cpu, cpu))
3782 goto out; 3782 goto out;
3783 #else 3783 #else
3784 cpu = owner->cpu; 3784 cpu = owner->cpu;
3785 #endif 3785 #endif
3786 3786
3787 /* 3787 /*
3788 * Even if the access succeeded (likely case), 3788 * Even if the access succeeded (likely case),
3789 * the cpu field may no longer be valid. 3789 * the cpu field may no longer be valid.
3790 */ 3790 */
3791 if (cpu >= nr_cpumask_bits) 3791 if (cpu >= nr_cpumask_bits)
3792 goto out; 3792 goto out;
3793 3793
3794 /* 3794 /*
3795 * We need to validate that we can do a 3795 * We need to validate that we can do a
3796 * get_cpu() and that we have the percpu area. 3796 * get_cpu() and that we have the percpu area.
3797 */ 3797 */
3798 if (!cpu_online(cpu)) 3798 if (!cpu_online(cpu))
3799 goto out; 3799 goto out;
3800 3800
3801 rq = cpu_rq(cpu); 3801 rq = cpu_rq(cpu);
3802 3802
3803 for (;;) { 3803 for (;;) {
3804 /* 3804 /*
3805 * Owner changed, break to re-assess state. 3805 * Owner changed, break to re-assess state.
3806 */ 3806 */
3807 if (lock->owner != owner) 3807 if (lock->owner != owner)
3808 break; 3808 break;
3809 3809
3810 /* 3810 /*
3811 * Is that owner really running on that cpu? 3811 * Is that owner really running on that cpu?
3812 */ 3812 */
3813 if (task_thread_info(rq->curr) != owner || need_resched()) 3813 if (task_thread_info(rq->curr) != owner || need_resched())
3814 return 0; 3814 return 0;
3815 3815
3816 cpu_relax(); 3816 cpu_relax();
3817 } 3817 }
3818 out: 3818 out:
3819 return 1; 3819 return 1;
3820 } 3820 }
3821 #endif 3821 #endif
3822 3822
3823 #ifdef CONFIG_PREEMPT 3823 #ifdef CONFIG_PREEMPT
3824 /* 3824 /*
3825 * this is the entry point to schedule() from in-kernel preemption 3825 * this is the entry point to schedule() from in-kernel preemption
3826 * off of preempt_enable. Kernel preemption off the return-from-interrupt 3826 * off of preempt_enable. Kernel preemption off the return-from-interrupt
3827 * path is handled by preempt_schedule_irq() below, which calls schedule() directly. 3827 * path is handled by preempt_schedule_irq() below, which calls schedule() directly.
3828 */ 3828 */
3829 asmlinkage void __sched preempt_schedule(void) 3829 asmlinkage void __sched preempt_schedule(void)
3830 { 3830 {
3831 struct thread_info *ti = current_thread_info(); 3831 struct thread_info *ti = current_thread_info();
3832 3832
3833 /* 3833 /*
3834 * If there is a non-zero preempt_count or interrupts are disabled, 3834 * If there is a non-zero preempt_count or interrupts are disabled,
3835 * we do not want to preempt the current task. Just return.. 3835 * we do not want to preempt the current task. Just return..
3836 */ 3836 */
3837 if (likely(ti->preempt_count || irqs_disabled())) 3837 if (likely(ti->preempt_count || irqs_disabled()))
3838 return; 3838 return;
3839 3839
3840 do { 3840 do {
3841 add_preempt_count(PREEMPT_ACTIVE); 3841 add_preempt_count(PREEMPT_ACTIVE);
3842 schedule(); 3842 schedule();
3843 sub_preempt_count(PREEMPT_ACTIVE); 3843 sub_preempt_count(PREEMPT_ACTIVE);
3844 3844
3845 /* 3845 /*
3846 * Check again in case we missed a preemption opportunity 3846 * Check again in case we missed a preemption opportunity
3847 * between schedule and now. 3847 * between schedule and now.
3848 */ 3848 */
3849 barrier(); 3849 barrier();
3850 } while (need_resched()); 3850 } while (need_resched());
3851 } 3851 }
3852 EXPORT_SYMBOL(preempt_schedule); 3852 EXPORT_SYMBOL(preempt_schedule);
3853 3853
3854 /* 3854 /*
3855 * this is the entry point to schedule() from kernel preemption 3855 * this is the entry point to schedule() from kernel preemption
3856 * off of irq context. 3856 * off of irq context.
3857 * Note that this is called and returns with irqs disabled. This will 3857 * Note that this is called and returns with irqs disabled. This will
3858 * protect us against recursive calling from irq. 3858 * protect us against recursive calling from irq.
3859 */ 3859 */
3860 asmlinkage void __sched preempt_schedule_irq(void) 3860 asmlinkage void __sched preempt_schedule_irq(void)
3861 { 3861 {
3862 struct thread_info *ti = current_thread_info(); 3862 struct thread_info *ti = current_thread_info();
3863 3863
3864 /* Catch callers which need to be fixed */ 3864 /* Catch callers which need to be fixed */
3865 BUG_ON(ti->preempt_count || !irqs_disabled()); 3865 BUG_ON(ti->preempt_count || !irqs_disabled());
3866 3866
3867 do { 3867 do {
3868 add_preempt_count(PREEMPT_ACTIVE); 3868 add_preempt_count(PREEMPT_ACTIVE);
3869 local_irq_enable(); 3869 local_irq_enable();
3870 schedule(); 3870 schedule();
3871 local_irq_disable(); 3871 local_irq_disable();
3872 sub_preempt_count(PREEMPT_ACTIVE); 3872 sub_preempt_count(PREEMPT_ACTIVE);
3873 3873
3874 /* 3874 /*
3875 * Check again in case we missed a preemption opportunity 3875 * Check again in case we missed a preemption opportunity
3876 * between schedule and now. 3876 * between schedule and now.
3877 */ 3877 */
3878 barrier(); 3878 barrier();
3879 } while (need_resched()); 3879 } while (need_resched());
3880 } 3880 }
3881 3881
3882 #endif /* CONFIG_PREEMPT */ 3882 #endif /* CONFIG_PREEMPT */
3883 3883
3884 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 3884 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3885 void *key) 3885 void *key)
3886 { 3886 {
3887 return try_to_wake_up(curr->private, mode, wake_flags); 3887 return try_to_wake_up(curr->private, mode, wake_flags);
3888 } 3888 }
3889 EXPORT_SYMBOL(default_wake_function); 3889 EXPORT_SYMBOL(default_wake_function);
3890 3890
3891 /* 3891 /*
3892 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 3892 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3893 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 3893 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3894 * number) then we wake all the non-exclusive tasks and one exclusive task. 3894 * number) then we wake all the non-exclusive tasks and one exclusive task.
3895 * 3895 *
3896 * There are circumstances in which we can try to wake a task which has already 3896 * There are circumstances in which we can try to wake a task which has already
3897 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 3897 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3898 * zero in this (rare) case, and we handle it by continuing to scan the queue. 3898 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3899 */ 3899 */
3900 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 3900 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3901 int nr_exclusive, int wake_flags, void *key) 3901 int nr_exclusive, int wake_flags, void *key)
3902 { 3902 {
3903 wait_queue_t *curr, *next; 3903 wait_queue_t *curr, *next;
3904 3904
3905 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 3905 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
3906 unsigned flags = curr->flags; 3906 unsigned flags = curr->flags;
3907 3907
3908 if (curr->func(curr, mode, wake_flags, key) && 3908 if (curr->func(curr, mode, wake_flags, key) &&
3909 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 3909 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3910 break; 3910 break;
3911 } 3911 }
3912 } 3912 }
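
The nr_exclusive countdown above means every non-exclusive waiter (queued at the head) is woken, while at most nr_exclusive of the WQ_FLAG_EXCLUSIVE waiters (queued at the tail) are. A hedged standalone sketch of that walk, with a trivial stand-in for curr->func():

#include <stdio.h>

#define WQ_FLAG_EXCLUSIVE 0x01

struct waiter {
	const char *name;
	unsigned int flags;
};

/* stand-in for curr->func(): "waking" always succeeds here */
static int wake_one(struct waiter *w)
{
	printf("woke %s\n", w->name);
	return 1;
}

int main(void)
{
	struct waiter q[] = {
		{ "A", 0 },			/* non-exclusive: always woken  */
		{ "B", WQ_FLAG_EXCLUSIVE },	/* first exclusive waiter       */
		{ "C", WQ_FLAG_EXCLUSIVE },	/* skipped when nr_exclusive==1 */
	};
	int nr_exclusive = 1;
	unsigned int i;

	for (i = 0; i < sizeof(q) / sizeof(q[0]); i++) {
		unsigned int flags = q[i].flags;

		if (wake_one(&q[i]) && (flags & WQ_FLAG_EXCLUSIVE) &&
		    !--nr_exclusive)
			break;
	}
	return 0;
}
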
3913 3913
3914 /** 3914 /**
3915 * __wake_up - wake up threads blocked on a waitqueue. 3915 * __wake_up - wake up threads blocked on a waitqueue.
3916 * @q: the waitqueue 3916 * @q: the waitqueue
3917 * @mode: which threads 3917 * @mode: which threads
3918 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3918 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3919 * @key: is directly passed to the wakeup function 3919 * @key: is directly passed to the wakeup function
3920 * 3920 *
3921 * It may be assumed that this function implies a write memory barrier before 3921 * It may be assumed that this function implies a write memory barrier before
3922 * changing the task state if and only if any tasks are woken up. 3922 * changing the task state if and only if any tasks are woken up.
3923 */ 3923 */
3924 void __wake_up(wait_queue_head_t *q, unsigned int mode, 3924 void __wake_up(wait_queue_head_t *q, unsigned int mode,
3925 int nr_exclusive, void *key) 3925 int nr_exclusive, void *key)
3926 { 3926 {
3927 unsigned long flags; 3927 unsigned long flags;
3928 3928
3929 spin_lock_irqsave(&q->lock, flags); 3929 spin_lock_irqsave(&q->lock, flags);
3930 __wake_up_common(q, mode, nr_exclusive, 0, key); 3930 __wake_up_common(q, mode, nr_exclusive, 0, key);
3931 spin_unlock_irqrestore(&q->lock, flags); 3931 spin_unlock_irqrestore(&q->lock, flags);
3932 } 3932 }
3933 EXPORT_SYMBOL(__wake_up); 3933 EXPORT_SYMBOL(__wake_up);
3934 3934
3935 /* 3935 /*
3936 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 3936 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3937 */ 3937 */
3938 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 3938 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3939 { 3939 {
3940 __wake_up_common(q, mode, 1, 0, NULL); 3940 __wake_up_common(q, mode, 1, 0, NULL);
3941 } 3941 }
3942 3942
3943 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 3943 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3944 { 3944 {
3945 __wake_up_common(q, mode, 1, 0, key); 3945 __wake_up_common(q, mode, 1, 0, key);
3946 } 3946 }
3947 3947
3948 /** 3948 /**
3949 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 3949 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
3950 * @q: the waitqueue 3950 * @q: the waitqueue
3951 * @mode: which threads 3951 * @mode: which threads
3952 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3952 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3953 * @key: opaque value to be passed to wakeup targets 3953 * @key: opaque value to be passed to wakeup targets
3954 * 3954 *
3955 * The sync wakeup differs in that the waker knows that it will schedule 3955 * The sync wakeup differs in that the waker knows that it will schedule
3956 * away soon, so while the target thread will be woken up, it will not 3956 * away soon, so while the target thread will be woken up, it will not
3957 * be migrated to another CPU - ie. the two threads are 'synchronized' 3957 * be migrated to another CPU - ie. the two threads are 'synchronized'
3958 * with each other. This can prevent needless bouncing between CPUs. 3958 * with each other. This can prevent needless bouncing between CPUs.
3959 * 3959 *
3960 * On UP it can prevent extra preemption. 3960 * On UP it can prevent extra preemption.
3961 * 3961 *
3962 * It may be assumed that this function implies a write memory barrier before 3962 * It may be assumed that this function implies a write memory barrier before
3963 * changing the task state if and only if any tasks are woken up. 3963 * changing the task state if and only if any tasks are woken up.
3964 */ 3964 */
3965 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 3965 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
3966 int nr_exclusive, void *key) 3966 int nr_exclusive, void *key)
3967 { 3967 {
3968 unsigned long flags; 3968 unsigned long flags;
3969 int wake_flags = WF_SYNC; 3969 int wake_flags = WF_SYNC;
3970 3970
3971 if (unlikely(!q)) 3971 if (unlikely(!q))
3972 return; 3972 return;
3973 3973
3974 if (unlikely(!nr_exclusive)) 3974 if (unlikely(!nr_exclusive))
3975 wake_flags = 0; 3975 wake_flags = 0;
3976 3976
3977 spin_lock_irqsave(&q->lock, flags); 3977 spin_lock_irqsave(&q->lock, flags);
3978 __wake_up_common(q, mode, nr_exclusive, wake_flags, key); 3978 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
3979 spin_unlock_irqrestore(&q->lock, flags); 3979 spin_unlock_irqrestore(&q->lock, flags);
3980 } 3980 }
3981 EXPORT_SYMBOL_GPL(__wake_up_sync_key); 3981 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
3982 3982
3983 /* 3983 /*
3984 * __wake_up_sync - see __wake_up_sync_key() 3984 * __wake_up_sync - see __wake_up_sync_key()
3985 */ 3985 */
3986 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 3986 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3987 { 3987 {
3988 __wake_up_sync_key(q, mode, nr_exclusive, NULL); 3988 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
3989 } 3989 }
3990 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 3990 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3991 3991
3992 /** 3992 /**
3993 * complete: - signals a single thread waiting on this completion 3993 * complete: - signals a single thread waiting on this completion
3994 * @x: holds the state of this particular completion 3994 * @x: holds the state of this particular completion
3995 * 3995 *
3996 * This will wake up a single thread waiting on this completion. Threads will be 3996 * This will wake up a single thread waiting on this completion. Threads will be
3997 * awakened in the same order in which they were queued. 3997 * awakened in the same order in which they were queued.
3998 * 3998 *
3999 * See also complete_all(), wait_for_completion() and related routines. 3999 * See also complete_all(), wait_for_completion() and related routines.
4000 * 4000 *
4001 * It may be assumed that this function implies a write memory barrier before 4001 * It may be assumed that this function implies a write memory barrier before
4002 * changing the task state if and only if any tasks are woken up. 4002 * changing the task state if and only if any tasks are woken up.
4003 */ 4003 */
4004 void complete(struct completion *x) 4004 void complete(struct completion *x)
4005 { 4005 {
4006 unsigned long flags; 4006 unsigned long flags;
4007 4007
4008 spin_lock_irqsave(&x->wait.lock, flags); 4008 spin_lock_irqsave(&x->wait.lock, flags);
4009 x->done++; 4009 x->done++;
4010 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); 4010 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
4011 spin_unlock_irqrestore(&x->wait.lock, flags); 4011 spin_unlock_irqrestore(&x->wait.lock, flags);
4012 } 4012 }
4013 EXPORT_SYMBOL(complete); 4013 EXPORT_SYMBOL(complete);
4014 4014
4015 /** 4015 /**
4016 * complete_all: - signals all threads waiting on this completion 4016 * complete_all: - signals all threads waiting on this completion
4017 * @x: holds the state of this particular completion 4017 * @x: holds the state of this particular completion
4018 * 4018 *
4019 * This will wake up all threads waiting on this particular completion event. 4019 * This will wake up all threads waiting on this particular completion event.
4020 * 4020 *
4021 * It may be assumed that this function implies a write memory barrier before 4021 * It may be assumed that this function implies a write memory barrier before
4022 * changing the task state if and only if any tasks are woken up. 4022 * changing the task state if and only if any tasks are woken up.
4023 */ 4023 */
4024 void complete_all(struct completion *x) 4024 void complete_all(struct completion *x)
4025 { 4025 {
4026 unsigned long flags; 4026 unsigned long flags;
4027 4027
4028 spin_lock_irqsave(&x->wait.lock, flags); 4028 spin_lock_irqsave(&x->wait.lock, flags);
4029 x->done += UINT_MAX/2; 4029 x->done += UINT_MAX/2;
4030 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); 4030 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
4031 spin_unlock_irqrestore(&x->wait.lock, flags); 4031 spin_unlock_irqrestore(&x->wait.lock, flags);
4032 } 4032 }
4033 EXPORT_SYMBOL(complete_all); 4033 EXPORT_SYMBOL(complete_all);
4034 4034
4035 static inline long __sched 4035 static inline long __sched
4036 do_wait_for_common(struct completion *x, long timeout, int state) 4036 do_wait_for_common(struct completion *x, long timeout, int state)
4037 { 4037 {
4038 if (!x->done) { 4038 if (!x->done) {
4039 DECLARE_WAITQUEUE(wait, current); 4039 DECLARE_WAITQUEUE(wait, current);
4040 4040
4041 wait.flags |= WQ_FLAG_EXCLUSIVE; 4041 wait.flags |= WQ_FLAG_EXCLUSIVE;
4042 __add_wait_queue_tail(&x->wait, &wait); 4042 __add_wait_queue_tail(&x->wait, &wait);
4043 do { 4043 do {
4044 if (signal_pending_state(state, current)) { 4044 if (signal_pending_state(state, current)) {
4045 timeout = -ERESTARTSYS; 4045 timeout = -ERESTARTSYS;
4046 break; 4046 break;
4047 } 4047 }
4048 __set_current_state(state); 4048 __set_current_state(state);
4049 spin_unlock_irq(&x->wait.lock); 4049 spin_unlock_irq(&x->wait.lock);
4050 timeout = schedule_timeout(timeout); 4050 timeout = schedule_timeout(timeout);
4051 spin_lock_irq(&x->wait.lock); 4051 spin_lock_irq(&x->wait.lock);
4052 } while (!x->done && timeout); 4052 } while (!x->done && timeout);
4053 __remove_wait_queue(&x->wait, &wait); 4053 __remove_wait_queue(&x->wait, &wait);
4054 if (!x->done) 4054 if (!x->done)
4055 return timeout; 4055 return timeout;
4056 } 4056 }
4057 x->done--; 4057 x->done--;
4058 return timeout ?: 1; 4058 return timeout ?: 1;
4059 } 4059 }
4060 4060
4061 static long __sched 4061 static long __sched
4062 wait_for_common(struct completion *x, long timeout, int state) 4062 wait_for_common(struct completion *x, long timeout, int state)
4063 { 4063 {
4064 might_sleep(); 4064 might_sleep();
4065 4065
4066 spin_lock_irq(&x->wait.lock); 4066 spin_lock_irq(&x->wait.lock);
4067 timeout = do_wait_for_common(x, timeout, state); 4067 timeout = do_wait_for_common(x, timeout, state);
4068 spin_unlock_irq(&x->wait.lock); 4068 spin_unlock_irq(&x->wait.lock);
4069 return timeout; 4069 return timeout;
4070 } 4070 }
4071 4071
4072 /** 4072 /**
4073 * wait_for_completion: - waits for completion of a task 4073 * wait_for_completion: - waits for completion of a task
4074 * @x: holds the state of this particular completion 4074 * @x: holds the state of this particular completion
4075 * 4075 *
4076 * This waits to be signaled for completion of a specific task. It is NOT 4076 * This waits to be signaled for completion of a specific task. It is NOT
4077 * interruptible and there is no timeout. 4077 * interruptible and there is no timeout.
4078 * 4078 *
4079 * See also similar routines (e.g. wait_for_completion_timeout()) with timeout 4079 * See also similar routines (e.g. wait_for_completion_timeout()) with timeout
4080 * and interrupt capability. Also see complete(). 4080 * and interrupt capability. Also see complete().
4081 */ 4081 */
4082 void __sched wait_for_completion(struct completion *x) 4082 void __sched wait_for_completion(struct completion *x)
4083 { 4083 {
4084 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 4084 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4085 } 4085 }
4086 EXPORT_SYMBOL(wait_for_completion); 4086 EXPORT_SYMBOL(wait_for_completion);
4087 4087
4088 /** 4088 /**
4089 * wait_for_completion_timeout: - waits for completion of a task (w/timeout) 4089 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4090 * @x: holds the state of this particular completion 4090 * @x: holds the state of this particular completion
4091 * @timeout: timeout value in jiffies 4091 * @timeout: timeout value in jiffies
4092 * 4092 *
4093 * This waits for either a completion of a specific task to be signaled or for a 4093 * This waits for either a completion of a specific task to be signaled or for a
4094 * specified timeout to expire. The timeout is in jiffies. It is not 4094 * specified timeout to expire. The timeout is in jiffies. It is not
4095 * interruptible. 4095 * interruptible.
4096 */ 4096 */
4097 unsigned long __sched 4097 unsigned long __sched
4098 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4098 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4099 { 4099 {
4100 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); 4100 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4101 } 4101 }
4102 EXPORT_SYMBOL(wait_for_completion_timeout); 4102 EXPORT_SYMBOL(wait_for_completion_timeout);
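The return value convention matters here: 0 means the timeout elapsed, and any non-zero value is the number of jiffies remaining (at least 1). A short sketch, reusing the hypothetical req from the earlier example:

	unsigned long left;

	left = wait_for_completion_timeout(&req->done, msecs_to_jiffies(500));
	if (!left)
		return -ETIMEDOUT;	/* 500 ms passed without complete() being called */
	/* otherwise the completion fired with 'left' jiffies of budget remaining */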
4103 4103
4104 /** 4104 /**
4105 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 4105 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4106 * @x: holds the state of this particular completion 4106 * @x: holds the state of this particular completion
4107 * 4107 *
4108 * This waits for completion of a specific task to be signaled. It is 4108 * This waits for completion of a specific task to be signaled. It is
4109 * interruptible. 4109 * interruptible.
4110 */ 4110 */
4111 int __sched wait_for_completion_interruptible(struct completion *x) 4111 int __sched wait_for_completion_interruptible(struct completion *x)
4112 { 4112 {
4113 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 4113 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4114 if (t == -ERESTARTSYS) 4114 if (t == -ERESTARTSYS)
4115 return t; 4115 return t;
4116 return 0; 4116 return 0;
4117 } 4117 }
4118 EXPORT_SYMBOL(wait_for_completion_interruptible); 4118 EXPORT_SYMBOL(wait_for_completion_interruptible);
4119 4119
4120 /** 4120 /**
4121 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) 4121 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4122 * @x: holds the state of this particular completion 4122 * @x: holds the state of this particular completion
4123 * @timeout: timeout value in jiffies 4123 * @timeout: timeout value in jiffies
4124 * 4124 *
4125 * This waits for either a completion of a specific task to be signaled or for a 4125 * This waits for either a completion of a specific task to be signaled or for a
4126 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4126 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4127 */ 4127 */
4128 unsigned long __sched 4128 unsigned long __sched
4129 wait_for_completion_interruptible_timeout(struct completion *x, 4129 wait_for_completion_interruptible_timeout(struct completion *x,
4130 unsigned long timeout) 4130 unsigned long timeout)
4131 { 4131 {
4132 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); 4132 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4133 } 4133 }
4134 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4134 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4135 4135
4136 /** 4136 /**
4137 * wait_for_completion_killable: - waits for completion of a task (killable) 4137 * wait_for_completion_killable: - waits for completion of a task (killable)
4138 * @x: holds the state of this particular completion 4138 * @x: holds the state of this particular completion
4139 * 4139 *
4140 * This waits to be signaled for completion of a specific task. It can be 4140 * This waits to be signaled for completion of a specific task. It can be
4141 * interrupted by a kill signal. 4141 * interrupted by a kill signal.
4142 */ 4142 */
4143 int __sched wait_for_completion_killable(struct completion *x) 4143 int __sched wait_for_completion_killable(struct completion *x)
4144 { 4144 {
4145 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 4145 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4146 if (t == -ERESTARTSYS) 4146 if (t == -ERESTARTSYS)
4147 return t; 4147 return t;
4148 return 0; 4148 return 0;
4149 } 4149 }
4150 EXPORT_SYMBOL(wait_for_completion_killable); 4150 EXPORT_SYMBOL(wait_for_completion_killable);
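Both the interruptible and the killable variants return -ERESTARTSYS when a signal arrives first; callers usually just propagate that so the syscall can be restarted. An illustrative fragment:

	int err;

	err = wait_for_completion_interruptible(&req->done);
	if (err)		/* -ERESTARTSYS: interrupted by a signal */
		return err;
	/* 0: the completion was signalled normally */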
4151 4151
4152 /** 4152 /**
4153 * try_wait_for_completion - try to decrement a completion without blocking 4153 * try_wait_for_completion - try to decrement a completion without blocking
4154 * @x: completion structure 4154 * @x: completion structure
4155 * 4155 *
4156 * Returns: 0 if a decrement cannot be done without blocking 4156 * Returns: 0 if a decrement cannot be done without blocking
4157 * 1 if a decrement succeeded. 4157 * 1 if a decrement succeeded.
4158 * 4158 *
4159 * If a completion is being used as a counting completion, 4159 * If a completion is being used as a counting completion,
4160 * attempt to decrement the counter without blocking. This 4160 * attempt to decrement the counter without blocking. This
4161 * enables us to avoid waiting if the resource the completion 4161 * enables us to avoid waiting if the resource the completion
4162 * is protecting is not available. 4162 * is protecting is not available.
4163 */ 4163 */
4164 bool try_wait_for_completion(struct completion *x) 4164 bool try_wait_for_completion(struct completion *x)
4165 { 4165 {
4166 unsigned long flags; 4166 unsigned long flags;
4167 int ret = 1; 4167 int ret = 1;
4168 4168
4169 spin_lock_irqsave(&x->wait.lock, flags); 4169 spin_lock_irqsave(&x->wait.lock, flags);
4170 if (!x->done) 4170 if (!x->done)
4171 ret = 0; 4171 ret = 0;
4172 else 4172 else
4173 x->done--; 4173 x->done--;
4174 spin_unlock_irqrestore(&x->wait.lock, flags); 4174 spin_unlock_irqrestore(&x->wait.lock, flags);
4175 return ret; 4175 return ret;
4176 } 4176 }
4177 EXPORT_SYMBOL(try_wait_for_completion); 4177 EXPORT_SYMBOL(try_wait_for_completion);
4178 4178
4179 /** 4179 /**
4180 * completion_done - Test to see if a completion has any waiters 4180 * completion_done - Test to see if a completion has any waiters
4181 * @x: completion structure 4181 * @x: completion structure
4182 * 4182 *
4183 * Returns: 0 if there are waiters (wait_for_completion() in progress) 4183 * Returns: 0 if there are waiters (wait_for_completion() in progress)
4184 * 1 if there are no waiters. 4184 * 1 if there are no waiters.
4185 * 4185 *
4186 */ 4186 */
4187 bool completion_done(struct completion *x) 4187 bool completion_done(struct completion *x)
4188 { 4188 {
4189 unsigned long flags; 4189 unsigned long flags;
4190 int ret = 1; 4190 int ret = 1;
4191 4191
4192 spin_lock_irqsave(&x->wait.lock, flags); 4192 spin_lock_irqsave(&x->wait.lock, flags);
4193 if (!x->done) 4193 if (!x->done)
4194 ret = 0; 4194 ret = 0;
4195 spin_unlock_irqrestore(&x->wait.lock, flags); 4195 spin_unlock_irqrestore(&x->wait.lock, flags);
4196 return ret; 4196 return ret;
4197 } 4197 }
4198 EXPORT_SYMBOL(completion_done); 4198 EXPORT_SYMBOL(completion_done);
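Used as a counting primitive, try_wait_for_completion() acts as a non-blocking "take one", while completion_done() is a lock-protected peek at whether a waiter would have to block. A sketch with a hypothetical pool whose free slots are counted by complete():

	if (try_wait_for_completion(&pool->slot_free)) {
		/* counter was non-zero: one slot reserved without sleeping */
		use_slot(pool);
	} else {
		/* nothing free yet: sleep until complete() releases a slot */
		wait_for_completion(&pool->slot_free);
		use_slot(pool);
	}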
4199 4199
4200 static long __sched 4200 static long __sched
4201 sleep_on_common(wait_queue_head_t *q, int state, long timeout) 4201 sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4202 { 4202 {
4203 unsigned long flags; 4203 unsigned long flags;
4204 wait_queue_t wait; 4204 wait_queue_t wait;
4205 4205
4206 init_waitqueue_entry(&wait, current); 4206 init_waitqueue_entry(&wait, current);
4207 4207
4208 __set_current_state(state); 4208 __set_current_state(state);
4209 4209
4210 spin_lock_irqsave(&q->lock, flags); 4210 spin_lock_irqsave(&q->lock, flags);
4211 __add_wait_queue(q, &wait); 4211 __add_wait_queue(q, &wait);
4212 spin_unlock(&q->lock); 4212 spin_unlock(&q->lock);
4213 timeout = schedule_timeout(timeout); 4213 timeout = schedule_timeout(timeout);
4214 spin_lock_irq(&q->lock); 4214 spin_lock_irq(&q->lock);
4215 __remove_wait_queue(q, &wait); 4215 __remove_wait_queue(q, &wait);
4216 spin_unlock_irqrestore(&q->lock, flags); 4216 spin_unlock_irqrestore(&q->lock, flags);
4217 4217
4218 return timeout; 4218 return timeout;
4219 } 4219 }
4220 4220
4221 void __sched interruptible_sleep_on(wait_queue_head_t *q) 4221 void __sched interruptible_sleep_on(wait_queue_head_t *q)
4222 { 4222 {
4223 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 4223 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4224 } 4224 }
4225 EXPORT_SYMBOL(interruptible_sleep_on); 4225 EXPORT_SYMBOL(interruptible_sleep_on);
4226 4226
4227 long __sched 4227 long __sched
4228 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 4228 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4229 { 4229 {
4230 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); 4230 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
4231 } 4231 }
4232 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 4232 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4233 4233
4234 void __sched sleep_on(wait_queue_head_t *q) 4234 void __sched sleep_on(wait_queue_head_t *q)
4235 { 4235 {
4236 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 4236 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4237 } 4237 }
4238 EXPORT_SYMBOL(sleep_on); 4238 EXPORT_SYMBOL(sleep_on);
4239 4239
4240 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 4240 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4241 { 4241 {
4242 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); 4242 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4243 } 4243 }
4244 EXPORT_SYMBOL(sleep_on_timeout); 4244 EXPORT_SYMBOL(sleep_on_timeout);
4245 4245
4246 #ifdef CONFIG_RT_MUTEXES 4246 #ifdef CONFIG_RT_MUTEXES
4247 4247
4248 /* 4248 /*
4249 * rt_mutex_setprio - set the current priority of a task 4249 * rt_mutex_setprio - set the current priority of a task
4250 * @p: task 4250 * @p: task
4251 * @prio: prio value (kernel-internal form) 4251 * @prio: prio value (kernel-internal form)
4252 * 4252 *
4253 * This function changes the 'effective' priority of a task. It does 4253 * This function changes the 'effective' priority of a task. It does
4254 * not touch ->normal_prio like __setscheduler(). 4254 * not touch ->normal_prio like __setscheduler().
4255 * 4255 *
4256 * Used by the rt_mutex code to implement priority inheritance logic. 4256 * Used by the rt_mutex code to implement priority inheritance logic.
4257 */ 4257 */
4258 void rt_mutex_setprio(struct task_struct *p, int prio) 4258 void rt_mutex_setprio(struct task_struct *p, int prio)
4259 { 4259 {
4260 unsigned long flags; 4260 unsigned long flags;
4261 int oldprio, on_rq, running; 4261 int oldprio, on_rq, running;
4262 struct rq *rq; 4262 struct rq *rq;
4263 const struct sched_class *prev_class; 4263 const struct sched_class *prev_class;
4264 4264
4265 BUG_ON(prio < 0 || prio > MAX_PRIO); 4265 BUG_ON(prio < 0 || prio > MAX_PRIO);
4266 4266
4267 rq = task_rq_lock(p, &flags); 4267 rq = task_rq_lock(p, &flags);
4268 update_rq_clock(rq); 4268 update_rq_clock(rq);
4269 4269
4270 oldprio = p->prio; 4270 oldprio = p->prio;
4271 prev_class = p->sched_class; 4271 prev_class = p->sched_class;
4272 on_rq = p->se.on_rq; 4272 on_rq = p->se.on_rq;
4273 running = task_current(rq, p); 4273 running = task_current(rq, p);
4274 if (on_rq) 4274 if (on_rq)
4275 dequeue_task(rq, p, 0); 4275 dequeue_task(rq, p, 0);
4276 if (running) 4276 if (running)
4277 p->sched_class->put_prev_task(rq, p); 4277 p->sched_class->put_prev_task(rq, p);
4278 4278
4279 if (rt_prio(prio)) 4279 if (rt_prio(prio))
4280 p->sched_class = &rt_sched_class; 4280 p->sched_class = &rt_sched_class;
4281 else 4281 else
4282 p->sched_class = &fair_sched_class; 4282 p->sched_class = &fair_sched_class;
4283 4283
4284 p->prio = prio; 4284 p->prio = prio;
4285 4285
4286 if (running) 4286 if (running)
4287 p->sched_class->set_curr_task(rq); 4287 p->sched_class->set_curr_task(rq);
4288 if (on_rq) { 4288 if (on_rq) {
4289 enqueue_task(rq, p, 0, oldprio < prio); 4289 enqueue_task(rq, p, 0, oldprio < prio);
4290 4290
4291 check_class_changed(rq, p, prev_class, oldprio, running); 4291 check_class_changed(rq, p, prev_class, oldprio, running);
4292 } 4292 }
4293 task_rq_unlock(rq, &flags); 4293 task_rq_unlock(rq, &flags);
4294 } 4294 }
4295 4295
4296 #endif 4296 #endif
4297 4297
4298 void set_user_nice(struct task_struct *p, long nice) 4298 void set_user_nice(struct task_struct *p, long nice)
4299 { 4299 {
4300 int old_prio, delta, on_rq; 4300 int old_prio, delta, on_rq;
4301 unsigned long flags; 4301 unsigned long flags;
4302 struct rq *rq; 4302 struct rq *rq;
4303 4303
4304 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 4304 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4305 return; 4305 return;
4306 /* 4306 /*
4307 * We have to be careful, if called from sys_setpriority(), 4307 * We have to be careful, if called from sys_setpriority(),
4308 * the task might be in the middle of scheduling on another CPU. 4308 * the task might be in the middle of scheduling on another CPU.
4309 */ 4309 */
4310 rq = task_rq_lock(p, &flags); 4310 rq = task_rq_lock(p, &flags);
4311 update_rq_clock(rq); 4311 update_rq_clock(rq);
4312 /* 4312 /*
4313 * The RT priorities are set via sched_setscheduler(), but we still 4313 * The RT priorities are set via sched_setscheduler(), but we still
4314 * allow the 'normal' nice value to be set - but as expected 4314 * allow the 'normal' nice value to be set - but as expected
4315 * it won't have any effect on scheduling until the task is 4315 * it won't have any effect on scheduling until the task is
4316 * SCHED_FIFO/SCHED_RR: 4316 * SCHED_FIFO/SCHED_RR:
4317 */ 4317 */
4318 if (task_has_rt_policy(p)) { 4318 if (task_has_rt_policy(p)) {
4319 p->static_prio = NICE_TO_PRIO(nice); 4319 p->static_prio = NICE_TO_PRIO(nice);
4320 goto out_unlock; 4320 goto out_unlock;
4321 } 4321 }
4322 on_rq = p->se.on_rq; 4322 on_rq = p->se.on_rq;
4323 if (on_rq) 4323 if (on_rq)
4324 dequeue_task(rq, p, 0); 4324 dequeue_task(rq, p, 0);
4325 4325
4326 p->static_prio = NICE_TO_PRIO(nice); 4326 p->static_prio = NICE_TO_PRIO(nice);
4327 set_load_weight(p); 4327 set_load_weight(p);
4328 old_prio = p->prio; 4328 old_prio = p->prio;
4329 p->prio = effective_prio(p); 4329 p->prio = effective_prio(p);
4330 delta = p->prio - old_prio; 4330 delta = p->prio - old_prio;
4331 4331
4332 if (on_rq) { 4332 if (on_rq) {
4333 enqueue_task(rq, p, 0, false); 4333 enqueue_task(rq, p, 0, false);
4334 /* 4334 /*
4335 * If the task increased its priority or is running and 4335 * If the task increased its priority or is running and
4336 * lowered its priority, then reschedule its CPU: 4336 * lowered its priority, then reschedule its CPU:
4337 */ 4337 */
4338 if (delta < 0 || (delta > 0 && task_running(rq, p))) 4338 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4339 resched_task(rq->curr); 4339 resched_task(rq->curr);
4340 } 4340 }
4341 out_unlock: 4341 out_unlock:
4342 task_rq_unlock(rq, &flags); 4342 task_rq_unlock(rq, &flags);
4343 } 4343 }
4344 EXPORT_SYMBOL(set_user_nice); 4344 EXPORT_SYMBOL(set_user_nice);
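In-kernel callers typically use set_user_nice() to deprioritise a helper kthread. A hypothetical sketch (the worker function and thread name are illustrative):

	struct task_struct *tsk;

	tsk = kthread_create(my_worker_fn, NULL, "my_worker");
	if (!IS_ERR(tsk)) {
		set_user_nice(tsk, 19);		/* lowest weight; valid range is -20..19 */
		wake_up_process(tsk);
	}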
4345 4345
4346 /* 4346 /*
4347 * can_nice - check if a task can reduce its nice value 4347 * can_nice - check if a task can reduce its nice value
4348 * @p: task 4348 * @p: task
4349 * @nice: nice value 4349 * @nice: nice value
4350 */ 4350 */
4351 int can_nice(const struct task_struct *p, const int nice) 4351 int can_nice(const struct task_struct *p, const int nice)
4352 { 4352 {
4353 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4353 /* convert nice value [19,-20] to rlimit style value [1,40] */
4354 int nice_rlim = 20 - nice; 4354 int nice_rlim = 20 - nice;
4355 4355
4356 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 4356 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
4357 capable(CAP_SYS_NICE)); 4357 capable(CAP_SYS_NICE));
4358 } 4358 }
4359 4359
4360 #ifdef __ARCH_WANT_SYS_NICE 4360 #ifdef __ARCH_WANT_SYS_NICE
4361 4361
4362 /* 4362 /*
4363 * sys_nice - change the priority of the current process. 4363 * sys_nice - change the priority of the current process.
4364 * @increment: priority increment 4364 * @increment: priority increment
4365 * 4365 *
4366 * sys_setpriority is a more generic, but much slower function that 4366 * sys_setpriority is a more generic, but much slower function that
4367 * does similar things. 4367 * does similar things.
4368 */ 4368 */
4369 SYSCALL_DEFINE1(nice, int, increment) 4369 SYSCALL_DEFINE1(nice, int, increment)
4370 { 4370 {
4371 long nice, retval; 4371 long nice, retval;
4372 4372
4373 /* 4373 /*
4374 * Setpriority might change our priority at the same moment. 4374 * Setpriority might change our priority at the same moment.
4375 * We don't have to worry. Conceptually one call occurs first 4375 * We don't have to worry. Conceptually one call occurs first
4376 * and we have a single winner. 4376 * and we have a single winner.
4377 */ 4377 */
4378 if (increment < -40) 4378 if (increment < -40)
4379 increment = -40; 4379 increment = -40;
4380 if (increment > 40) 4380 if (increment > 40)
4381 increment = 40; 4381 increment = 40;
4382 4382
4383 nice = TASK_NICE(current) + increment; 4383 nice = TASK_NICE(current) + increment;
4384 if (nice < -20) 4384 if (nice < -20)
4385 nice = -20; 4385 nice = -20;
4386 if (nice > 19) 4386 if (nice > 19)
4387 nice = 19; 4387 nice = 19;
4388 4388
4389 if (increment < 0 && !can_nice(current, nice)) 4389 if (increment < 0 && !can_nice(current, nice))
4390 return -EPERM; 4390 return -EPERM;
4391 4391
4392 retval = security_task_setnice(current, nice); 4392 retval = security_task_setnice(current, nice);
4393 if (retval) 4393 if (retval)
4394 return retval; 4394 return retval;
4395 4395
4396 set_user_nice(current, nice); 4396 set_user_nice(current, nice);
4397 return 0; 4397 return 0;
4398 } 4398 }
4399 4399
4400 #endif 4400 #endif
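From userspace this path is reached through nice(2): the increment is clamped to [-40, 40], the resulting value to [-20, 19], and lowering the value needs RLIMIT_NICE headroom or CAP_SYS_NICE, as checked above. A small userspace sketch (glibc returns the new nice value and sets errno on failure):

#include <unistd.h>
#include <errno.h>
#include <stdio.h>

int main(void)
{
	errno = 0;
	int nv = nice(5);		/* become 5 levels "nicer" */
	if (nv == -1 && errno)		/* -1 is a legal result, so errno decides */
		perror("nice");
	else
		printf("new nice value: %d\n", nv);
	return 0;
}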
4401 4401
4402 /** 4402 /**
4403 * task_prio - return the priority value of a given task. 4403 * task_prio - return the priority value of a given task.
4404 * @p: the task in question. 4404 * @p: the task in question.
4405 * 4405 *
4406 * This is the priority value as seen by users in /proc. 4406 * This is the priority value as seen by users in /proc.
4407 * RT tasks are offset by -200. Normal tasks are centered 4407 * RT tasks are offset by -200. Normal tasks are centered
4408 * around 0, value goes from -16 to +15. 4408 * around 0, value goes from -16 to +15.
4409 */ 4409 */
4410 int task_prio(const struct task_struct *p) 4410 int task_prio(const struct task_struct *p)
4411 { 4411 {
4412 return p->prio - MAX_RT_PRIO; 4412 return p->prio - MAX_RT_PRIO;
4413 } 4413 }
4414 4414
4415 /** 4415 /**
4416 * task_nice - return the nice value of a given task. 4416 * task_nice - return the nice value of a given task.
4417 * @p: the task in question. 4417 * @p: the task in question.
4418 */ 4418 */
4419 int task_nice(const struct task_struct *p) 4419 int task_nice(const struct task_struct *p)
4420 { 4420 {
4421 return TASK_NICE(p); 4421 return TASK_NICE(p);
4422 } 4422 }
4423 EXPORT_SYMBOL(task_nice); 4423 EXPORT_SYMBOL(task_nice);
4424 4424
4425 /** 4425 /**
4426 * idle_cpu - is a given cpu idle currently? 4426 * idle_cpu - is a given cpu idle currently?
4427 * @cpu: the processor in question. 4427 * @cpu: the processor in question.
4428 */ 4428 */
4429 int idle_cpu(int cpu) 4429 int idle_cpu(int cpu)
4430 { 4430 {
4431 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 4431 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4432 } 4432 }
4433 4433
4434 /** 4434 /**
4435 * idle_task - return the idle task for a given cpu. 4435 * idle_task - return the idle task for a given cpu.
4436 * @cpu: the processor in question. 4436 * @cpu: the processor in question.
4437 */ 4437 */
4438 struct task_struct *idle_task(int cpu) 4438 struct task_struct *idle_task(int cpu)
4439 { 4439 {
4440 return cpu_rq(cpu)->idle; 4440 return cpu_rq(cpu)->idle;
4441 } 4441 }
4442 4442
4443 /** 4443 /**
4444 * find_process_by_pid - find a process with a matching PID value. 4444 * find_process_by_pid - find a process with a matching PID value.
4445 * @pid: the pid in question. 4445 * @pid: the pid in question.
4446 */ 4446 */
4447 static struct task_struct *find_process_by_pid(pid_t pid) 4447 static struct task_struct *find_process_by_pid(pid_t pid)
4448 { 4448 {
4449 return pid ? find_task_by_vpid(pid) : current; 4449 return pid ? find_task_by_vpid(pid) : current;
4450 } 4450 }
4451 4451
4452 /* Actually do priority change: must hold rq lock. */ 4452 /* Actually do priority change: must hold rq lock. */
4453 static void 4453 static void
4454 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 4454 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4455 { 4455 {
4456 BUG_ON(p->se.on_rq); 4456 BUG_ON(p->se.on_rq);
4457 4457
4458 p->policy = policy; 4458 p->policy = policy;
4459 p->rt_priority = prio; 4459 p->rt_priority = prio;
4460 p->normal_prio = normal_prio(p); 4460 p->normal_prio = normal_prio(p);
4461 /* we are holding p->pi_lock already */ 4461 /* we are holding p->pi_lock already */
4462 p->prio = rt_mutex_getprio(p); 4462 p->prio = rt_mutex_getprio(p);
4463 if (rt_prio(p->prio)) 4463 if (rt_prio(p->prio))
4464 p->sched_class = &rt_sched_class; 4464 p->sched_class = &rt_sched_class;
4465 else 4465 else
4466 p->sched_class = &fair_sched_class; 4466 p->sched_class = &fair_sched_class;
4467 set_load_weight(p); 4467 set_load_weight(p);
4468 } 4468 }
4469 4469
4470 /* 4470 /*
4471 * check the target process has a UID that matches the current process's 4471 * check the target process has a UID that matches the current process's
4472 */ 4472 */
4473 static bool check_same_owner(struct task_struct *p) 4473 static bool check_same_owner(struct task_struct *p)
4474 { 4474 {
4475 const struct cred *cred = current_cred(), *pcred; 4475 const struct cred *cred = current_cred(), *pcred;
4476 bool match; 4476 bool match;
4477 4477
4478 rcu_read_lock(); 4478 rcu_read_lock();
4479 pcred = __task_cred(p); 4479 pcred = __task_cred(p);
4480 match = (cred->euid == pcred->euid || 4480 match = (cred->euid == pcred->euid ||
4481 cred->euid == pcred->uid); 4481 cred->euid == pcred->uid);
4482 rcu_read_unlock(); 4482 rcu_read_unlock();
4483 return match; 4483 return match;
4484 } 4484 }
4485 4485
4486 static int __sched_setscheduler(struct task_struct *p, int policy, 4486 static int __sched_setscheduler(struct task_struct *p, int policy,
4487 struct sched_param *param, bool user) 4487 struct sched_param *param, bool user)
4488 { 4488 {
4489 int retval, oldprio, oldpolicy = -1, on_rq, running; 4489 int retval, oldprio, oldpolicy = -1, on_rq, running;
4490 unsigned long flags; 4490 unsigned long flags;
4491 const struct sched_class *prev_class; 4491 const struct sched_class *prev_class;
4492 struct rq *rq; 4492 struct rq *rq;
4493 int reset_on_fork; 4493 int reset_on_fork;
4494 4494
4495 /* may grab non-irq protected spin_locks */ 4495 /* may grab non-irq protected spin_locks */
4496 BUG_ON(in_interrupt()); 4496 BUG_ON(in_interrupt());
4497 recheck: 4497 recheck:
4498 /* double check policy once rq lock held */ 4498 /* double check policy once rq lock held */
4499 if (policy < 0) { 4499 if (policy < 0) {
4500 reset_on_fork = p->sched_reset_on_fork; 4500 reset_on_fork = p->sched_reset_on_fork;
4501 policy = oldpolicy = p->policy; 4501 policy = oldpolicy = p->policy;
4502 } else { 4502 } else {
4503 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 4503 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
4504 policy &= ~SCHED_RESET_ON_FORK; 4504 policy &= ~SCHED_RESET_ON_FORK;
4505 4505
4506 if (policy != SCHED_FIFO && policy != SCHED_RR && 4506 if (policy != SCHED_FIFO && policy != SCHED_RR &&
4507 policy != SCHED_NORMAL && policy != SCHED_BATCH && 4507 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4508 policy != SCHED_IDLE) 4508 policy != SCHED_IDLE)
4509 return -EINVAL; 4509 return -EINVAL;
4510 } 4510 }
4511 4511
4512 /* 4512 /*
4513 * Valid priorities for SCHED_FIFO and SCHED_RR are 4513 * Valid priorities for SCHED_FIFO and SCHED_RR are
4514 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 4514 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4515 * SCHED_BATCH and SCHED_IDLE is 0. 4515 * SCHED_BATCH and SCHED_IDLE is 0.
4516 */ 4516 */
4517 if (param->sched_priority < 0 || 4517 if (param->sched_priority < 0 ||
4518 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4518 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4519 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4519 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4520 return -EINVAL; 4520 return -EINVAL;
4521 if (rt_policy(policy) != (param->sched_priority != 0)) 4521 if (rt_policy(policy) != (param->sched_priority != 0))
4522 return -EINVAL; 4522 return -EINVAL;
4523 4523
4524 /* 4524 /*
4525 * Allow unprivileged RT tasks to decrease priority: 4525 * Allow unprivileged RT tasks to decrease priority:
4526 */ 4526 */
4527 if (user && !capable(CAP_SYS_NICE)) { 4527 if (user && !capable(CAP_SYS_NICE)) {
4528 if (rt_policy(policy)) { 4528 if (rt_policy(policy)) {
4529 unsigned long rlim_rtprio; 4529 unsigned long rlim_rtprio;
4530 4530
4531 if (!lock_task_sighand(p, &flags)) 4531 if (!lock_task_sighand(p, &flags))
4532 return -ESRCH; 4532 return -ESRCH;
4533 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); 4533 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
4534 unlock_task_sighand(p, &flags); 4534 unlock_task_sighand(p, &flags);
4535 4535
4536 /* can't set/change the rt policy */ 4536 /* can't set/change the rt policy */
4537 if (policy != p->policy && !rlim_rtprio) 4537 if (policy != p->policy && !rlim_rtprio)
4538 return -EPERM; 4538 return -EPERM;
4539 4539
4540 /* can't increase priority */ 4540 /* can't increase priority */
4541 if (param->sched_priority > p->rt_priority && 4541 if (param->sched_priority > p->rt_priority &&
4542 param->sched_priority > rlim_rtprio) 4542 param->sched_priority > rlim_rtprio)
4543 return -EPERM; 4543 return -EPERM;
4544 } 4544 }
4545 /* 4545 /*
4546 * Like positive nice levels, don't allow tasks to 4546 * Like positive nice levels, don't allow tasks to
4547 * move out of SCHED_IDLE either: 4547 * move out of SCHED_IDLE either:
4548 */ 4548 */
4549 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 4549 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4550 return -EPERM; 4550 return -EPERM;
4551 4551
4552 /* can't change other user's priorities */ 4552 /* can't change other user's priorities */
4553 if (!check_same_owner(p)) 4553 if (!check_same_owner(p))
4554 return -EPERM; 4554 return -EPERM;
4555 4555
4556 /* Normal users shall not reset the sched_reset_on_fork flag */ 4556 /* Normal users shall not reset the sched_reset_on_fork flag */
4557 if (p->sched_reset_on_fork && !reset_on_fork) 4557 if (p->sched_reset_on_fork && !reset_on_fork)
4558 return -EPERM; 4558 return -EPERM;
4559 } 4559 }
4560 4560
4561 if (user) { 4561 if (user) {
4562 #ifdef CONFIG_RT_GROUP_SCHED 4562 #ifdef CONFIG_RT_GROUP_SCHED
4563 /* 4563 /*
4564 * Do not allow realtime tasks into groups that have no runtime 4564 * Do not allow realtime tasks into groups that have no runtime
4565 * assigned. 4565 * assigned.
4566 */ 4566 */
4567 if (rt_bandwidth_enabled() && rt_policy(policy) && 4567 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4568 task_group(p)->rt_bandwidth.rt_runtime == 0) 4568 task_group(p)->rt_bandwidth.rt_runtime == 0)
4569 return -EPERM; 4569 return -EPERM;
4570 #endif 4570 #endif
4571 4571
4572 retval = security_task_setscheduler(p, policy, param); 4572 retval = security_task_setscheduler(p, policy, param);
4573 if (retval) 4573 if (retval)
4574 return retval; 4574 return retval;
4575 } 4575 }
4576 4576
4577 /* 4577 /*
4578 * make sure no PI-waiters arrive (or leave) while we are 4578 * make sure no PI-waiters arrive (or leave) while we are
4579 * changing the priority of the task: 4579 * changing the priority of the task:
4580 */ 4580 */
4581 raw_spin_lock_irqsave(&p->pi_lock, flags); 4581 raw_spin_lock_irqsave(&p->pi_lock, flags);
4582 /* 4582 /*
4583 * To be able to change p->policy safely, the appropriate 4583 * To be able to change p->policy safely, the appropriate
4584 * runqueue lock must be held. 4584 * runqueue lock must be held.
4585 */ 4585 */
4586 rq = __task_rq_lock(p); 4586 rq = __task_rq_lock(p);
4587 /* recheck policy now with rq lock held */ 4587 /* recheck policy now with rq lock held */
4588 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4588 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4589 policy = oldpolicy = -1; 4589 policy = oldpolicy = -1;
4590 __task_rq_unlock(rq); 4590 __task_rq_unlock(rq);
4591 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4591 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4592 goto recheck; 4592 goto recheck;
4593 } 4593 }
4594 update_rq_clock(rq); 4594 update_rq_clock(rq);
4595 on_rq = p->se.on_rq; 4595 on_rq = p->se.on_rq;
4596 running = task_current(rq, p); 4596 running = task_current(rq, p);
4597 if (on_rq) 4597 if (on_rq)
4598 deactivate_task(rq, p, 0); 4598 deactivate_task(rq, p, 0);
4599 if (running) 4599 if (running)
4600 p->sched_class->put_prev_task(rq, p); 4600 p->sched_class->put_prev_task(rq, p);
4601 4601
4602 p->sched_reset_on_fork = reset_on_fork; 4602 p->sched_reset_on_fork = reset_on_fork;
4603 4603
4604 oldprio = p->prio; 4604 oldprio = p->prio;
4605 prev_class = p->sched_class; 4605 prev_class = p->sched_class;
4606 __setscheduler(rq, p, policy, param->sched_priority); 4606 __setscheduler(rq, p, policy, param->sched_priority);
4607 4607
4608 if (running) 4608 if (running)
4609 p->sched_class->set_curr_task(rq); 4609 p->sched_class->set_curr_task(rq);
4610 if (on_rq) { 4610 if (on_rq) {
4611 activate_task(rq, p, 0); 4611 activate_task(rq, p, 0);
4612 4612
4613 check_class_changed(rq, p, prev_class, oldprio, running); 4613 check_class_changed(rq, p, prev_class, oldprio, running);
4614 } 4614 }
4615 __task_rq_unlock(rq); 4615 __task_rq_unlock(rq);
4616 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4616 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4617 4617
4618 rt_mutex_adjust_pi(p); 4618 rt_mutex_adjust_pi(p);
4619 4619
4620 return 0; 4620 return 0;
4621 } 4621 }
4622 4622
4623 /** 4623 /**
4624 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 4624 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4625 * @p: the task in question. 4625 * @p: the task in question.
4626 * @policy: new policy. 4626 * @policy: new policy.
4627 * @param: structure containing the new RT priority. 4627 * @param: structure containing the new RT priority.
4628 * 4628 *
4629 * NOTE that the task may be already dead. 4629 * NOTE that the task may be already dead.
4630 */ 4630 */
4631 int sched_setscheduler(struct task_struct *p, int policy, 4631 int sched_setscheduler(struct task_struct *p, int policy,
4632 struct sched_param *param) 4632 struct sched_param *param)
4633 { 4633 {
4634 return __sched_setscheduler(p, policy, param, true); 4634 return __sched_setscheduler(p, policy, param, true);
4635 } 4635 }
4636 EXPORT_SYMBOL_GPL(sched_setscheduler); 4636 EXPORT_SYMBOL_GPL(sched_setscheduler);
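A common in-kernel use is promoting a latency-critical kernel thread to a real-time policy; sched_setscheduler_nocheck() below exists for callers such as stop_machine() that must skip the capability check. A hypothetical sketch:

	static int make_thread_rt(struct task_struct *tsk)
	{
		struct sched_param sp = { .sched_priority = MAX_USER_RT_PRIO / 2 };

		/* switch tsk to SCHED_FIFO at a mid-range RT priority */
		return sched_setscheduler(tsk, SCHED_FIFO, &sp);
	}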
4637 4637
4638 /** 4638 /**
4639 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 4639 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
4640 * @p: the task in question. 4640 * @p: the task in question.
4641 * @policy: new policy. 4641 * @policy: new policy.
4642 * @param: structure containing the new RT priority. 4642 * @param: structure containing the new RT priority.
4643 * 4643 *
4644 * Just like sched_setscheduler, only don't bother checking if the 4644 * Just like sched_setscheduler, only don't bother checking if the
4645 * current context has permission. For example, this is needed in 4645 * current context has permission. For example, this is needed in
4646 * stop_machine(): we create temporary high priority worker threads, 4646 * stop_machine(): we create temporary high priority worker threads,
4647 * but our caller might not have that capability. 4647 * but our caller might not have that capability.
4648 */ 4648 */
4649 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4649 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4650 struct sched_param *param) 4650 struct sched_param *param)
4651 { 4651 {
4652 return __sched_setscheduler(p, policy, param, false); 4652 return __sched_setscheduler(p, policy, param, false);
4653 } 4653 }
4654 4654
4655 static int 4655 static int
4656 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 4656 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4657 { 4657 {
4658 struct sched_param lparam; 4658 struct sched_param lparam;
4659 struct task_struct *p; 4659 struct task_struct *p;
4660 int retval; 4660 int retval;
4661 4661
4662 if (!param || pid < 0) 4662 if (!param || pid < 0)
4663 return -EINVAL; 4663 return -EINVAL;
4664 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 4664 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4665 return -EFAULT; 4665 return -EFAULT;
4666 4666
4667 rcu_read_lock(); 4667 rcu_read_lock();
4668 retval = -ESRCH; 4668 retval = -ESRCH;
4669 p = find_process_by_pid(pid); 4669 p = find_process_by_pid(pid);
4670 if (p != NULL) 4670 if (p != NULL)
4671 retval = sched_setscheduler(p, policy, &lparam); 4671 retval = sched_setscheduler(p, policy, &lparam);
4672 rcu_read_unlock(); 4672 rcu_read_unlock();
4673 4673
4674 return retval; 4674 return retval;
4675 } 4675 }
4676 4676
4677 /** 4677 /**
4678 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 4678 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4679 * @pid: the pid in question. 4679 * @pid: the pid in question.
4680 * @policy: new policy. 4680 * @policy: new policy.
4681 * @param: structure containing the new RT priority. 4681 * @param: structure containing the new RT priority.
4682 */ 4682 */
4683 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 4683 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4684 struct sched_param __user *, param) 4684 struct sched_param __user *, param)
4685 { 4685 {
4686 /* negative values for policy are not valid */ 4686 /* negative values for policy are not valid */
4687 if (policy < 0) 4687 if (policy < 0)
4688 return -EINVAL; 4688 return -EINVAL;
4689 4689
4690 return do_sched_setscheduler(pid, policy, param); 4690 return do_sched_setscheduler(pid, policy, param);
4691 } 4691 }
4692 4692
4693 /** 4693 /**
4694 * sys_sched_setparam - set/change the RT priority of a thread 4694 * sys_sched_setparam - set/change the RT priority of a thread
4695 * @pid: the pid in question. 4695 * @pid: the pid in question.
4696 * @param: structure containing the new RT priority. 4696 * @param: structure containing the new RT priority.
4697 */ 4697 */
4698 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 4698 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4699 { 4699 {
4700 return do_sched_setscheduler(pid, -1, param); 4700 return do_sched_setscheduler(pid, -1, param);
4701 } 4701 }
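From userspace the two syscalls above are normally reached through the C library wrappers. An illustrative sketch requesting SCHED_FIFO priority 10 for the calling thread (needs CAP_SYS_NICE or RLIMIT_RTPRIO headroom, per the checks in __sched_setscheduler()):

#include <sched.h>
#include <stdio.h>

int become_fifo(void)
{
	struct sched_param sp = { .sched_priority = 10 };

	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
		perror("sched_setscheduler");	/* EPERM without privilege */
		return -1;
	}
	return 0;
}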
4702 4702
4703 /** 4703 /**
4704 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 4704 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4705 * @pid: the pid in question. 4705 * @pid: the pid in question.
4706 */ 4706 */
4707 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 4707 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4708 { 4708 {
4709 struct task_struct *p; 4709 struct task_struct *p;
4710 int retval; 4710 int retval;
4711 4711
4712 if (pid < 0) 4712 if (pid < 0)
4713 return -EINVAL; 4713 return -EINVAL;
4714 4714
4715 retval = -ESRCH; 4715 retval = -ESRCH;
4716 rcu_read_lock(); 4716 rcu_read_lock();
4717 p = find_process_by_pid(pid); 4717 p = find_process_by_pid(pid);
4718 if (p) { 4718 if (p) {
4719 retval = security_task_getscheduler(p); 4719 retval = security_task_getscheduler(p);
4720 if (!retval) 4720 if (!retval)
4721 retval = p->policy 4721 retval = p->policy
4722 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 4722 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4723 } 4723 }
4724 rcu_read_unlock(); 4724 rcu_read_unlock();
4725 return retval; 4725 return retval;
4726 } 4726 }
4727 4727
4728 /** 4728 /**
4729 * sys_sched_getparam - get the RT priority of a thread 4729 * sys_sched_getparam - get the RT priority of a thread
4730 * @pid: the pid in question. 4730 * @pid: the pid in question.
4731 * @param: structure containing the RT priority. 4731 * @param: structure containing the RT priority.
4732 */ 4732 */
4733 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 4733 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4734 { 4734 {
4735 struct sched_param lp; 4735 struct sched_param lp;
4736 struct task_struct *p; 4736 struct task_struct *p;
4737 int retval; 4737 int retval;
4738 4738
4739 if (!param || pid < 0) 4739 if (!param || pid < 0)
4740 return -EINVAL; 4740 return -EINVAL;
4741 4741
4742 rcu_read_lock(); 4742 rcu_read_lock();
4743 p = find_process_by_pid(pid); 4743 p = find_process_by_pid(pid);
4744 retval = -ESRCH; 4744 retval = -ESRCH;
4745 if (!p) 4745 if (!p)
4746 goto out_unlock; 4746 goto out_unlock;
4747 4747
4748 retval = security_task_getscheduler(p); 4748 retval = security_task_getscheduler(p);
4749 if (retval) 4749 if (retval)
4750 goto out_unlock; 4750 goto out_unlock;
4751 4751
4752 lp.sched_priority = p->rt_priority; 4752 lp.sched_priority = p->rt_priority;
4753 rcu_read_unlock(); 4753 rcu_read_unlock();
4754 4754
4755 /* 4755 /*
4756 * This one might sleep, we cannot do it with a spinlock held ... 4756 * This one might sleep, we cannot do it with a spinlock held ...
4757 */ 4757 */
4758 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 4758 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4759 4759
4760 return retval; 4760 return retval;
4761 4761
4762 out_unlock: 4762 out_unlock:
4763 rcu_read_unlock(); 4763 rcu_read_unlock();
4764 return retval; 4764 return retval;
4765 } 4765 }
4766 4766
4767 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 4767 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4768 { 4768 {
4769 cpumask_var_t cpus_allowed, new_mask; 4769 cpumask_var_t cpus_allowed, new_mask;
4770 struct task_struct *p; 4770 struct task_struct *p;
4771 int retval; 4771 int retval;
4772 4772
4773 get_online_cpus(); 4773 get_online_cpus();
4774 rcu_read_lock(); 4774 rcu_read_lock();
4775 4775
4776 p = find_process_by_pid(pid); 4776 p = find_process_by_pid(pid);
4777 if (!p) { 4777 if (!p) {
4778 rcu_read_unlock(); 4778 rcu_read_unlock();
4779 put_online_cpus(); 4779 put_online_cpus();
4780 return -ESRCH; 4780 return -ESRCH;
4781 } 4781 }
4782 4782
4783 /* Prevent p going away */ 4783 /* Prevent p going away */
4784 get_task_struct(p); 4784 get_task_struct(p);
4785 rcu_read_unlock(); 4785 rcu_read_unlock();
4786 4786
4787 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4787 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4788 retval = -ENOMEM; 4788 retval = -ENOMEM;
4789 goto out_put_task; 4789 goto out_put_task;
4790 } 4790 }
4791 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 4791 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4792 retval = -ENOMEM; 4792 retval = -ENOMEM;
4793 goto out_free_cpus_allowed; 4793 goto out_free_cpus_allowed;
4794 } 4794 }
4795 retval = -EPERM; 4795 retval = -EPERM;
4796 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 4796 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
4797 goto out_unlock; 4797 goto out_unlock;
4798 4798
4799 retval = security_task_setscheduler(p, 0, NULL); 4799 retval = security_task_setscheduler(p, 0, NULL);
4800 if (retval) 4800 if (retval)
4801 goto out_unlock; 4801 goto out_unlock;
4802 4802
4803 cpuset_cpus_allowed(p, cpus_allowed); 4803 cpuset_cpus_allowed(p, cpus_allowed);
4804 cpumask_and(new_mask, in_mask, cpus_allowed); 4804 cpumask_and(new_mask, in_mask, cpus_allowed);
4805 again: 4805 again:
4806 retval = set_cpus_allowed_ptr(p, new_mask); 4806 retval = set_cpus_allowed_ptr(p, new_mask);
4807 4807
4808 if (!retval) { 4808 if (!retval) {
4809 cpuset_cpus_allowed(p, cpus_allowed); 4809 cpuset_cpus_allowed(p, cpus_allowed);
4810 if (!cpumask_subset(new_mask, cpus_allowed)) { 4810 if (!cpumask_subset(new_mask, cpus_allowed)) {
4811 /* 4811 /*
4812 * We must have raced with a concurrent cpuset 4812 * We must have raced with a concurrent cpuset
4813 * update. Just reset the cpus_allowed to the 4813 * update. Just reset the cpus_allowed to the
4814 * cpuset's cpus_allowed 4814 * cpuset's cpus_allowed
4815 */ 4815 */
4816 cpumask_copy(new_mask, cpus_allowed); 4816 cpumask_copy(new_mask, cpus_allowed);
4817 goto again; 4817 goto again;
4818 } 4818 }
4819 } 4819 }
4820 out_unlock: 4820 out_unlock:
4821 free_cpumask_var(new_mask); 4821 free_cpumask_var(new_mask);
4822 out_free_cpus_allowed: 4822 out_free_cpus_allowed:
4823 free_cpumask_var(cpus_allowed); 4823 free_cpumask_var(cpus_allowed);
4824 out_put_task: 4824 out_put_task:
4825 put_task_struct(p); 4825 put_task_struct(p);
4826 put_online_cpus(); 4826 put_online_cpus();
4827 return retval; 4827 return retval;
4828 } 4828 }
4829 4829
4830 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 4830 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4831 struct cpumask *new_mask) 4831 struct cpumask *new_mask)
4832 { 4832 {
4833 if (len < cpumask_size()) 4833 if (len < cpumask_size())
4834 cpumask_clear(new_mask); 4834 cpumask_clear(new_mask);
4835 else if (len > cpumask_size()) 4835 else if (len > cpumask_size())
4836 len = cpumask_size(); 4836 len = cpumask_size();
4837 4837
4838 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 4838 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4839 } 4839 }
4840 4840
4841 /** 4841 /**
4842 * sys_sched_setaffinity - set the cpu affinity of a process 4842 * sys_sched_setaffinity - set the cpu affinity of a process
4843 * @pid: pid of the process 4843 * @pid: pid of the process
4844 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4844 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4845 * @user_mask_ptr: user-space pointer to the new cpu mask 4845 * @user_mask_ptr: user-space pointer to the new cpu mask
4846 */ 4846 */
4847 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 4847 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4848 unsigned long __user *, user_mask_ptr) 4848 unsigned long __user *, user_mask_ptr)
4849 { 4849 {
4850 cpumask_var_t new_mask; 4850 cpumask_var_t new_mask;
4851 int retval; 4851 int retval;
4852 4852
4853 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 4853 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4854 return -ENOMEM; 4854 return -ENOMEM;
4855 4855
4856 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 4856 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4857 if (retval == 0) 4857 if (retval == 0)
4858 retval = sched_setaffinity(pid, new_mask); 4858 retval = sched_setaffinity(pid, new_mask);
4859 free_cpumask_var(new_mask); 4859 free_cpumask_var(new_mask);
4860 return retval; 4860 return retval;
4861 } 4861 }
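The matching userspace call goes through the glibc wrapper with a cpu_set_t. An illustrative sketch pinning the caller to CPU 0:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int pin_to_cpu0(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);
	if (sched_setaffinity(0, sizeof(set), &set) == -1) {
		perror("sched_setaffinity");
		return -1;
	}
	return 0;
}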
4862 4862
4863 long sched_getaffinity(pid_t pid, struct cpumask *mask) 4863 long sched_getaffinity(pid_t pid, struct cpumask *mask)
4864 { 4864 {
4865 struct task_struct *p; 4865 struct task_struct *p;
4866 unsigned long flags; 4866 unsigned long flags;
4867 struct rq *rq; 4867 struct rq *rq;
4868 int retval; 4868 int retval;
4869 4869
4870 get_online_cpus(); 4870 get_online_cpus();
4871 rcu_read_lock(); 4871 rcu_read_lock();
4872 4872
4873 retval = -ESRCH; 4873 retval = -ESRCH;
4874 p = find_process_by_pid(pid); 4874 p = find_process_by_pid(pid);
4875 if (!p) 4875 if (!p)
4876 goto out_unlock; 4876 goto out_unlock;
4877 4877
4878 retval = security_task_getscheduler(p); 4878 retval = security_task_getscheduler(p);
4879 if (retval) 4879 if (retval)
4880 goto out_unlock; 4880 goto out_unlock;
4881 4881
4882 rq = task_rq_lock(p, &flags); 4882 rq = task_rq_lock(p, &flags);
4883 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 4883 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
4884 task_rq_unlock(rq, &flags); 4884 task_rq_unlock(rq, &flags);
4885 4885
4886 out_unlock: 4886 out_unlock:
4887 rcu_read_unlock(); 4887 rcu_read_unlock();
4888 put_online_cpus(); 4888 put_online_cpus();
4889 4889
4890 return retval; 4890 return retval;
4891 } 4891 }
4892 4892
4893 /** 4893 /**
4894 * sys_sched_getaffinity - get the cpu affinity of a process 4894 * sys_sched_getaffinity - get the cpu affinity of a process
4895 * @pid: pid of the process 4895 * @pid: pid of the process
4896 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4896 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4897 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4897 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4898 */ 4898 */
4899 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 4899 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4900 unsigned long __user *, user_mask_ptr) 4900 unsigned long __user *, user_mask_ptr)
4901 { 4901 {
4902 int ret; 4902 int ret;
4903 cpumask_var_t mask; 4903 cpumask_var_t mask;
4904 4904
4905 if (len < nr_cpu_ids) 4905 if (len < nr_cpu_ids)
4906 return -EINVAL; 4906 return -EINVAL;
4907 if (len & (sizeof(unsigned long)-1)) 4907 if (len & (sizeof(unsigned long)-1))
4908 return -EINVAL; 4908 return -EINVAL;
4909 4909
4910 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4910 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4911 return -ENOMEM; 4911 return -ENOMEM;
4912 4912
4913 ret = sched_getaffinity(pid, mask); 4913 ret = sched_getaffinity(pid, mask);
4914 if (ret == 0) { 4914 if (ret == 0) {
4915 size_t retlen = min_t(size_t, len, cpumask_size()); 4915 size_t retlen = min_t(size_t, len, cpumask_size());
4916 4916
4917 if (copy_to_user(user_mask_ptr, mask, retlen)) 4917 if (copy_to_user(user_mask_ptr, mask, retlen))
4918 ret = -EFAULT; 4918 ret = -EFAULT;
4919 else 4919 else
4920 ret = retlen; 4920 ret = retlen;
4921 } 4921 }
4922 free_cpumask_var(mask); 4922 free_cpumask_var(mask);
4923 4923
4924 return ret; 4924 return ret;
4925 } 4925 }
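Note that the raw syscall returns retlen (the number of mask bytes copied out) on success, whereas the glibc sched_getaffinity() wrapper is generally expected to turn that into 0. A userspace sketch using the wrapper:

	cpu_set_t set;
	int cpu;

	CPU_ZERO(&set);
	if (sched_getaffinity(0, sizeof(set), &set) == 0) {
		for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
			if (CPU_ISSET(cpu, &set))
				printf("allowed on cpu %d\n", cpu);
	}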
4926 4926
4927 /** 4927 /**
4928 * sys_sched_yield - yield the current processor to other threads. 4928 * sys_sched_yield - yield the current processor to other threads.
4929 * 4929 *
4930 * This function yields the current CPU to other tasks. If there are no 4930 * This function yields the current CPU to other tasks. If there are no
4931 * other threads running on this CPU then this function will return. 4931 * other threads running on this CPU then this function will return.
4932 */ 4932 */
4933 SYSCALL_DEFINE0(sched_yield) 4933 SYSCALL_DEFINE0(sched_yield)
4934 { 4934 {
4935 struct rq *rq = this_rq_lock(); 4935 struct rq *rq = this_rq_lock();
4936 4936
4937 schedstat_inc(rq, yld_count); 4937 schedstat_inc(rq, yld_count);
4938 current->sched_class->yield_task(rq); 4938 current->sched_class->yield_task(rq);
4939 4939
4940 /* 4940 /*
4941 * Since we are going to call schedule() anyway, there's 4941 * Since we are going to call schedule() anyway, there's
4942 * no need to preempt or enable interrupts: 4942 * no need to preempt or enable interrupts:
4943 */ 4943 */
4944 __release(rq->lock); 4944 __release(rq->lock);
4945 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4945 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4946 do_raw_spin_unlock(&rq->lock); 4946 do_raw_spin_unlock(&rq->lock);
4947 preempt_enable_no_resched(); 4947 preempt_enable_no_resched();
4948 4948
4949 schedule(); 4949 schedule();
4950 4950
4951 return 0; 4951 return 0;
4952 } 4952 }
4953 4953
4954 static inline int should_resched(void) 4954 static inline int should_resched(void)
4955 { 4955 {
4956 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); 4956 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
4957 } 4957 }
4958 4958
4959 static void __cond_resched(void) 4959 static void __cond_resched(void)
4960 { 4960 {
4961 add_preempt_count(PREEMPT_ACTIVE); 4961 add_preempt_count(PREEMPT_ACTIVE);
4962 schedule(); 4962 schedule();
4963 sub_preempt_count(PREEMPT_ACTIVE); 4963 sub_preempt_count(PREEMPT_ACTIVE);
4964 } 4964 }
4965 4965
4966 int __sched _cond_resched(void) 4966 int __sched _cond_resched(void)
4967 { 4967 {
4968 if (should_resched()) { 4968 if (should_resched()) {
4969 __cond_resched(); 4969 __cond_resched();
4970 return 1; 4970 return 1;
4971 } 4971 }
4972 return 0; 4972 return 0;
4973 } 4973 }
4974 EXPORT_SYMBOL(_cond_resched); 4974 EXPORT_SYMBOL(_cond_resched);
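cond_resched(), the wrapper around _cond_resched(), is sprinkled into long kernel loops so that other tasks get a chance to run even without CONFIG_PREEMPT. An illustrative sketch with hypothetical helpers:

	static void process_many_items(struct my_ctx *ctx)
	{
		int i;

		for (i = 0; i < ctx->nr_items; i++) {
			handle_item(ctx, i);
			cond_resched();	/* schedule() only if need_resched() is set */
		}
	}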
4975 4975
4976 /* 4976 /*
4977 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 4977 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
4978 * call schedule, and on return reacquire the lock. 4978 * call schedule, and on return reacquire the lock.
4979 * 4979 *
4980 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4980 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4981 * operations here to prevent schedule() from being called twice (once via 4981 * operations here to prevent schedule() from being called twice (once via
4982 * spin_unlock(), once by hand). 4982 * spin_unlock(), once by hand).
4983 */ 4983 */
4984 int __cond_resched_lock(spinlock_t *lock) 4984 int __cond_resched_lock(spinlock_t *lock)
4985 { 4985 {
4986 int resched = should_resched(); 4986 int resched = should_resched();
4987 int ret = 0; 4987 int ret = 0;
4988 4988
4989 lockdep_assert_held(lock); 4989 lockdep_assert_held(lock);
4990 4990
4991 if (spin_needbreak(lock) || resched) { 4991 if (spin_needbreak(lock) || resched) {
4992 spin_unlock(lock); 4992 spin_unlock(lock);
4993 if (resched) 4993 if (resched)
4994 __cond_resched(); 4994 __cond_resched();
4995 else 4995 else
4996 cpu_relax(); 4996 cpu_relax();
4997 ret = 1; 4997 ret = 1;
4998 spin_lock(lock); 4998 spin_lock(lock);
4999 } 4999 }
5000 return ret; 5000 return ret;
5001 } 5001 }
5002 EXPORT_SYMBOL(__cond_resched_lock); 5002 EXPORT_SYMBOL(__cond_resched_lock);
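cond_resched_lock(), the wrapper around __cond_resched_lock(), is handy when draining a long list under a spinlock: the lock may be dropped and retaken, so the sketch below always re-reads the list head afterwards. Names are hypothetical:

	spin_lock(&dev->list_lock);
	while (!list_empty(&dev->items)) {
		struct my_item *item;

		item = list_first_entry(&dev->items, struct my_item, node);
		list_del(&item->node);
		handle_item_locked(item);	/* must not sleep: lock is held */
		/* drop the lock briefly if contended or a reschedule is due */
		cond_resched_lock(&dev->list_lock);
	}
	spin_unlock(&dev->list_lock);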
5003 5003
5004 int __sched __cond_resched_softirq(void) 5004 int __sched __cond_resched_softirq(void)
5005 { 5005 {
5006 BUG_ON(!in_softirq()); 5006 BUG_ON(!in_softirq());
5007 5007
5008 if (should_resched()) { 5008 if (should_resched()) {
5009 local_bh_enable(); 5009 local_bh_enable();
5010 __cond_resched(); 5010 __cond_resched();
5011 local_bh_disable(); 5011 local_bh_disable();
5012 return 1; 5012 return 1;
5013 } 5013 }
5014 return 0; 5014 return 0;
5015 } 5015 }
5016 EXPORT_SYMBOL(__cond_resched_softirq); 5016 EXPORT_SYMBOL(__cond_resched_softirq);
5017 5017
5018 /** 5018 /**
5019 * yield - yield the current processor to other threads. 5019 * yield - yield the current processor to other threads.
5020 * 5020 *
5021 * This is a shortcut for kernel-space yielding - it marks the 5021 * This is a shortcut for kernel-space yielding - it marks the
5022 * thread runnable and calls sys_sched_yield(). 5022 * thread runnable and calls sys_sched_yield().
5023 */ 5023 */
5024 void __sched yield(void) 5024 void __sched yield(void)
5025 { 5025 {
5026 set_current_state(TASK_RUNNING); 5026 set_current_state(TASK_RUNNING);
5027 sys_sched_yield(); 5027 sys_sched_yield();
5028 } 5028 }
5029 EXPORT_SYMBOL(yield); 5029 EXPORT_SYMBOL(yield);
5030 5030
5031 /* 5031 /*
5032 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5032 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5033 * that process accounting knows that this is a task in IO wait state. 5033 * that process accounting knows that this is a task in IO wait state.
5034 */ 5034 */
5035 void __sched io_schedule(void) 5035 void __sched io_schedule(void)
5036 { 5036 {
5037 struct rq *rq = raw_rq(); 5037 struct rq *rq = raw_rq();
5038 5038
5039 delayacct_blkio_start(); 5039 delayacct_blkio_start();
5040 atomic_inc(&rq->nr_iowait); 5040 atomic_inc(&rq->nr_iowait);
5041 current->in_iowait = 1; 5041 current->in_iowait = 1;
5042 schedule(); 5042 schedule();
5043 current->in_iowait = 0; 5043 current->in_iowait = 0;
5044 atomic_dec(&rq->nr_iowait); 5044 atomic_dec(&rq->nr_iowait);
5045 delayacct_blkio_end(); 5045 delayacct_blkio_end();
5046 } 5046 }
5047 EXPORT_SYMBOL(io_schedule); 5047 EXPORT_SYMBOL(io_schedule);
5048 5048
5049 long __sched io_schedule_timeout(long timeout) 5049 long __sched io_schedule_timeout(long timeout)
5050 { 5050 {
5051 struct rq *rq = raw_rq(); 5051 struct rq *rq = raw_rq();
5052 long ret; 5052 long ret;
5053 5053
5054 delayacct_blkio_start(); 5054 delayacct_blkio_start();
5055 atomic_inc(&rq->nr_iowait); 5055 atomic_inc(&rq->nr_iowait);
5056 current->in_iowait = 1; 5056 current->in_iowait = 1;
5057 ret = schedule_timeout(timeout); 5057 ret = schedule_timeout(timeout);
5058 current->in_iowait = 0; 5058 current->in_iowait = 0;
5059 atomic_dec(&rq->nr_iowait); 5059 atomic_dec(&rq->nr_iowait);
5060 delayacct_blkio_end(); 5060 delayacct_blkio_end();
5061 return ret; 5061 return ret;
5062 } 5062 }
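Illustrative sketch (not part of this change): a driver-style wait loop that calls io_schedule() instead of schedule() so the sleep is charged to rq->nr_iowait and block-I/O delay accounting. The request type and its waitq/done fields are hypothetical.

static void example_wait_for_request(struct example_request *req)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&req->waitq, &wait, TASK_UNINTERRUPTIBLE);
		if (req->done)
			break;
		io_schedule();
	}
	finish_wait(&req->waitq, &wait);
}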
5063 5063
5064 /** 5064 /**
5065 * sys_sched_get_priority_max - return maximum RT priority. 5065 * sys_sched_get_priority_max - return maximum RT priority.
5066 * @policy: scheduling class. 5066 * @policy: scheduling class.
5067 * 5067 *
5068 * This syscall returns the maximum rt_priority that can be used 5068 * This syscall returns the maximum rt_priority that can be used
5069 * by a given scheduling class. 5069 * by a given scheduling class.
5070 */ 5070 */
5071 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 5071 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5072 { 5072 {
5073 int ret = -EINVAL; 5073 int ret = -EINVAL;
5074 5074
5075 switch (policy) { 5075 switch (policy) {
5076 case SCHED_FIFO: 5076 case SCHED_FIFO:
5077 case SCHED_RR: 5077 case SCHED_RR:
5078 ret = MAX_USER_RT_PRIO-1; 5078 ret = MAX_USER_RT_PRIO-1;
5079 break; 5079 break;
5080 case SCHED_NORMAL: 5080 case SCHED_NORMAL:
5081 case SCHED_BATCH: 5081 case SCHED_BATCH:
5082 case SCHED_IDLE: 5082 case SCHED_IDLE:
5083 ret = 0; 5083 ret = 0;
5084 break; 5084 break;
5085 } 5085 }
5086 return ret; 5086 return ret;
5087 } 5087 }
5088 5088
5089 /** 5089 /**
5090 * sys_sched_get_priority_min - return minimum RT priority. 5090 * sys_sched_get_priority_min - return minimum RT priority.
5091 * @policy: scheduling class. 5091 * @policy: scheduling class.
5092 * 5092 *
5093 * This syscall returns the minimum rt_priority that can be used 5093 * This syscall returns the minimum rt_priority that can be used
5094 * by a given scheduling class. 5094 * by a given scheduling class.
5095 */ 5095 */
5096 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 5096 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5097 { 5097 {
5098 int ret = -EINVAL; 5098 int ret = -EINVAL;
5099 5099
5100 switch (policy) { 5100 switch (policy) {
5101 case SCHED_FIFO: 5101 case SCHED_FIFO:
5102 case SCHED_RR: 5102 case SCHED_RR:
5103 ret = 1; 5103 ret = 1;
5104 break; 5104 break;
5105 case SCHED_NORMAL: 5105 case SCHED_NORMAL:
5106 case SCHED_BATCH: 5106 case SCHED_BATCH:
5107 case SCHED_IDLE: 5107 case SCHED_IDLE:
5108 ret = 0; 5108 ret = 0;
5109 } 5109 }
5110 return ret; 5110 return ret;
5111 } 5111 }
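From user space these two syscalls are reached through the POSIX wrappers in <sched.h>; a minimal sketch (error handling elided), which reports 1..99 for SCHED_FIFO with the default MAX_USER_RT_PRIO of 100:

#include <sched.h>
#include <stdio.h>

int main(void)
{
	int max = sched_get_priority_max(SCHED_FIFO);
	int min = sched_get_priority_min(SCHED_FIFO);

	printf("SCHED_FIFO priority range: %d..%d\n", min, max);
	return 0;
}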
5112 5112
5113 /** 5113 /**
5114 * sys_sched_rr_get_interval - return the default timeslice of a process. 5114 * sys_sched_rr_get_interval - return the default timeslice of a process.
5115 * @pid: pid of the process. 5115 * @pid: pid of the process.
5116 * @interval: userspace pointer to the timeslice value. 5116 * @interval: userspace pointer to the timeslice value.
5117 * 5117 *
5118 * This syscall writes the default timeslice value of a given process 5118 * This syscall writes the default timeslice value of a given process
5119 * into the user-space timespec buffer. A value of '0' means infinity. 5119 * into the user-space timespec buffer. A value of '0' means infinity.
5120 */ 5120 */
5121 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 5121 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5122 struct timespec __user *, interval) 5122 struct timespec __user *, interval)
5123 { 5123 {
5124 struct task_struct *p; 5124 struct task_struct *p;
5125 unsigned int time_slice; 5125 unsigned int time_slice;
5126 unsigned long flags; 5126 unsigned long flags;
5127 struct rq *rq; 5127 struct rq *rq;
5128 int retval; 5128 int retval;
5129 struct timespec t; 5129 struct timespec t;
5130 5130
5131 if (pid < 0) 5131 if (pid < 0)
5132 return -EINVAL; 5132 return -EINVAL;
5133 5133
5134 retval = -ESRCH; 5134 retval = -ESRCH;
5135 rcu_read_lock(); 5135 rcu_read_lock();
5136 p = find_process_by_pid(pid); 5136 p = find_process_by_pid(pid);
5137 if (!p) 5137 if (!p)
5138 goto out_unlock; 5138 goto out_unlock;
5139 5139
5140 retval = security_task_getscheduler(p); 5140 retval = security_task_getscheduler(p);
5141 if (retval) 5141 if (retval)
5142 goto out_unlock; 5142 goto out_unlock;
5143 5143
5144 rq = task_rq_lock(p, &flags); 5144 rq = task_rq_lock(p, &flags);
5145 time_slice = p->sched_class->get_rr_interval(rq, p); 5145 time_slice = p->sched_class->get_rr_interval(rq, p);
5146 task_rq_unlock(rq, &flags); 5146 task_rq_unlock(rq, &flags);
5147 5147
5148 rcu_read_unlock(); 5148 rcu_read_unlock();
5149 jiffies_to_timespec(time_slice, &t); 5149 jiffies_to_timespec(time_slice, &t);
5150 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 5150 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5151 return retval; 5151 return retval;
5152 5152
5153 out_unlock: 5153 out_unlock:
5154 rcu_read_unlock(); 5154 rcu_read_unlock();
5155 return retval; 5155 return retval;
5156 } 5156 }
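A matching user-space sketch (not part of this change): glibc exposes this as sched_rr_get_interval(); pid 0 queries the calling thread, and a zero result means an infinite timeslice, as noted in the comment above.

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) == 0)
		printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}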
5157 5157
5158 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 5158 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5159 5159
5160 void sched_show_task(struct task_struct *p) 5160 void sched_show_task(struct task_struct *p)
5161 { 5161 {
5162 unsigned long free = 0; 5162 unsigned long free = 0;
5163 unsigned state; 5163 unsigned state;
5164 5164
5165 state = p->state ? __ffs(p->state) + 1 : 0; 5165 state = p->state ? __ffs(p->state) + 1 : 0;
5166 printk(KERN_INFO "%-13.13s %c", p->comm, 5166 printk(KERN_INFO "%-13.13s %c", p->comm,
5167 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5167 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5168 #if BITS_PER_LONG == 32 5168 #if BITS_PER_LONG == 32
5169 if (state == TASK_RUNNING) 5169 if (state == TASK_RUNNING)
5170 printk(KERN_CONT " running "); 5170 printk(KERN_CONT " running ");
5171 else 5171 else
5172 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 5172 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5173 #else 5173 #else
5174 if (state == TASK_RUNNING) 5174 if (state == TASK_RUNNING)
5175 printk(KERN_CONT " running task "); 5175 printk(KERN_CONT " running task ");
5176 else 5176 else
5177 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 5177 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5178 #endif 5178 #endif
5179 #ifdef CONFIG_DEBUG_STACK_USAGE 5179 #ifdef CONFIG_DEBUG_STACK_USAGE
5180 free = stack_not_used(p); 5180 free = stack_not_used(p);
5181 #endif 5181 #endif
5182 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 5182 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5183 task_pid_nr(p), task_pid_nr(p->real_parent), 5183 task_pid_nr(p), task_pid_nr(p->real_parent),
5184 (unsigned long)task_thread_info(p)->flags); 5184 (unsigned long)task_thread_info(p)->flags);
5185 5185
5186 show_stack(p, NULL); 5186 show_stack(p, NULL);
5187 } 5187 }
5188 5188
5189 void show_state_filter(unsigned long state_filter) 5189 void show_state_filter(unsigned long state_filter)
5190 { 5190 {
5191 struct task_struct *g, *p; 5191 struct task_struct *g, *p;
5192 5192
5193 #if BITS_PER_LONG == 32 5193 #if BITS_PER_LONG == 32
5194 printk(KERN_INFO 5194 printk(KERN_INFO
5195 " task PC stack pid father\n"); 5195 " task PC stack pid father\n");
5196 #else 5196 #else
5197 printk(KERN_INFO 5197 printk(KERN_INFO
5198 " task PC stack pid father\n"); 5198 " task PC stack pid father\n");
5199 #endif 5199 #endif
5200 read_lock(&tasklist_lock); 5200 read_lock(&tasklist_lock);
5201 do_each_thread(g, p) { 5201 do_each_thread(g, p) {
5202 /* 5202 /*
5203 * reset the NMI-timeout, listing all tasks on a slow 5203 * reset the NMI-timeout, listing all tasks on a slow
5204 * console might take a lot of time: 5204 * console might take a lot of time:
5205 */ 5205 */
5206 touch_nmi_watchdog(); 5206 touch_nmi_watchdog();
5207 if (!state_filter || (p->state & state_filter)) 5207 if (!state_filter || (p->state & state_filter))
5208 sched_show_task(p); 5208 sched_show_task(p);
5209 } while_each_thread(g, p); 5209 } while_each_thread(g, p);
5210 5210
5211 touch_all_softlockup_watchdogs(); 5211 touch_all_softlockup_watchdogs();
5212 5212
5213 #ifdef CONFIG_SCHED_DEBUG 5213 #ifdef CONFIG_SCHED_DEBUG
5214 sysrq_sched_debug_show(); 5214 sysrq_sched_debug_show();
5215 #endif 5215 #endif
5216 read_unlock(&tasklist_lock); 5216 read_unlock(&tasklist_lock);
5217 /* 5217 /*
5218 * Only show locks if all tasks are dumped: 5218 * Only show locks if all tasks are dumped:
5219 */ 5219 */
5220 if (!state_filter) 5220 if (!state_filter)
5221 debug_show_all_locks(); 5221 debug_show_all_locks();
5222 } 5222 }
5223 5223
5224 void __cpuinit init_idle_bootup_task(struct task_struct *idle) 5224 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5225 { 5225 {
5226 idle->sched_class = &idle_sched_class; 5226 idle->sched_class = &idle_sched_class;
5227 } 5227 }
5228 5228
5229 /** 5229 /**
5230 * init_idle - set up an idle thread for a given CPU 5230 * init_idle - set up an idle thread for a given CPU
5231 * @idle: task in question 5231 * @idle: task in question
5232 * @cpu: cpu the idle task belongs to 5232 * @cpu: cpu the idle task belongs to
5233 * 5233 *
5234 * NOTE: this function does not set the idle thread's NEED_RESCHED 5234 * NOTE: this function does not set the idle thread's NEED_RESCHED
5235 * flag, to make booting more robust. 5235 * flag, to make booting more robust.
5236 */ 5236 */
5237 void __cpuinit init_idle(struct task_struct *idle, int cpu) 5237 void __cpuinit init_idle(struct task_struct *idle, int cpu)
5238 { 5238 {
5239 struct rq *rq = cpu_rq(cpu); 5239 struct rq *rq = cpu_rq(cpu);
5240 unsigned long flags; 5240 unsigned long flags;
5241 5241
5242 raw_spin_lock_irqsave(&rq->lock, flags); 5242 raw_spin_lock_irqsave(&rq->lock, flags);
5243 5243
5244 __sched_fork(idle); 5244 __sched_fork(idle);
5245 idle->state = TASK_RUNNING; 5245 idle->state = TASK_RUNNING;
5246 idle->se.exec_start = sched_clock(); 5246 idle->se.exec_start = sched_clock();
5247 5247
5248 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5248 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5249 __set_task_cpu(idle, cpu); 5249 __set_task_cpu(idle, cpu);
5250 5250
5251 rq->curr = rq->idle = idle; 5251 rq->curr = rq->idle = idle;
5252 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5252 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
5253 idle->oncpu = 1; 5253 idle->oncpu = 1;
5254 #endif 5254 #endif
5255 raw_spin_unlock_irqrestore(&rq->lock, flags); 5255 raw_spin_unlock_irqrestore(&rq->lock, flags);
5256 5256
5257 /* Set the preempt count _outside_ the spinlocks! */ 5257 /* Set the preempt count _outside_ the spinlocks! */
5258 #if defined(CONFIG_PREEMPT) 5258 #if defined(CONFIG_PREEMPT)
5259 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); 5259 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5260 #else 5260 #else
5261 task_thread_info(idle)->preempt_count = 0; 5261 task_thread_info(idle)->preempt_count = 0;
5262 #endif 5262 #endif
5263 /* 5263 /*
5264 * The idle tasks have their own, simple scheduling class: 5264 * The idle tasks have their own, simple scheduling class:
5265 */ 5265 */
5266 idle->sched_class = &idle_sched_class; 5266 idle->sched_class = &idle_sched_class;
5267 ftrace_graph_init_task(idle); 5267 ftrace_graph_init_task(idle);
5268 } 5268 }
5269 5269
5270 /* 5270 /*
5271 * In a system that switches off the HZ timer nohz_cpu_mask 5271 * In a system that switches off the HZ timer nohz_cpu_mask
5272 * indicates which cpus entered this state. This is used 5272 * indicates which cpus entered this state. This is used
5273 * in the rcu update to wait only for active cpus. For systems 5273 * in the rcu update to wait only for active cpus. For systems
5274 * which do not switch off the HZ timer nohz_cpu_mask should 5274 * which do not switch off the HZ timer nohz_cpu_mask should
5275 * always be CPU_BITS_NONE. 5275 * always be CPU_BITS_NONE.
5276 */ 5276 */
5277 cpumask_var_t nohz_cpu_mask; 5277 cpumask_var_t nohz_cpu_mask;
5278 5278
5279 /* 5279 /*
5280 * Increase the granularity value when there are more CPUs, 5280 * Increase the granularity value when there are more CPUs,
5281 * because with more CPUs the 'effective latency' as visible 5281 * because with more CPUs the 'effective latency' as visible
5282 * to users decreases. But the relationship is not linear, 5282 * to users decreases. But the relationship is not linear,
5283 * so pick a second-best guess by going with the log2 of the 5283 * so pick a second-best guess by going with the log2 of the
5284 * number of CPUs. 5284 * number of CPUs.
5285 * 5285 *
5286 * This idea comes from the SD scheduler of Con Kolivas: 5286 * This idea comes from the SD scheduler of Con Kolivas:
5287 */ 5287 */
5288 static int get_update_sysctl_factor(void) 5288 static int get_update_sysctl_factor(void)
5289 { 5289 {
5290 unsigned int cpus = min_t(int, num_online_cpus(), 8); 5290 unsigned int cpus = min_t(int, num_online_cpus(), 8);
5291 unsigned int factor; 5291 unsigned int factor;
5292 5292
5293 switch (sysctl_sched_tunable_scaling) { 5293 switch (sysctl_sched_tunable_scaling) {
5294 case SCHED_TUNABLESCALING_NONE: 5294 case SCHED_TUNABLESCALING_NONE:
5295 factor = 1; 5295 factor = 1;
5296 break; 5296 break;
5297 case SCHED_TUNABLESCALING_LINEAR: 5297 case SCHED_TUNABLESCALING_LINEAR:
5298 factor = cpus; 5298 factor = cpus;
5299 break; 5299 break;
5300 case SCHED_TUNABLESCALING_LOG: 5300 case SCHED_TUNABLESCALING_LOG:
5301 default: 5301 default:
5302 factor = 1 + ilog2(cpus); 5302 factor = 1 + ilog2(cpus);
5303 break; 5303 break;
5304 } 5304 }
5305 5305
5306 return factor; 5306 return factor;
5307 } 5307 }
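Concretely, with the SCHED_TUNABLESCALING_LOG policy (the default case in the switch above) and the CPU count clamped to 8, the factor works out to:

  online CPUs : 1   2   4   8   16+
  factor      : 1   2   3   4   4

so, for example, a normalized latency of 6ms is scaled to 24ms on an 8-CPU machine.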
5308 5308
5309 static void update_sysctl(void) 5309 static void update_sysctl(void)
5310 { 5310 {
5311 unsigned int factor = get_update_sysctl_factor(); 5311 unsigned int factor = get_update_sysctl_factor();
5312 5312
5313 #define SET_SYSCTL(name) \ 5313 #define SET_SYSCTL(name) \
5314 (sysctl_##name = (factor) * normalized_sysctl_##name) 5314 (sysctl_##name = (factor) * normalized_sysctl_##name)
5315 SET_SYSCTL(sched_min_granularity); 5315 SET_SYSCTL(sched_min_granularity);
5316 SET_SYSCTL(sched_latency); 5316 SET_SYSCTL(sched_latency);
5317 SET_SYSCTL(sched_wakeup_granularity); 5317 SET_SYSCTL(sched_wakeup_granularity);
5318 SET_SYSCTL(sched_shares_ratelimit); 5318 SET_SYSCTL(sched_shares_ratelimit);
5319 #undef SET_SYSCTL 5319 #undef SET_SYSCTL
5320 } 5320 }
5321 5321
5322 static inline void sched_init_granularity(void) 5322 static inline void sched_init_granularity(void)
5323 { 5323 {
5324 update_sysctl(); 5324 update_sysctl();
5325 } 5325 }
5326 5326
5327 #ifdef CONFIG_SMP 5327 #ifdef CONFIG_SMP
5328 /* 5328 /*
5329 * This is how migration works: 5329 * This is how migration works:
5330 * 5330 *
5331 * 1) we queue a struct migration_req structure in the source CPU's 5331 * 1) we queue a struct migration_req structure in the source CPU's
5332 * runqueue and wake up that CPU's migration thread. 5332 * runqueue and wake up that CPU's migration thread.
5333 * 2) we down() the locked semaphore => thread blocks. 5333 * 2) we down() the locked semaphore => thread blocks.
5334 * 3) migration thread wakes up (implicitly it forces the migrated 5334 * 3) migration thread wakes up (implicitly it forces the migrated
5335 * thread off the CPU) 5335 * thread off the CPU)
5336 * 4) it gets the migration request and checks whether the migrated 5336 * 4) it gets the migration request and checks whether the migrated
5337 * task is still in the wrong runqueue. 5337 * task is still in the wrong runqueue.
5338 * 5) if it's in the wrong runqueue then the migration thread removes 5338 * 5) if it's in the wrong runqueue then the migration thread removes
5339 * it and puts it into the right queue. 5339 * it and puts it into the right queue.
5340 * 6) migration thread up()s the semaphore. 5340 * 6) migration thread up()s the semaphore.
5341 * 7) we wake up and the migration is done. 5341 * 7) we wake up and the migration is done.
5342 */ 5342 */
5343 5343
5344 /* 5344 /*
5345 * Change a given task's CPU affinity. Migrate the thread to a 5345 * Change a given task's CPU affinity. Migrate the thread to a
5346 * proper CPU and schedule it away if the CPU it's executing on 5346 * proper CPU and schedule it away if the CPU it's executing on
5347 * is removed from the allowed bitmask. 5347 * is removed from the allowed bitmask.
5348 * 5348 *
5349 * NOTE: the caller must have a valid reference to the task, the 5349 * NOTE: the caller must have a valid reference to the task, the
5350 * task must not exit() & deallocate itself prematurely. The 5350 * task must not exit() & deallocate itself prematurely. The
5351 * call is not atomic; no spinlocks may be held. 5351 * call is not atomic; no spinlocks may be held.
5352 */ 5352 */
5353 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5353 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5354 { 5354 {
5355 struct migration_req req; 5355 struct migration_req req;
5356 unsigned long flags; 5356 unsigned long flags;
5357 struct rq *rq; 5357 struct rq *rq;
5358 int ret = 0; 5358 int ret = 0;
5359 5359
5360 rq = task_rq_lock(p, &flags); 5360 rq = task_rq_lock(p, &flags);
5361 5361
5362 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5362 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5363 ret = -EINVAL; 5363 ret = -EINVAL;
5364 goto out; 5364 goto out;
5365 } 5365 }
5366 5366
5367 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 5367 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5368 !cpumask_equal(&p->cpus_allowed, new_mask))) { 5368 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5369 ret = -EINVAL; 5369 ret = -EINVAL;
5370 goto out; 5370 goto out;
5371 } 5371 }
5372 5372
5373 if (p->sched_class->set_cpus_allowed) 5373 if (p->sched_class->set_cpus_allowed)
5374 p->sched_class->set_cpus_allowed(p, new_mask); 5374 p->sched_class->set_cpus_allowed(p, new_mask);
5375 else { 5375 else {
5376 cpumask_copy(&p->cpus_allowed, new_mask); 5376 cpumask_copy(&p->cpus_allowed, new_mask);
5377 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 5377 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5378 } 5378 }
5379 5379
5380 /* Can the task run on the task's current CPU? If so, we're done */ 5380 /* Can the task run on the task's current CPU? If so, we're done */
5381 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5381 if (cpumask_test_cpu(task_cpu(p), new_mask))
5382 goto out; 5382 goto out;
5383 5383
5384 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { 5384 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
5385 /* Need help from migration thread: drop lock and wait. */ 5385 /* Need help from migration thread: drop lock and wait. */
5386 struct task_struct *mt = rq->migration_thread; 5386 struct task_struct *mt = rq->migration_thread;
5387 5387
5388 get_task_struct(mt); 5388 get_task_struct(mt);
5389 task_rq_unlock(rq, &flags); 5389 task_rq_unlock(rq, &flags);
5390 wake_up_process(rq->migration_thread); 5390 wake_up_process(mt);
5391 put_task_struct(mt); 5391 put_task_struct(mt);
5392 wait_for_completion(&req.done); 5392 wait_for_completion(&req.done);
5393 tlb_migrate_finish(p->mm); 5393 tlb_migrate_finish(p->mm);
5394 return 0; 5394 return 0;
5395 } 5395 }
5396 out: 5396 out:
5397 task_rq_unlock(rq, &flags); 5397 task_rq_unlock(rq, &flags);
5398 5398
5399 return ret; 5399 return ret;
5400 } 5400 }
5401 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 5401 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
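Illustrative sketch (not part of this change): a typical in-kernel caller pins a previously created kthread to one CPU and checks the return value, since the call fails with -EINVAL if the mask contains no active CPU. The function and task names are hypothetical.

static int example_pin_worker(struct task_struct *worker, int cpu)
{
	int ret = set_cpus_allowed_ptr(worker, cpumask_of(cpu));

	if (ret)
		printk(KERN_WARNING "example: could not pin worker to CPU %d: %d\n",
		       cpu, ret);
	return ret;
}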
5402 5402
5403 /* 5403 /*
5404 * Move (not current) task off this cpu, onto dest cpu. We're doing 5404 * Move (not current) task off this cpu, onto dest cpu. We're doing
5405 * this because either it can't run here any more (set_cpus_allowed() moved 5405 * this because either it can't run here any more (set_cpus_allowed() moved
5406 * it away from this CPU, or the CPU is going down), or because we're 5406 * it away from this CPU, or the CPU is going down), or because we're
5407 * attempting to rebalance this task on exec (sched_exec). 5407 * attempting to rebalance this task on exec (sched_exec).
5408 * 5408 *
5409 * So we race with normal scheduler movements, but that's OK, as long 5409 * So we race with normal scheduler movements, but that's OK, as long
5410 * as the task is no longer on this CPU. 5410 * as the task is no longer on this CPU.
5411 * 5411 *
5412 * Returns non-zero if task was successfully migrated. 5412 * Returns non-zero if task was successfully migrated.
5413 */ 5413 */
5414 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 5414 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5415 { 5415 {
5416 struct rq *rq_dest, *rq_src; 5416 struct rq *rq_dest, *rq_src;
5417 int ret = 0; 5417 int ret = 0;
5418 5418
5419 if (unlikely(!cpu_active(dest_cpu))) 5419 if (unlikely(!cpu_active(dest_cpu)))
5420 return ret; 5420 return ret;
5421 5421
5422 rq_src = cpu_rq(src_cpu); 5422 rq_src = cpu_rq(src_cpu);
5423 rq_dest = cpu_rq(dest_cpu); 5423 rq_dest = cpu_rq(dest_cpu);
5424 5424
5425 double_rq_lock(rq_src, rq_dest); 5425 double_rq_lock(rq_src, rq_dest);
5426 /* Already moved. */ 5426 /* Already moved. */
5427 if (task_cpu(p) != src_cpu) 5427 if (task_cpu(p) != src_cpu)
5428 goto done; 5428 goto done;
5429 /* Affinity changed (again). */ 5429 /* Affinity changed (again). */
5430 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 5430 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
5431 goto fail; 5431 goto fail;
5432 5432
5433 /* 5433 /*
5434 * If we're not on a rq, the next wake-up will ensure we're 5434 * If we're not on a rq, the next wake-up will ensure we're
5435 * placed properly. 5435 * placed properly.
5436 */ 5436 */
5437 if (p->se.on_rq) { 5437 if (p->se.on_rq) {
5438 deactivate_task(rq_src, p, 0); 5438 deactivate_task(rq_src, p, 0);
5439 set_task_cpu(p, dest_cpu); 5439 set_task_cpu(p, dest_cpu);
5440 activate_task(rq_dest, p, 0); 5440 activate_task(rq_dest, p, 0);
5441 check_preempt_curr(rq_dest, p, 0); 5441 check_preempt_curr(rq_dest, p, 0);
5442 } 5442 }
5443 done: 5443 done:
5444 ret = 1; 5444 ret = 1;
5445 fail: 5445 fail:
5446 double_rq_unlock(rq_src, rq_dest); 5446 double_rq_unlock(rq_src, rq_dest);
5447 return ret; 5447 return ret;
5448 } 5448 }
5449 5449
5450 #define RCU_MIGRATION_IDLE 0 5450 #define RCU_MIGRATION_IDLE 0
5451 #define RCU_MIGRATION_NEED_QS 1 5451 #define RCU_MIGRATION_NEED_QS 1
5452 #define RCU_MIGRATION_GOT_QS 2 5452 #define RCU_MIGRATION_GOT_QS 2
5453 #define RCU_MIGRATION_MUST_SYNC 3 5453 #define RCU_MIGRATION_MUST_SYNC 3
5454 5454
5455 /* 5455 /*
5456 * migration_thread - this is a highprio system thread that performs 5456 * migration_thread - this is a highprio system thread that performs
5457 * thread migration by bumping thread off CPU then 'pushing' onto 5457 * thread migration by bumping thread off CPU then 'pushing' onto
5458 * another runqueue. 5458 * another runqueue.
5459 */ 5459 */
5460 static int migration_thread(void *data) 5460 static int migration_thread(void *data)
5461 { 5461 {
5462 int badcpu; 5462 int badcpu;
5463 int cpu = (long)data; 5463 int cpu = (long)data;
5464 struct rq *rq; 5464 struct rq *rq;
5465 5465
5466 rq = cpu_rq(cpu); 5466 rq = cpu_rq(cpu);
5467 BUG_ON(rq->migration_thread != current); 5467 BUG_ON(rq->migration_thread != current);
5468 5468
5469 set_current_state(TASK_INTERRUPTIBLE); 5469 set_current_state(TASK_INTERRUPTIBLE);
5470 while (!kthread_should_stop()) { 5470 while (!kthread_should_stop()) {
5471 struct migration_req *req; 5471 struct migration_req *req;
5472 struct list_head *head; 5472 struct list_head *head;
5473 5473
5474 raw_spin_lock_irq(&rq->lock); 5474 raw_spin_lock_irq(&rq->lock);
5475 5475
5476 if (cpu_is_offline(cpu)) { 5476 if (cpu_is_offline(cpu)) {
5477 raw_spin_unlock_irq(&rq->lock); 5477 raw_spin_unlock_irq(&rq->lock);
5478 break; 5478 break;
5479 } 5479 }
5480 5480
5481 if (rq->active_balance) { 5481 if (rq->active_balance) {
5482 active_load_balance(rq, cpu); 5482 active_load_balance(rq, cpu);
5483 rq->active_balance = 0; 5483 rq->active_balance = 0;
5484 } 5484 }
5485 5485
5486 head = &rq->migration_queue; 5486 head = &rq->migration_queue;
5487 5487
5488 if (list_empty(head)) { 5488 if (list_empty(head)) {
5489 raw_spin_unlock_irq(&rq->lock); 5489 raw_spin_unlock_irq(&rq->lock);
5490 schedule(); 5490 schedule();
5491 set_current_state(TASK_INTERRUPTIBLE); 5491 set_current_state(TASK_INTERRUPTIBLE);
5492 continue; 5492 continue;
5493 } 5493 }
5494 req = list_entry(head->next, struct migration_req, list); 5494 req = list_entry(head->next, struct migration_req, list);
5495 list_del_init(head->next); 5495 list_del_init(head->next);
5496 5496
5497 if (req->task != NULL) { 5497 if (req->task != NULL) {
5498 raw_spin_unlock(&rq->lock); 5498 raw_spin_unlock(&rq->lock);
5499 __migrate_task(req->task, cpu, req->dest_cpu); 5499 __migrate_task(req->task, cpu, req->dest_cpu);
5500 } else if (likely(cpu == (badcpu = smp_processor_id()))) { 5500 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
5501 req->dest_cpu = RCU_MIGRATION_GOT_QS; 5501 req->dest_cpu = RCU_MIGRATION_GOT_QS;
5502 raw_spin_unlock(&rq->lock); 5502 raw_spin_unlock(&rq->lock);
5503 } else { 5503 } else {
5504 req->dest_cpu = RCU_MIGRATION_MUST_SYNC; 5504 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
5505 raw_spin_unlock(&rq->lock); 5505 raw_spin_unlock(&rq->lock);
5506 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); 5506 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
5507 } 5507 }
5508 local_irq_enable(); 5508 local_irq_enable();
5509 5509
5510 complete(&req->done); 5510 complete(&req->done);
5511 } 5511 }
5512 __set_current_state(TASK_RUNNING); 5512 __set_current_state(TASK_RUNNING);
5513 5513
5514 return 0; 5514 return 0;
5515 } 5515 }
5516 5516
5517 #ifdef CONFIG_HOTPLUG_CPU 5517 #ifdef CONFIG_HOTPLUG_CPU
5518 5518
5519 static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) 5519 static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
5520 { 5520 {
5521 int ret; 5521 int ret;
5522 5522
5523 local_irq_disable(); 5523 local_irq_disable();
5524 ret = __migrate_task(p, src_cpu, dest_cpu); 5524 ret = __migrate_task(p, src_cpu, dest_cpu);
5525 local_irq_enable(); 5525 local_irq_enable();
5526 return ret; 5526 return ret;
5527 } 5527 }
5528 5528
5529 /* 5529 /*
5530 * Figure out where a task on a dead CPU should go; use force if necessary. 5530 * Figure out where a task on a dead CPU should go; use force if necessary.
5531 */ 5531 */
5532 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5532 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5533 { 5533 {
5534 int dest_cpu; 5534 int dest_cpu;
5535 5535
5536 again: 5536 again:
5537 dest_cpu = select_fallback_rq(dead_cpu, p); 5537 dest_cpu = select_fallback_rq(dead_cpu, p);
5538 5538
5539 /* It can have affinity changed while we were choosing. */ 5539 /* It can have affinity changed while we were choosing. */
5540 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 5540 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
5541 goto again; 5541 goto again;
5542 } 5542 }
5543 5543
5544 /* 5544 /*
5545 * While a dead CPU has no uninterruptible tasks queued at this point, 5545 * While a dead CPU has no uninterruptible tasks queued at this point,
5546 * it might still have a nonzero ->nr_uninterruptible counter, because 5546 * it might still have a nonzero ->nr_uninterruptible counter, because
5547 * for performance reasons the counter is not strictly tracking tasks to 5547 * for performance reasons the counter is not strictly tracking tasks to
5548 * their home CPUs. So we just add the counter to another CPU's counter, 5548 * their home CPUs. So we just add the counter to another CPU's counter,
5549 * to keep the global sum constant after CPU-down: 5549 * to keep the global sum constant after CPU-down:
5550 */ 5550 */
5551 static void migrate_nr_uninterruptible(struct rq *rq_src) 5551 static void migrate_nr_uninterruptible(struct rq *rq_src)
5552 { 5552 {
5553 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 5553 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5554 unsigned long flags; 5554 unsigned long flags;
5555 5555
5556 local_irq_save(flags); 5556 local_irq_save(flags);
5557 double_rq_lock(rq_src, rq_dest); 5557 double_rq_lock(rq_src, rq_dest);
5558 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5558 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5559 rq_src->nr_uninterruptible = 0; 5559 rq_src->nr_uninterruptible = 0;
5560 double_rq_unlock(rq_src, rq_dest); 5560 double_rq_unlock(rq_src, rq_dest);
5561 local_irq_restore(flags); 5561 local_irq_restore(flags);
5562 } 5562 }
5563 5563
5564 /* Run through task list and migrate tasks from the dead cpu. */ 5564 /* Run through task list and migrate tasks from the dead cpu. */
5565 static void migrate_live_tasks(int src_cpu) 5565 static void migrate_live_tasks(int src_cpu)
5566 { 5566 {
5567 struct task_struct *p, *t; 5567 struct task_struct *p, *t;
5568 5568
5569 read_lock(&tasklist_lock); 5569 read_lock(&tasklist_lock);
5570 5570
5571 do_each_thread(t, p) { 5571 do_each_thread(t, p) {
5572 if (p == current) 5572 if (p == current)
5573 continue; 5573 continue;
5574 5574
5575 if (task_cpu(p) == src_cpu) 5575 if (task_cpu(p) == src_cpu)
5576 move_task_off_dead_cpu(src_cpu, p); 5576 move_task_off_dead_cpu(src_cpu, p);
5577 } while_each_thread(t, p); 5577 } while_each_thread(t, p);
5578 5578
5579 read_unlock(&tasklist_lock); 5579 read_unlock(&tasklist_lock);
5580 } 5580 }
5581 5581
5582 /* 5582 /*
5583 * Schedules idle task to be the next runnable task on current CPU. 5583 * Schedules idle task to be the next runnable task on current CPU.
5584 * It does so by boosting its priority to highest possible. 5584 * It does so by boosting its priority to highest possible.
5585 * Used by CPU offline code. 5585 * Used by CPU offline code.
5586 */ 5586 */
5587 void sched_idle_next(void) 5587 void sched_idle_next(void)
5588 { 5588 {
5589 int this_cpu = smp_processor_id(); 5589 int this_cpu = smp_processor_id();
5590 struct rq *rq = cpu_rq(this_cpu); 5590 struct rq *rq = cpu_rq(this_cpu);
5591 struct task_struct *p = rq->idle; 5591 struct task_struct *p = rq->idle;
5592 unsigned long flags; 5592 unsigned long flags;
5593 5593
5594 /* cpu has to be offline */ 5594 /* cpu has to be offline */
5595 BUG_ON(cpu_online(this_cpu)); 5595 BUG_ON(cpu_online(this_cpu));
5596 5596
5597 /* 5597 /*
5598 * Strictly not necessary since the rest of the CPUs are stopped by now 5598 * Strictly not necessary since the rest of the CPUs are stopped by now
5599 * and interrupts are disabled on the current cpu. 5599 * and interrupts are disabled on the current cpu.
5600 */ 5600 */
5601 raw_spin_lock_irqsave(&rq->lock, flags); 5601 raw_spin_lock_irqsave(&rq->lock, flags);
5602 5602
5603 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5603 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5604 5604
5605 update_rq_clock(rq); 5605 update_rq_clock(rq);
5606 activate_task(rq, p, 0); 5606 activate_task(rq, p, 0);
5607 5607
5608 raw_spin_unlock_irqrestore(&rq->lock, flags); 5608 raw_spin_unlock_irqrestore(&rq->lock, flags);
5609 } 5609 }
5610 5610
5611 /* 5611 /*
5612 * Ensures that the idle task is using init_mm right before its cpu goes 5612 * Ensures that the idle task is using init_mm right before its cpu goes
5613 * offline. 5613 * offline.
5614 */ 5614 */
5615 void idle_task_exit(void) 5615 void idle_task_exit(void)
5616 { 5616 {
5617 struct mm_struct *mm = current->active_mm; 5617 struct mm_struct *mm = current->active_mm;
5618 5618
5619 BUG_ON(cpu_online(smp_processor_id())); 5619 BUG_ON(cpu_online(smp_processor_id()));
5620 5620
5621 if (mm != &init_mm) 5621 if (mm != &init_mm)
5622 switch_mm(mm, &init_mm, current); 5622 switch_mm(mm, &init_mm, current);
5623 mmdrop(mm); 5623 mmdrop(mm);
5624 } 5624 }
5625 5625
5626 /* called under rq->lock with disabled interrupts */ 5626 /* called under rq->lock with disabled interrupts */
5627 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) 5627 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5628 { 5628 {
5629 struct rq *rq = cpu_rq(dead_cpu); 5629 struct rq *rq = cpu_rq(dead_cpu);
5630 5630
5631 /* Must be exiting, otherwise would be on tasklist. */ 5631 /* Must be exiting, otherwise would be on tasklist. */
5632 BUG_ON(!p->exit_state); 5632 BUG_ON(!p->exit_state);
5633 5633
5634 /* Cannot have done final schedule yet: would have vanished. */ 5634 /* Cannot have done final schedule yet: would have vanished. */
5635 BUG_ON(p->state == TASK_DEAD); 5635 BUG_ON(p->state == TASK_DEAD);
5636 5636
5637 get_task_struct(p); 5637 get_task_struct(p);
5638 5638
5639 /* 5639 /*
5640 * Drop lock around migration; if someone else moves it, 5640 * Drop lock around migration; if someone else moves it,
5641 * that's OK. No task can be added to this CPU, so iteration is 5641 * that's OK. No task can be added to this CPU, so iteration is
5642 * fine. 5642 * fine.
5643 */ 5643 */
5644 raw_spin_unlock_irq(&rq->lock); 5644 raw_spin_unlock_irq(&rq->lock);
5645 move_task_off_dead_cpu(dead_cpu, p); 5645 move_task_off_dead_cpu(dead_cpu, p);
5646 raw_spin_lock_irq(&rq->lock); 5646 raw_spin_lock_irq(&rq->lock);
5647 5647
5648 put_task_struct(p); 5648 put_task_struct(p);
5649 } 5649 }
5650 5650
5651 /* release_task() removes task from tasklist, so we won't find dead tasks. */ 5651 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5652 static void migrate_dead_tasks(unsigned int dead_cpu) 5652 static void migrate_dead_tasks(unsigned int dead_cpu)
5653 { 5653 {
5654 struct rq *rq = cpu_rq(dead_cpu); 5654 struct rq *rq = cpu_rq(dead_cpu);
5655 struct task_struct *next; 5655 struct task_struct *next;
5656 5656
5657 for ( ; ; ) { 5657 for ( ; ; ) {
5658 if (!rq->nr_running) 5658 if (!rq->nr_running)
5659 break; 5659 break;
5660 update_rq_clock(rq); 5660 update_rq_clock(rq);
5661 next = pick_next_task(rq); 5661 next = pick_next_task(rq);
5662 if (!next) 5662 if (!next)
5663 break; 5663 break;
5664 next->sched_class->put_prev_task(rq, next); 5664 next->sched_class->put_prev_task(rq, next);
5665 migrate_dead(dead_cpu, next); 5665 migrate_dead(dead_cpu, next);
5666 5666
5667 } 5667 }
5668 } 5668 }
5669 5669
5670 /* 5670 /*
5671 * Remove the tasks which were accounted by rq from calc_load_tasks. 5671 * Remove the tasks which were accounted by rq from calc_load_tasks.
5672 */ 5672 */
5673 static void calc_global_load_remove(struct rq *rq) 5673 static void calc_global_load_remove(struct rq *rq)
5674 { 5674 {
5675 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 5675 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5676 rq->calc_load_active = 0; 5676 rq->calc_load_active = 0;
5677 } 5677 }
5678 #endif /* CONFIG_HOTPLUG_CPU */ 5678 #endif /* CONFIG_HOTPLUG_CPU */
5679 5679
5680 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5680 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5681 5681
5682 static struct ctl_table sd_ctl_dir[] = { 5682 static struct ctl_table sd_ctl_dir[] = {
5683 { 5683 {
5684 .procname = "sched_domain", 5684 .procname = "sched_domain",
5685 .mode = 0555, 5685 .mode = 0555,
5686 }, 5686 },
5687 {} 5687 {}
5688 }; 5688 };
5689 5689
5690 static struct ctl_table sd_ctl_root[] = { 5690 static struct ctl_table sd_ctl_root[] = {
5691 { 5691 {
5692 .procname = "kernel", 5692 .procname = "kernel",
5693 .mode = 0555, 5693 .mode = 0555,
5694 .child = sd_ctl_dir, 5694 .child = sd_ctl_dir,
5695 }, 5695 },
5696 {} 5696 {}
5697 }; 5697 };
5698 5698
5699 static struct ctl_table *sd_alloc_ctl_entry(int n) 5699 static struct ctl_table *sd_alloc_ctl_entry(int n)
5700 { 5700 {
5701 struct ctl_table *entry = 5701 struct ctl_table *entry =
5702 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 5702 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5703 5703
5704 return entry; 5704 return entry;
5705 } 5705 }
5706 5706
5707 static void sd_free_ctl_entry(struct ctl_table **tablep) 5707 static void sd_free_ctl_entry(struct ctl_table **tablep)
5708 { 5708 {
5709 struct ctl_table *entry; 5709 struct ctl_table *entry;
5710 5710
5711 /* 5711 /*
5712 * In the intermediate directories, both the child directory and 5712 * In the intermediate directories, both the child directory and
5713 * procname are dynamically allocated and could fail but the mode 5713 * procname are dynamically allocated and could fail but the mode
5714 * will always be set. In the lowest directory the names are 5714 * will always be set. In the lowest directory the names are
5715 * static strings and all have proc handlers. 5715 * static strings and all have proc handlers.
5716 */ 5716 */
5717 for (entry = *tablep; entry->mode; entry++) { 5717 for (entry = *tablep; entry->mode; entry++) {
5718 if (entry->child) 5718 if (entry->child)
5719 sd_free_ctl_entry(&entry->child); 5719 sd_free_ctl_entry(&entry->child);
5720 if (entry->proc_handler == NULL) 5720 if (entry->proc_handler == NULL)
5721 kfree(entry->procname); 5721 kfree(entry->procname);
5722 } 5722 }
5723 5723
5724 kfree(*tablep); 5724 kfree(*tablep);
5725 *tablep = NULL; 5725 *tablep = NULL;
5726 } 5726 }
5727 5727
5728 static void 5728 static void
5729 set_table_entry(struct ctl_table *entry, 5729 set_table_entry(struct ctl_table *entry,
5730 const char *procname, void *data, int maxlen, 5730 const char *procname, void *data, int maxlen,
5731 mode_t mode, proc_handler *proc_handler) 5731 mode_t mode, proc_handler *proc_handler)
5732 { 5732 {
5733 entry->procname = procname; 5733 entry->procname = procname;
5734 entry->data = data; 5734 entry->data = data;
5735 entry->maxlen = maxlen; 5735 entry->maxlen = maxlen;
5736 entry->mode = mode; 5736 entry->mode = mode;
5737 entry->proc_handler = proc_handler; 5737 entry->proc_handler = proc_handler;
5738 } 5738 }
5739 5739
5740 static struct ctl_table * 5740 static struct ctl_table *
5741 sd_alloc_ctl_domain_table(struct sched_domain *sd) 5741 sd_alloc_ctl_domain_table(struct sched_domain *sd)
5742 { 5742 {
5743 struct ctl_table *table = sd_alloc_ctl_entry(13); 5743 struct ctl_table *table = sd_alloc_ctl_entry(13);
5744 5744
5745 if (table == NULL) 5745 if (table == NULL)
5746 return NULL; 5746 return NULL;
5747 5747
5748 set_table_entry(&table[0], "min_interval", &sd->min_interval, 5748 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5749 sizeof(long), 0644, proc_doulongvec_minmax); 5749 sizeof(long), 0644, proc_doulongvec_minmax);
5750 set_table_entry(&table[1], "max_interval", &sd->max_interval, 5750 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5751 sizeof(long), 0644, proc_doulongvec_minmax); 5751 sizeof(long), 0644, proc_doulongvec_minmax);
5752 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 5752 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5753 sizeof(int), 0644, proc_dointvec_minmax); 5753 sizeof(int), 0644, proc_dointvec_minmax);
5754 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 5754 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5755 sizeof(int), 0644, proc_dointvec_minmax); 5755 sizeof(int), 0644, proc_dointvec_minmax);
5756 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 5756 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5757 sizeof(int), 0644, proc_dointvec_minmax); 5757 sizeof(int), 0644, proc_dointvec_minmax);
5758 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 5758 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5759 sizeof(int), 0644, proc_dointvec_minmax); 5759 sizeof(int), 0644, proc_dointvec_minmax);
5760 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 5760 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5761 sizeof(int), 0644, proc_dointvec_minmax); 5761 sizeof(int), 0644, proc_dointvec_minmax);
5762 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 5762 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5763 sizeof(int), 0644, proc_dointvec_minmax); 5763 sizeof(int), 0644, proc_dointvec_minmax);
5764 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 5764 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5765 sizeof(int), 0644, proc_dointvec_minmax); 5765 sizeof(int), 0644, proc_dointvec_minmax);
5766 set_table_entry(&table[9], "cache_nice_tries", 5766 set_table_entry(&table[9], "cache_nice_tries",
5767 &sd->cache_nice_tries, 5767 &sd->cache_nice_tries,
5768 sizeof(int), 0644, proc_dointvec_minmax); 5768 sizeof(int), 0644, proc_dointvec_minmax);
5769 set_table_entry(&table[10], "flags", &sd->flags, 5769 set_table_entry(&table[10], "flags", &sd->flags,
5770 sizeof(int), 0644, proc_dointvec_minmax); 5770 sizeof(int), 0644, proc_dointvec_minmax);
5771 set_table_entry(&table[11], "name", sd->name, 5771 set_table_entry(&table[11], "name", sd->name,
5772 CORENAME_MAX_SIZE, 0444, proc_dostring); 5772 CORENAME_MAX_SIZE, 0444, proc_dostring);
5773 /* &table[12] is terminator */ 5773 /* &table[12] is terminator */
5774 5774
5775 return table; 5775 return table;
5776 } 5776 }
5777 5777
5778 static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 5778 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5779 { 5779 {
5780 struct ctl_table *entry, *table; 5780 struct ctl_table *entry, *table;
5781 struct sched_domain *sd; 5781 struct sched_domain *sd;
5782 int domain_num = 0, i; 5782 int domain_num = 0, i;
5783 char buf[32]; 5783 char buf[32];
5784 5784
5785 for_each_domain(cpu, sd) 5785 for_each_domain(cpu, sd)
5786 domain_num++; 5786 domain_num++;
5787 entry = table = sd_alloc_ctl_entry(domain_num + 1); 5787 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5788 if (table == NULL) 5788 if (table == NULL)
5789 return NULL; 5789 return NULL;
5790 5790
5791 i = 0; 5791 i = 0;
5792 for_each_domain(cpu, sd) { 5792 for_each_domain(cpu, sd) {
5793 snprintf(buf, 32, "domain%d", i); 5793 snprintf(buf, 32, "domain%d", i);
5794 entry->procname = kstrdup(buf, GFP_KERNEL); 5794 entry->procname = kstrdup(buf, GFP_KERNEL);
5795 entry->mode = 0555; 5795 entry->mode = 0555;
5796 entry->child = sd_alloc_ctl_domain_table(sd); 5796 entry->child = sd_alloc_ctl_domain_table(sd);
5797 entry++; 5797 entry++;
5798 i++; 5798 i++;
5799 } 5799 }
5800 return table; 5800 return table;
5801 } 5801 }
5802 5802
5803 static struct ctl_table_header *sd_sysctl_header; 5803 static struct ctl_table_header *sd_sysctl_header;
5804 static void register_sched_domain_sysctl(void) 5804 static void register_sched_domain_sysctl(void)
5805 { 5805 {
5806 int i, cpu_num = num_possible_cpus(); 5806 int i, cpu_num = num_possible_cpus();
5807 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 5807 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5808 char buf[32]; 5808 char buf[32];
5809 5809
5810 WARN_ON(sd_ctl_dir[0].child); 5810 WARN_ON(sd_ctl_dir[0].child);
5811 sd_ctl_dir[0].child = entry; 5811 sd_ctl_dir[0].child = entry;
5812 5812
5813 if (entry == NULL) 5813 if (entry == NULL)
5814 return; 5814 return;
5815 5815
5816 for_each_possible_cpu(i) { 5816 for_each_possible_cpu(i) {
5817 snprintf(buf, 32, "cpu%d", i); 5817 snprintf(buf, 32, "cpu%d", i);
5818 entry->procname = kstrdup(buf, GFP_KERNEL); 5818 entry->procname = kstrdup(buf, GFP_KERNEL);
5819 entry->mode = 0555; 5819 entry->mode = 0555;
5820 entry->child = sd_alloc_ctl_cpu_table(i); 5820 entry->child = sd_alloc_ctl_cpu_table(i);
5821 entry++; 5821 entry++;
5822 } 5822 }
5823 5823
5824 WARN_ON(sd_sysctl_header); 5824 WARN_ON(sd_sysctl_header);
5825 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 5825 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5826 } 5826 }
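For reference, the tables assembled above surface under procfs roughly as follows (abridged; one directory per CPU and per domain level):

  /proc/sys/kernel/sched_domain/cpu0/domain0/min_interval
  /proc/sys/kernel/sched_domain/cpu0/domain0/max_interval
  /proc/sys/kernel/sched_domain/cpu0/domain0/flags
  /proc/sys/kernel/sched_domain/cpu1/domain0/...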
5827 5827
5828 /* may be called multiple times per register */ 5828 /* may be called multiple times per register */
5829 static void unregister_sched_domain_sysctl(void) 5829 static void unregister_sched_domain_sysctl(void)
5830 { 5830 {
5831 if (sd_sysctl_header) 5831 if (sd_sysctl_header)
5832 unregister_sysctl_table(sd_sysctl_header); 5832 unregister_sysctl_table(sd_sysctl_header);
5833 sd_sysctl_header = NULL; 5833 sd_sysctl_header = NULL;
5834 if (sd_ctl_dir[0].child) 5834 if (sd_ctl_dir[0].child)
5835 sd_free_ctl_entry(&sd_ctl_dir[0].child); 5835 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5836 } 5836 }
5837 #else 5837 #else
5838 static void register_sched_domain_sysctl(void) 5838 static void register_sched_domain_sysctl(void)
5839 { 5839 {
5840 } 5840 }
5841 static void unregister_sched_domain_sysctl(void) 5841 static void unregister_sched_domain_sysctl(void)
5842 { 5842 {
5843 } 5843 }
5844 #endif 5844 #endif
5845 5845
5846 static void set_rq_online(struct rq *rq) 5846 static void set_rq_online(struct rq *rq)
5847 { 5847 {
5848 if (!rq->online) { 5848 if (!rq->online) {
5849 const struct sched_class *class; 5849 const struct sched_class *class;
5850 5850
5851 cpumask_set_cpu(rq->cpu, rq->rd->online); 5851 cpumask_set_cpu(rq->cpu, rq->rd->online);
5852 rq->online = 1; 5852 rq->online = 1;
5853 5853
5854 for_each_class(class) { 5854 for_each_class(class) {
5855 if (class->rq_online) 5855 if (class->rq_online)
5856 class->rq_online(rq); 5856 class->rq_online(rq);
5857 } 5857 }
5858 } 5858 }
5859 } 5859 }
5860 5860
5861 static void set_rq_offline(struct rq *rq) 5861 static void set_rq_offline(struct rq *rq)
5862 { 5862 {
5863 if (rq->online) { 5863 if (rq->online) {
5864 const struct sched_class *class; 5864 const struct sched_class *class;
5865 5865
5866 for_each_class(class) { 5866 for_each_class(class) {
5867 if (class->rq_offline) 5867 if (class->rq_offline)
5868 class->rq_offline(rq); 5868 class->rq_offline(rq);
5869 } 5869 }
5870 5870
5871 cpumask_clear_cpu(rq->cpu, rq->rd->online); 5871 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5872 rq->online = 0; 5872 rq->online = 0;
5873 } 5873 }
5874 } 5874 }
5875 5875
5876 /* 5876 /*
5877 * migration_call - callback that gets triggered when a CPU is added. 5877 * migration_call - callback that gets triggered when a CPU is added.
5878 * Here we can start up the necessary migration thread for the new CPU. 5878 * Here we can start up the necessary migration thread for the new CPU.
5879 */ 5879 */
5880 static int __cpuinit 5880 static int __cpuinit
5881 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5881 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5882 { 5882 {
5883 struct task_struct *p; 5883 struct task_struct *p;
5884 int cpu = (long)hcpu; 5884 int cpu = (long)hcpu;
5885 unsigned long flags; 5885 unsigned long flags;
5886 struct rq *rq; 5886 struct rq *rq;
5887 5887
5888 switch (action) { 5888 switch (action) {
5889 5889
5890 case CPU_UP_PREPARE: 5890 case CPU_UP_PREPARE:
5891 case CPU_UP_PREPARE_FROZEN: 5891 case CPU_UP_PREPARE_FROZEN:
5892 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); 5892 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5893 if (IS_ERR(p)) 5893 if (IS_ERR(p))
5894 return NOTIFY_BAD; 5894 return NOTIFY_BAD;
5895 kthread_bind(p, cpu); 5895 kthread_bind(p, cpu);
5896 /* Must be high prio: stop_machine expects to yield to it. */ 5896 /* Must be high prio: stop_machine expects to yield to it. */
5897 rq = task_rq_lock(p, &flags); 5897 rq = task_rq_lock(p, &flags);
5898 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5898 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5899 task_rq_unlock(rq, &flags); 5899 task_rq_unlock(rq, &flags);
5900 get_task_struct(p); 5900 get_task_struct(p);
5901 cpu_rq(cpu)->migration_thread = p; 5901 cpu_rq(cpu)->migration_thread = p;
5902 rq->calc_load_update = calc_load_update; 5902 rq->calc_load_update = calc_load_update;
5903 break; 5903 break;
5904 5904
5905 case CPU_ONLINE: 5905 case CPU_ONLINE:
5906 case CPU_ONLINE_FROZEN: 5906 case CPU_ONLINE_FROZEN:
5907 /* Strictly unnecessary, as first user will wake it. */ 5907 /* Strictly unnecessary, as first user will wake it. */
5908 wake_up_process(cpu_rq(cpu)->migration_thread); 5908 wake_up_process(cpu_rq(cpu)->migration_thread);
5909 5909
5910 /* Update our root-domain */ 5910 /* Update our root-domain */
5911 rq = cpu_rq(cpu); 5911 rq = cpu_rq(cpu);
5912 raw_spin_lock_irqsave(&rq->lock, flags); 5912 raw_spin_lock_irqsave(&rq->lock, flags);
5913 if (rq->rd) { 5913 if (rq->rd) {
5914 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5914 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5915 5915
5916 set_rq_online(rq); 5916 set_rq_online(rq);
5917 } 5917 }
5918 raw_spin_unlock_irqrestore(&rq->lock, flags); 5918 raw_spin_unlock_irqrestore(&rq->lock, flags);
5919 break; 5919 break;
5920 5920
5921 #ifdef CONFIG_HOTPLUG_CPU 5921 #ifdef CONFIG_HOTPLUG_CPU
5922 case CPU_UP_CANCELED: 5922 case CPU_UP_CANCELED:
5923 case CPU_UP_CANCELED_FROZEN: 5923 case CPU_UP_CANCELED_FROZEN:
5924 if (!cpu_rq(cpu)->migration_thread) 5924 if (!cpu_rq(cpu)->migration_thread)
5925 break; 5925 break;
5926 /* Unbind it from offline cpu so it can run. Fall thru. */ 5926 /* Unbind it from offline cpu so it can run. Fall thru. */
5927 kthread_bind(cpu_rq(cpu)->migration_thread, 5927 kthread_bind(cpu_rq(cpu)->migration_thread,
5928 cpumask_any(cpu_online_mask)); 5928 cpumask_any(cpu_online_mask));
5929 kthread_stop(cpu_rq(cpu)->migration_thread); 5929 kthread_stop(cpu_rq(cpu)->migration_thread);
5930 put_task_struct(cpu_rq(cpu)->migration_thread); 5930 put_task_struct(cpu_rq(cpu)->migration_thread);
5931 cpu_rq(cpu)->migration_thread = NULL; 5931 cpu_rq(cpu)->migration_thread = NULL;
5932 break; 5932 break;
5933 5933
5934 case CPU_DEAD: 5934 case CPU_DEAD:
5935 case CPU_DEAD_FROZEN: 5935 case CPU_DEAD_FROZEN:
5936 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ 5936 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
5937 migrate_live_tasks(cpu); 5937 migrate_live_tasks(cpu);
5938 rq = cpu_rq(cpu); 5938 rq = cpu_rq(cpu);
5939 kthread_stop(rq->migration_thread); 5939 kthread_stop(rq->migration_thread);
5940 put_task_struct(rq->migration_thread); 5940 put_task_struct(rq->migration_thread);
5941 rq->migration_thread = NULL; 5941 rq->migration_thread = NULL;
5942 /* Idle task back to normal (off runqueue, low prio) */ 5942 /* Idle task back to normal (off runqueue, low prio) */
5943 raw_spin_lock_irq(&rq->lock); 5943 raw_spin_lock_irq(&rq->lock);
5944 update_rq_clock(rq); 5944 update_rq_clock(rq);
5945 deactivate_task(rq, rq->idle, 0); 5945 deactivate_task(rq, rq->idle, 0);
5946 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 5946 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5947 rq->idle->sched_class = &idle_sched_class; 5947 rq->idle->sched_class = &idle_sched_class;
5948 migrate_dead_tasks(cpu); 5948 migrate_dead_tasks(cpu);
5949 raw_spin_unlock_irq(&rq->lock); 5949 raw_spin_unlock_irq(&rq->lock);
5950 cpuset_unlock(); 5950 cpuset_unlock();
5951 migrate_nr_uninterruptible(rq); 5951 migrate_nr_uninterruptible(rq);
5952 BUG_ON(rq->nr_running != 0); 5952 BUG_ON(rq->nr_running != 0);
5953 calc_global_load_remove(rq); 5953 calc_global_load_remove(rq);
5954 /* 5954 /*
5955 * No need to migrate the tasks: it was best-effort if 5955 * No need to migrate the tasks: it was best-effort if
5956 * they didn't take sched_hotcpu_mutex. Just wake up 5956 * they didn't take sched_hotcpu_mutex. Just wake up
5957 * the requestors. 5957 * the requestors.
5958 */ 5958 */
5959 raw_spin_lock_irq(&rq->lock); 5959 raw_spin_lock_irq(&rq->lock);
5960 while (!list_empty(&rq->migration_queue)) { 5960 while (!list_empty(&rq->migration_queue)) {
5961 struct migration_req *req; 5961 struct migration_req *req;
5962 5962
5963 req = list_entry(rq->migration_queue.next, 5963 req = list_entry(rq->migration_queue.next,
5964 struct migration_req, list); 5964 struct migration_req, list);
5965 list_del_init(&req->list); 5965 list_del_init(&req->list);
5966 raw_spin_unlock_irq(&rq->lock); 5966 raw_spin_unlock_irq(&rq->lock);
5967 complete(&req->done); 5967 complete(&req->done);
5968 raw_spin_lock_irq(&rq->lock); 5968 raw_spin_lock_irq(&rq->lock);
5969 } 5969 }
5970 raw_spin_unlock_irq(&rq->lock); 5970 raw_spin_unlock_irq(&rq->lock);
5971 break; 5971 break;
5972 5972
5973 case CPU_DYING: 5973 case CPU_DYING:
5974 case CPU_DYING_FROZEN: 5974 case CPU_DYING_FROZEN:
5975 /* Update our root-domain */ 5975 /* Update our root-domain */
5976 rq = cpu_rq(cpu); 5976 rq = cpu_rq(cpu);
5977 raw_spin_lock_irqsave(&rq->lock, flags); 5977 raw_spin_lock_irqsave(&rq->lock, flags);
5978 if (rq->rd) { 5978 if (rq->rd) {
5979 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5979 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5980 set_rq_offline(rq); 5980 set_rq_offline(rq);
5981 } 5981 }
5982 raw_spin_unlock_irqrestore(&rq->lock, flags); 5982 raw_spin_unlock_irqrestore(&rq->lock, flags);
5983 break; 5983 break;
5984 #endif 5984 #endif
5985 } 5985 }
5986 return NOTIFY_OK; 5986 return NOTIFY_OK;
5987 } 5987 }
5988 5988
5989 /* 5989 /*
5990 * Register at high priority so that task migration (migrate_all_tasks) 5990 * Register at high priority so that task migration (migrate_all_tasks)
5991 * happens before everything else. This has to be lower priority than 5991 * happens before everything else. This has to be lower priority than
5992 * the notifier in the perf_event subsystem, though. 5992 * the notifier in the perf_event subsystem, though.
5993 */ 5993 */
5994 static struct notifier_block __cpuinitdata migration_notifier = { 5994 static struct notifier_block __cpuinitdata migration_notifier = {
5995 .notifier_call = migration_call, 5995 .notifier_call = migration_call,
5996 .priority = 10 5996 .priority = 10
5997 }; 5997 };
5998 5998
5999 static int __init migration_init(void) 5999 static int __init migration_init(void)
6000 { 6000 {
6001 void *cpu = (void *)(long)smp_processor_id(); 6001 void *cpu = (void *)(long)smp_processor_id();
6002 int err; 6002 int err;
6003 6003
6004 /* Start one for the boot CPU: */ 6004 /* Start one for the boot CPU: */
6005 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 6005 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
6006 BUG_ON(err == NOTIFY_BAD); 6006 BUG_ON(err == NOTIFY_BAD);
6007 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6007 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6008 register_cpu_notifier(&migration_notifier); 6008 register_cpu_notifier(&migration_notifier);
6009 6009
6010 return 0; 6010 return 0;
6011 } 6011 }
6012 early_initcall(migration_init); 6012 early_initcall(migration_init);
6013 #endif 6013 #endif
6014 6014
6015 #ifdef CONFIG_SMP 6015 #ifdef CONFIG_SMP
6016 6016
6017 #ifdef CONFIG_SCHED_DEBUG 6017 #ifdef CONFIG_SCHED_DEBUG
6018 6018
6019 static __read_mostly int sched_domain_debug_enabled; 6019 static __read_mostly int sched_domain_debug_enabled;
6020 6020
6021 static int __init sched_domain_debug_setup(char *str) 6021 static int __init sched_domain_debug_setup(char *str)
6022 { 6022 {
6023 sched_domain_debug_enabled = 1; 6023 sched_domain_debug_enabled = 1;
6024 6024
6025 return 0; 6025 return 0;
6026 } 6026 }
6027 early_param("sched_debug", sched_domain_debug_setup); 6027 early_param("sched_debug", sched_domain_debug_setup);
6028 6028
6029 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6029 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6030 struct cpumask *groupmask) 6030 struct cpumask *groupmask)
6031 { 6031 {
6032 struct sched_group *group = sd->groups; 6032 struct sched_group *group = sd->groups;
6033 char str[256]; 6033 char str[256];
6034 6034
6035 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); 6035 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
6036 cpumask_clear(groupmask); 6036 cpumask_clear(groupmask);
6037 6037
6038 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 6038 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6039 6039
6040 if (!(sd->flags & SD_LOAD_BALANCE)) { 6040 if (!(sd->flags & SD_LOAD_BALANCE)) {
6041 printk("does not load-balance\n"); 6041 printk("does not load-balance\n");
6042 if (sd->parent) 6042 if (sd->parent)
6043 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 6043 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
6044 " has parent"); 6044 " has parent");
6045 return -1; 6045 return -1;
6046 } 6046 }
6047 6047
6048 printk(KERN_CONT "span %s level %s\n", str, sd->name); 6048 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6049 6049
6050 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 6050 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
6051 printk(KERN_ERR "ERROR: domain->span does not contain " 6051 printk(KERN_ERR "ERROR: domain->span does not contain "
6052 "CPU%d\n", cpu); 6052 "CPU%d\n", cpu);
6053 } 6053 }
6054 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 6054 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
6055 printk(KERN_ERR "ERROR: domain->groups does not contain" 6055 printk(KERN_ERR "ERROR: domain->groups does not contain"
6056 " CPU%d\n", cpu); 6056 " CPU%d\n", cpu);
6057 } 6057 }
6058 6058
6059 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 6059 printk(KERN_DEBUG "%*s groups:", level + 1, "");
6060 do { 6060 do {
6061 if (!group) { 6061 if (!group) {
6062 printk("\n"); 6062 printk("\n");
6063 printk(KERN_ERR "ERROR: group is NULL\n"); 6063 printk(KERN_ERR "ERROR: group is NULL\n");
6064 break; 6064 break;
6065 } 6065 }
6066 6066
6067 if (!group->cpu_power) { 6067 if (!group->cpu_power) {
6068 printk(KERN_CONT "\n"); 6068 printk(KERN_CONT "\n");
6069 printk(KERN_ERR "ERROR: domain->cpu_power not " 6069 printk(KERN_ERR "ERROR: domain->cpu_power not "
6070 "set\n"); 6070 "set\n");
6071 break; 6071 break;
6072 } 6072 }
6073 6073
6074 if (!cpumask_weight(sched_group_cpus(group))) { 6074 if (!cpumask_weight(sched_group_cpus(group))) {
6075 printk(KERN_CONT "\n"); 6075 printk(KERN_CONT "\n");
6076 printk(KERN_ERR "ERROR: empty group\n"); 6076 printk(KERN_ERR "ERROR: empty group\n");
6077 break; 6077 break;
6078 } 6078 }
6079 6079
6080 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 6080 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
6081 printk(KERN_CONT "\n"); 6081 printk(KERN_CONT "\n");
6082 printk(KERN_ERR "ERROR: repeated CPUs\n"); 6082 printk(KERN_ERR "ERROR: repeated CPUs\n");
6083 break; 6083 break;
6084 } 6084 }
6085 6085
6086 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 6086 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
6087 6087
6088 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6088 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6089 6089
6090 printk(KERN_CONT " %s", str); 6090 printk(KERN_CONT " %s", str);
6091 if (group->cpu_power != SCHED_LOAD_SCALE) { 6091 if (group->cpu_power != SCHED_LOAD_SCALE) {
6092 printk(KERN_CONT " (cpu_power = %d)", 6092 printk(KERN_CONT " (cpu_power = %d)",
6093 group->cpu_power); 6093 group->cpu_power);
6094 } 6094 }
6095 6095
6096 group = group->next; 6096 group = group->next;
6097 } while (group != sd->groups); 6097 } while (group != sd->groups);
6098 printk(KERN_CONT "\n"); 6098 printk(KERN_CONT "\n");
6099 6099
6100 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 6100 if (!cpumask_equal(sched_domain_span(sd), groupmask))
6101 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6101 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6102 6102
6103 if (sd->parent && 6103 if (sd->parent &&
6104 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 6104 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
6105 printk(KERN_ERR "ERROR: parent span is not a superset " 6105 printk(KERN_ERR "ERROR: parent span is not a superset "
6106 "of domain->span\n"); 6106 "of domain->span\n");
6107 return 0; 6107 return 0;
6108 } 6108 }
6109 6109
6110 static void sched_domain_debug(struct sched_domain *sd, int cpu) 6110 static void sched_domain_debug(struct sched_domain *sd, int cpu)
6111 { 6111 {
6112 cpumask_var_t groupmask; 6112 cpumask_var_t groupmask;
6113 int level = 0; 6113 int level = 0;
6114 6114
6115 if (!sched_domain_debug_enabled) 6115 if (!sched_domain_debug_enabled)
6116 return; 6116 return;
6117 6117
6118 if (!sd) { 6118 if (!sd) {
6119 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 6119 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
6120 return; 6120 return;
6121 } 6121 }
6122 6122
6123 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6123 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6124 6124
6125 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { 6125 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6126 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); 6126 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6127 return; 6127 return;
6128 } 6128 }
6129 6129
6130 for (;;) { 6130 for (;;) {
6131 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6131 if (sched_domain_debug_one(sd, cpu, level, groupmask))
6132 break; 6132 break;
6133 level++; 6133 level++;
6134 sd = sd->parent; 6134 sd = sd->parent;
6135 if (!sd) 6135 if (!sd)
6136 break; 6136 break;
6137 } 6137 }
6138 free_cpumask_var(groupmask); 6138 free_cpumask_var(groupmask);
6139 } 6139 }
6140 #else /* !CONFIG_SCHED_DEBUG */ 6140 #else /* !CONFIG_SCHED_DEBUG */
6141 # define sched_domain_debug(sd, cpu) do { } while (0) 6141 # define sched_domain_debug(sd, cpu) do { } while (0)
6142 #endif /* CONFIG_SCHED_DEBUG */ 6142 #endif /* CONFIG_SCHED_DEBUG */
6143 6143
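/*
 * A domain is "degenerate" when it cannot influence scheduling decisions:
 * it spans a single CPU, or none of its balancing flags has more than one
 * group to work with.  Degenerate domains are spliced out of the hierarchy
 * by cpu_attach_domain() below.
 */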
6144 static int sd_degenerate(struct sched_domain *sd) 6144 static int sd_degenerate(struct sched_domain *sd)
6145 { 6145 {
6146 if (cpumask_weight(sched_domain_span(sd)) == 1) 6146 if (cpumask_weight(sched_domain_span(sd)) == 1)
6147 return 1; 6147 return 1;
6148 6148
6149 /* Following flags need at least 2 groups */ 6149 /* Following flags need at least 2 groups */
6150 if (sd->flags & (SD_LOAD_BALANCE | 6150 if (sd->flags & (SD_LOAD_BALANCE |
6151 SD_BALANCE_NEWIDLE | 6151 SD_BALANCE_NEWIDLE |
6152 SD_BALANCE_FORK | 6152 SD_BALANCE_FORK |
6153 SD_BALANCE_EXEC | 6153 SD_BALANCE_EXEC |
6154 SD_SHARE_CPUPOWER | 6154 SD_SHARE_CPUPOWER |
6155 SD_SHARE_PKG_RESOURCES)) { 6155 SD_SHARE_PKG_RESOURCES)) {
6156 if (sd->groups != sd->groups->next) 6156 if (sd->groups != sd->groups->next)
6157 return 0; 6157 return 0;
6158 } 6158 }
6159 6159
6160 /* Following flags don't use groups */ 6160 /* Following flags don't use groups */
6161 if (sd->flags & (SD_WAKE_AFFINE)) 6161 if (sd->flags & (SD_WAKE_AFFINE))
6162 return 0; 6162 return 0;
6163 6163
6164 return 1; 6164 return 1;
6165 } 6165 }
6166 6166
6167 static int 6167 static int
6168 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 6168 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6169 { 6169 {
6170 unsigned long cflags = sd->flags, pflags = parent->flags; 6170 unsigned long cflags = sd->flags, pflags = parent->flags;
6171 6171
6172 if (sd_degenerate(parent)) 6172 if (sd_degenerate(parent))
6173 return 1; 6173 return 1;
6174 6174
6175 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 6175 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
6176 return 0; 6176 return 0;
6177 6177
6178 /* Flags needing groups don't count if only 1 group in parent */ 6178 /* Flags needing groups don't count if only 1 group in parent */
6179 if (parent->groups == parent->groups->next) { 6179 if (parent->groups == parent->groups->next) {
6180 pflags &= ~(SD_LOAD_BALANCE | 6180 pflags &= ~(SD_LOAD_BALANCE |
6181 SD_BALANCE_NEWIDLE | 6181 SD_BALANCE_NEWIDLE |
6182 SD_BALANCE_FORK | 6182 SD_BALANCE_FORK |
6183 SD_BALANCE_EXEC | 6183 SD_BALANCE_EXEC |
6184 SD_SHARE_CPUPOWER | 6184 SD_SHARE_CPUPOWER |
6185 SD_SHARE_PKG_RESOURCES); 6185 SD_SHARE_PKG_RESOURCES);
6186 if (nr_node_ids == 1) 6186 if (nr_node_ids == 1)
6187 pflags &= ~SD_SERIALIZE; 6187 pflags &= ~SD_SERIALIZE;
6188 } 6188 }
6189 if (~cflags & pflags) 6189 if (~cflags & pflags)
6190 return 0; 6190 return 0;
6191 6191
6192 return 1; 6192 return 1;
6193 } 6193 }
6194 6194
6195 static void free_rootdomain(struct root_domain *rd) 6195 static void free_rootdomain(struct root_domain *rd)
6196 { 6196 {
6197 synchronize_sched(); 6197 synchronize_sched();
6198 6198
6199 cpupri_cleanup(&rd->cpupri); 6199 cpupri_cleanup(&rd->cpupri);
6200 6200
6201 free_cpumask_var(rd->rto_mask); 6201 free_cpumask_var(rd->rto_mask);
6202 free_cpumask_var(rd->online); 6202 free_cpumask_var(rd->online);
6203 free_cpumask_var(rd->span); 6203 free_cpumask_var(rd->span);
6204 kfree(rd); 6204 kfree(rd);
6205 } 6205 }
6206 6206
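/*
 * Attach runqueue @rq to root domain @rd: drop the reference on the old
 * root domain (freeing it once unused), add the CPU to the new domain's
 * span and mark the runqueue online if the CPU is active.  Runs under
 * rq->lock with interrupts disabled.
 */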
6207 static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6207 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6208 { 6208 {
6209 struct root_domain *old_rd = NULL; 6209 struct root_domain *old_rd = NULL;
6210 unsigned long flags; 6210 unsigned long flags;
6211 6211
6212 raw_spin_lock_irqsave(&rq->lock, flags); 6212 raw_spin_lock_irqsave(&rq->lock, flags);
6213 6213
6214 if (rq->rd) { 6214 if (rq->rd) {
6215 old_rd = rq->rd; 6215 old_rd = rq->rd;
6216 6216
6217 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 6217 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6218 set_rq_offline(rq); 6218 set_rq_offline(rq);
6219 6219
6220 cpumask_clear_cpu(rq->cpu, old_rd->span); 6220 cpumask_clear_cpu(rq->cpu, old_rd->span);
6221 6221
6222 /* 6222 /*
6223 * If we don't want to free the old_rd yet then 6223 * If we don't want to free the old_rd yet then
6224 * set old_rd to NULL to skip the freeing later 6224 * set old_rd to NULL to skip the freeing later
6225 * in this function: 6225 * in this function:
6226 */ 6226 */
6227 if (!atomic_dec_and_test(&old_rd->refcount)) 6227 if (!atomic_dec_and_test(&old_rd->refcount))
6228 old_rd = NULL; 6228 old_rd = NULL;
6229 } 6229 }
6230 6230
6231 atomic_inc(&rd->refcount); 6231 atomic_inc(&rd->refcount);
6232 rq->rd = rd; 6232 rq->rd = rd;
6233 6233
6234 cpumask_set_cpu(rq->cpu, rd->span); 6234 cpumask_set_cpu(rq->cpu, rd->span);
6235 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 6235 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
6236 set_rq_online(rq); 6236 set_rq_online(rq);
6237 6237
6238 raw_spin_unlock_irqrestore(&rq->lock, flags); 6238 raw_spin_unlock_irqrestore(&rq->lock, flags);
6239 6239
6240 if (old_rd) 6240 if (old_rd)
6241 free_rootdomain(old_rd); 6241 free_rootdomain(old_rd);
6242 } 6242 }
6243 6243
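/*
 * Set up a root_domain: allocate its span/online/rto_mask cpumasks and the
 * cpupri structure, unwinding the allocations on failure.  @bootmem selects
 * GFP_NOWAIT for the early-boot (default root domain) case.
 */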
6244 static int init_rootdomain(struct root_domain *rd, bool bootmem) 6244 static int init_rootdomain(struct root_domain *rd, bool bootmem)
6245 { 6245 {
6246 gfp_t gfp = GFP_KERNEL; 6246 gfp_t gfp = GFP_KERNEL;
6247 6247
6248 memset(rd, 0, sizeof(*rd)); 6248 memset(rd, 0, sizeof(*rd));
6249 6249
6250 if (bootmem) 6250 if (bootmem)
6251 gfp = GFP_NOWAIT; 6251 gfp = GFP_NOWAIT;
6252 6252
6253 if (!alloc_cpumask_var(&rd->span, gfp)) 6253 if (!alloc_cpumask_var(&rd->span, gfp))
6254 goto out; 6254 goto out;
6255 if (!alloc_cpumask_var(&rd->online, gfp)) 6255 if (!alloc_cpumask_var(&rd->online, gfp))
6256 goto free_span; 6256 goto free_span;
6257 if (!alloc_cpumask_var(&rd->rto_mask, gfp)) 6257 if (!alloc_cpumask_var(&rd->rto_mask, gfp))
6258 goto free_online; 6258 goto free_online;
6259 6259
6260 if (cpupri_init(&rd->cpupri, bootmem) != 0) 6260 if (cpupri_init(&rd->cpupri, bootmem) != 0)
6261 goto free_rto_mask; 6261 goto free_rto_mask;
6262 return 0; 6262 return 0;
6263 6263
6264 free_rto_mask: 6264 free_rto_mask:
6265 free_cpumask_var(rd->rto_mask); 6265 free_cpumask_var(rd->rto_mask);
6266 free_online: 6266 free_online:
6267 free_cpumask_var(rd->online); 6267 free_cpumask_var(rd->online);
6268 free_span: 6268 free_span:
6269 free_cpumask_var(rd->span); 6269 free_cpumask_var(rd->span);
6270 out: 6270 out:
6271 return -ENOMEM; 6271 return -ENOMEM;
6272 } 6272 }
6273 6273
6274 static void init_defrootdomain(void) 6274 static void init_defrootdomain(void)
6275 { 6275 {
6276 init_rootdomain(&def_root_domain, true); 6276 init_rootdomain(&def_root_domain, true);
6277 6277
6278 atomic_set(&def_root_domain.refcount, 1); 6278 atomic_set(&def_root_domain.refcount, 1);
6279 } 6279 }
6280 6280
6281 static struct root_domain *alloc_rootdomain(void) 6281 static struct root_domain *alloc_rootdomain(void)
6282 { 6282 {
6283 struct root_domain *rd; 6283 struct root_domain *rd;
6284 6284
6285 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 6285 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6286 if (!rd) 6286 if (!rd)
6287 return NULL; 6287 return NULL;
6288 6288
6289 if (init_rootdomain(rd, false) != 0) { 6289 if (init_rootdomain(rd, false) != 0) {
6290 kfree(rd); 6290 kfree(rd);
6291 return NULL; 6291 return NULL;
6292 } 6292 }
6293 6293
6294 return rd; 6294 return rd;
6295 } 6295 }
6296 6296
6297 /* 6297 /*
6298 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6298 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6299 * hold the hotplug lock. 6299 * hold the hotplug lock.
6300 */ 6300 */
6301 static void 6301 static void
6302 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 6302 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6303 { 6303 {
6304 struct rq *rq = cpu_rq(cpu); 6304 struct rq *rq = cpu_rq(cpu);
6305 struct sched_domain *tmp; 6305 struct sched_domain *tmp;
6306 6306
6307 /* Remove the sched domains which do not contribute to scheduling. */ 6307 /* Remove the sched domains which do not contribute to scheduling. */
6308 for (tmp = sd; tmp; ) { 6308 for (tmp = sd; tmp; ) {
6309 struct sched_domain *parent = tmp->parent; 6309 struct sched_domain *parent = tmp->parent;
6310 if (!parent) 6310 if (!parent)
6311 break; 6311 break;
6312 6312
6313 if (sd_parent_degenerate(tmp, parent)) { 6313 if (sd_parent_degenerate(tmp, parent)) {
6314 tmp->parent = parent->parent; 6314 tmp->parent = parent->parent;
6315 if (parent->parent) 6315 if (parent->parent)
6316 parent->parent->child = tmp; 6316 parent->parent->child = tmp;
6317 } else 6317 } else
6318 tmp = tmp->parent; 6318 tmp = tmp->parent;
6319 } 6319 }
6320 6320
6321 if (sd && sd_degenerate(sd)) { 6321 if (sd && sd_degenerate(sd)) {
6322 sd = sd->parent; 6322 sd = sd->parent;
6323 if (sd) 6323 if (sd)
6324 sd->child = NULL; 6324 sd->child = NULL;
6325 } 6325 }
6326 6326
6327 sched_domain_debug(sd, cpu); 6327 sched_domain_debug(sd, cpu);
6328 6328
6329 rq_attach_root(rq, rd); 6329 rq_attach_root(rq, rd);
6330 rcu_assign_pointer(rq->sd, sd); 6330 rcu_assign_pointer(rq->sd, sd);
6331 } 6331 }
6332 6332
6333 /* cpus with isolated domains */ 6333 /* cpus with isolated domains */
6334 static cpumask_var_t cpu_isolated_map; 6334 static cpumask_var_t cpu_isolated_map;
6335 6335
6336 /* Setup the mask of cpus configured for isolated domains */ 6336 /* Setup the mask of cpus configured for isolated domains */
6337 static int __init isolated_cpu_setup(char *str) 6337 static int __init isolated_cpu_setup(char *str)
6338 { 6338 {
6339 alloc_bootmem_cpumask_var(&cpu_isolated_map); 6339 alloc_bootmem_cpumask_var(&cpu_isolated_map);
6340 cpulist_parse(str, cpu_isolated_map); 6340 cpulist_parse(str, cpu_isolated_map);
6341 return 1; 6341 return 1;
6342 } 6342 }
6343 6343
6344 __setup("isolcpus=", isolated_cpu_setup); 6344 __setup("isolcpus=", isolated_cpu_setup);
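/*
 * For example, booting with "isolcpus=2,3" (a hypothetical command line)
 * parses CPUs 2 and 3 into cpu_isolated_map; those CPUs are then left out
 * of the regular load-balancing domains and only run tasks explicitly
 * bound to them.
 */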
6345 6345
6346 /* 6346 /*
6347 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 6347 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6348 * to a function which identifies what group (along with sched group) a CPU 6348 * to a function which identifies what group (along with sched group) a CPU
6349 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids 6349 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
6350 * (because we keep track of groups covered with a struct cpumask). 6350 * (because we keep track of groups covered with a struct cpumask).
6351 * 6351 *
6352 * init_sched_build_groups will build a circular linked list of the groups 6352 * init_sched_build_groups will build a circular linked list of the groups
6353 * covered by the given span, and will set each group's ->cpumask correctly, 6353 * covered by the given span, and will set each group's ->cpumask correctly,
6354 * and ->cpu_power to 0. 6354 * and ->cpu_power to 0.
6355 */ 6355 */
6356 static void 6356 static void
6357 init_sched_build_groups(const struct cpumask *span, 6357 init_sched_build_groups(const struct cpumask *span,
6358 const struct cpumask *cpu_map, 6358 const struct cpumask *cpu_map,
6359 int (*group_fn)(int cpu, const struct cpumask *cpu_map, 6359 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6360 struct sched_group **sg, 6360 struct sched_group **sg,
6361 struct cpumask *tmpmask), 6361 struct cpumask *tmpmask),
6362 struct cpumask *covered, struct cpumask *tmpmask) 6362 struct cpumask *covered, struct cpumask *tmpmask)
6363 { 6363 {
6364 struct sched_group *first = NULL, *last = NULL; 6364 struct sched_group *first = NULL, *last = NULL;
6365 int i; 6365 int i;
6366 6366
6367 cpumask_clear(covered); 6367 cpumask_clear(covered);
6368 6368
6369 for_each_cpu(i, span) { 6369 for_each_cpu(i, span) {
6370 struct sched_group *sg; 6370 struct sched_group *sg;
6371 int group = group_fn(i, cpu_map, &sg, tmpmask); 6371 int group = group_fn(i, cpu_map, &sg, tmpmask);
6372 int j; 6372 int j;
6373 6373
6374 if (cpumask_test_cpu(i, covered)) 6374 if (cpumask_test_cpu(i, covered))
6375 continue; 6375 continue;
6376 6376
6377 cpumask_clear(sched_group_cpus(sg)); 6377 cpumask_clear(sched_group_cpus(sg));
6378 sg->cpu_power = 0; 6378 sg->cpu_power = 0;
6379 6379
6380 for_each_cpu(j, span) { 6380 for_each_cpu(j, span) {
6381 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 6381 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6382 continue; 6382 continue;
6383 6383
6384 cpumask_set_cpu(j, covered); 6384 cpumask_set_cpu(j, covered);
6385 cpumask_set_cpu(j, sched_group_cpus(sg)); 6385 cpumask_set_cpu(j, sched_group_cpus(sg));
6386 } 6386 }
6387 if (!first) 6387 if (!first)
6388 first = sg; 6388 first = sg;
6389 if (last) 6389 if (last)
6390 last->next = sg; 6390 last->next = sg;
6391 last = sg; 6391 last = sg;
6392 } 6392 }
6393 last->next = first; 6393 last->next = first;
6394 } 6394 }
6395 6395
6396 #define SD_NODES_PER_DOMAIN 16 6396 #define SD_NODES_PER_DOMAIN 16
6397 6397
6398 #ifdef CONFIG_NUMA 6398 #ifdef CONFIG_NUMA
6399 6399
6400 /** 6400 /**
6401 * find_next_best_node - find the next node to include in a sched_domain 6401 * find_next_best_node - find the next node to include in a sched_domain
6402 * @node: node whose sched_domain we're building 6402 * @node: node whose sched_domain we're building
6403 * @used_nodes: nodes already in the sched_domain 6403 * @used_nodes: nodes already in the sched_domain
6404 * 6404 *
6405 * Find the next node to include in a given scheduling domain. Simply 6405 * Find the next node to include in a given scheduling domain. Simply
6406 * finds the closest node not already in the @used_nodes map. 6406 * finds the closest node not already in the @used_nodes map.
6407 * 6407 *
6408 * Should use nodemask_t. 6408 * Should use nodemask_t.
6409 */ 6409 */
6410 static int find_next_best_node(int node, nodemask_t *used_nodes) 6410 static int find_next_best_node(int node, nodemask_t *used_nodes)
6411 { 6411 {
6412 int i, n, val, min_val, best_node = 0; 6412 int i, n, val, min_val, best_node = 0;
6413 6413
6414 min_val = INT_MAX; 6414 min_val = INT_MAX;
6415 6415
6416 for (i = 0; i < nr_node_ids; i++) { 6416 for (i = 0; i < nr_node_ids; i++) {
6417 /* Start at @node */ 6417 /* Start at @node */
6418 n = (node + i) % nr_node_ids; 6418 n = (node + i) % nr_node_ids;
6419 6419
6420 if (!nr_cpus_node(n)) 6420 if (!nr_cpus_node(n))
6421 continue; 6421 continue;
6422 6422
6423 /* Skip already used nodes */ 6423 /* Skip already used nodes */
6424 if (node_isset(n, *used_nodes)) 6424 if (node_isset(n, *used_nodes))
6425 continue; 6425 continue;
6426 6426
6427 /* Simple min distance search */ 6427 /* Simple min distance search */
6428 val = node_distance(node, n); 6428 val = node_distance(node, n);
6429 6429
6430 if (val < min_val) { 6430 if (val < min_val) {
6431 min_val = val; 6431 min_val = val;
6432 best_node = n; 6432 best_node = n;
6433 } 6433 }
6434 } 6434 }
6435 6435
6436 node_set(best_node, *used_nodes); 6436 node_set(best_node, *used_nodes);
6437 return best_node; 6437 return best_node;
6438 } 6438 }
6439 6439
6440 /** 6440 /**
6441 * sched_domain_node_span - get a cpumask for a node's sched_domain 6441 * sched_domain_node_span - get a cpumask for a node's sched_domain
6442 * @node: node whose cpumask we're constructing 6442 * @node: node whose cpumask we're constructing
6443 * @span: resulting cpumask 6443 * @span: resulting cpumask
6444 * 6444 *
6445 * Given a node, construct a good cpumask for its sched_domain to span. It 6445 * Given a node, construct a good cpumask for its sched_domain to span. It
6446 * should be one that prevents unnecessary balancing, but also spreads tasks 6446 * should be one that prevents unnecessary balancing, but also spreads tasks
6447 * out optimally. 6447 * out optimally.
6448 */ 6448 */
6449 static void sched_domain_node_span(int node, struct cpumask *span) 6449 static void sched_domain_node_span(int node, struct cpumask *span)
6450 { 6450 {
6451 nodemask_t used_nodes; 6451 nodemask_t used_nodes;
6452 int i; 6452 int i;
6453 6453
6454 cpumask_clear(span); 6454 cpumask_clear(span);
6455 nodes_clear(used_nodes); 6455 nodes_clear(used_nodes);
6456 6456
6457 cpumask_or(span, span, cpumask_of_node(node)); 6457 cpumask_or(span, span, cpumask_of_node(node));
6458 node_set(node, used_nodes); 6458 node_set(node, used_nodes);
6459 6459
6460 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6460 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6461 int next_node = find_next_best_node(node, &used_nodes); 6461 int next_node = find_next_best_node(node, &used_nodes);
6462 6462
6463 cpumask_or(span, span, cpumask_of_node(next_node)); 6463 cpumask_or(span, span, cpumask_of_node(next_node));
6464 } 6464 }
6465 } 6465 }
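/*
 * Note: the span is capped at SD_NODES_PER_DOMAIN nodes (the node itself
 * plus its closest neighbours); coverage beyond that comes from the
 * ALLNODES domain level set up in __build_numa_sched_domains() below.
 */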
6466 #endif /* CONFIG_NUMA */ 6466 #endif /* CONFIG_NUMA */
6467 6467
6468 int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6468 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6469 6469
6470 /* 6470 /*
6471 * The cpus mask in sched_group and sched_domain hangs off the end. 6471 * The cpus mask in sched_group and sched_domain hangs off the end.
6472 * 6472 *
6473 * ( See the comments in include/linux/sched.h:struct sched_group 6473 * ( See the comments in include/linux/sched.h:struct sched_group
6474 * and struct sched_domain. ) 6474 * and struct sched_domain. )
6475 */ 6475 */
6476 struct static_sched_group { 6476 struct static_sched_group {
6477 struct sched_group sg; 6477 struct sched_group sg;
6478 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); 6478 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6479 }; 6479 };
6480 6480
6481 struct static_sched_domain { 6481 struct static_sched_domain {
6482 struct sched_domain sd; 6482 struct sched_domain sd;
6483 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 6483 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
6484 }; 6484 };
6485 6485
6486 struct s_data { 6486 struct s_data {
6487 #ifdef CONFIG_NUMA 6487 #ifdef CONFIG_NUMA
6488 int sd_allnodes; 6488 int sd_allnodes;
6489 cpumask_var_t domainspan; 6489 cpumask_var_t domainspan;
6490 cpumask_var_t covered; 6490 cpumask_var_t covered;
6491 cpumask_var_t notcovered; 6491 cpumask_var_t notcovered;
6492 #endif 6492 #endif
6493 cpumask_var_t nodemask; 6493 cpumask_var_t nodemask;
6494 cpumask_var_t this_sibling_map; 6494 cpumask_var_t this_sibling_map;
6495 cpumask_var_t this_core_map; 6495 cpumask_var_t this_core_map;
6496 cpumask_var_t send_covered; 6496 cpumask_var_t send_covered;
6497 cpumask_var_t tmpmask; 6497 cpumask_var_t tmpmask;
6498 struct sched_group **sched_group_nodes; 6498 struct sched_group **sched_group_nodes;
6499 struct root_domain *rd; 6499 struct root_domain *rd;
6500 }; 6500 };
6501 6501
6502 enum s_alloc { 6502 enum s_alloc {
6503 sa_sched_groups = 0, 6503 sa_sched_groups = 0,
6504 sa_rootdomain, 6504 sa_rootdomain,
6505 sa_tmpmask, 6505 sa_tmpmask,
6506 sa_send_covered, 6506 sa_send_covered,
6507 sa_this_core_map, 6507 sa_this_core_map,
6508 sa_this_sibling_map, 6508 sa_this_sibling_map,
6509 sa_nodemask, 6509 sa_nodemask,
6510 sa_sched_group_nodes, 6510 sa_sched_group_nodes,
6511 #ifdef CONFIG_NUMA 6511 #ifdef CONFIG_NUMA
6512 sa_notcovered, 6512 sa_notcovered,
6513 sa_covered, 6513 sa_covered,
6514 sa_domainspan, 6514 sa_domainspan,
6515 #endif 6515 #endif
6516 sa_none, 6516 sa_none,
6517 }; 6517 };
6518 6518
6519 /* 6519 /*
6520 * SMT sched-domains: 6520 * SMT sched-domains:
6521 */ 6521 */
6522 #ifdef CONFIG_SCHED_SMT 6522 #ifdef CONFIG_SCHED_SMT
6523 static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); 6523 static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6524 static DEFINE_PER_CPU(struct static_sched_group, sched_groups); 6524 static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6525 6525
6526 static int 6526 static int
6527 cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 6527 cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
6528 struct sched_group **sg, struct cpumask *unused) 6528 struct sched_group **sg, struct cpumask *unused)
6529 { 6529 {
6530 if (sg) 6530 if (sg)
6531 *sg = &per_cpu(sched_groups, cpu).sg; 6531 *sg = &per_cpu(sched_groups, cpu).sg;
6532 return cpu; 6532 return cpu;
6533 } 6533 }
6534 #endif /* CONFIG_SCHED_SMT */ 6534 #endif /* CONFIG_SCHED_SMT */
6535 6535
6536 /* 6536 /*
6537 * multi-core sched-domains: 6537 * multi-core sched-domains:
6538 */ 6538 */
6539 #ifdef CONFIG_SCHED_MC 6539 #ifdef CONFIG_SCHED_MC
6540 static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6540 static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6541 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 6541 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6542 #endif /* CONFIG_SCHED_MC */ 6542 #endif /* CONFIG_SCHED_MC */
6543 6543
6544 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6544 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6545 static int 6545 static int
6546 cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6546 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6547 struct sched_group **sg, struct cpumask *mask) 6547 struct sched_group **sg, struct cpumask *mask)
6548 { 6548 {
6549 int group; 6549 int group;
6550 6550
6551 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 6551 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6552 group = cpumask_first(mask); 6552 group = cpumask_first(mask);
6553 if (sg) 6553 if (sg)
6554 *sg = &per_cpu(sched_group_core, group).sg; 6554 *sg = &per_cpu(sched_group_core, group).sg;
6555 return group; 6555 return group;
6556 } 6556 }
6557 #elif defined(CONFIG_SCHED_MC) 6557 #elif defined(CONFIG_SCHED_MC)
6558 static int 6558 static int
6559 cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6559 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6560 struct sched_group **sg, struct cpumask *unused) 6560 struct sched_group **sg, struct cpumask *unused)
6561 { 6561 {
6562 if (sg) 6562 if (sg)
6563 *sg = &per_cpu(sched_group_core, cpu).sg; 6563 *sg = &per_cpu(sched_group_core, cpu).sg;
6564 return cpu; 6564 return cpu;
6565 } 6565 }
6566 #endif 6566 #endif
6567 6567
6568 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6568 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
6569 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6569 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
6570 6570
6571 static int 6571 static int
6572 cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, 6572 cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6573 struct sched_group **sg, struct cpumask *mask) 6573 struct sched_group **sg, struct cpumask *mask)
6574 { 6574 {
6575 int group; 6575 int group;
6576 #ifdef CONFIG_SCHED_MC 6576 #ifdef CONFIG_SCHED_MC
6577 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 6577 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6578 group = cpumask_first(mask); 6578 group = cpumask_first(mask);
6579 #elif defined(CONFIG_SCHED_SMT) 6579 #elif defined(CONFIG_SCHED_SMT)
6580 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 6580 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6581 group = cpumask_first(mask); 6581 group = cpumask_first(mask);
6582 #else 6582 #else
6583 group = cpu; 6583 group = cpu;
6584 #endif 6584 #endif
6585 if (sg) 6585 if (sg)
6586 *sg = &per_cpu(sched_group_phys, group).sg; 6586 *sg = &per_cpu(sched_group_phys, group).sg;
6587 return group; 6587 return group;
6588 } 6588 }
6589 6589
6590 #ifdef CONFIG_NUMA 6590 #ifdef CONFIG_NUMA
6591 /* 6591 /*
6592 * The init_sched_build_groups can't handle what we want to do with node 6592 * The init_sched_build_groups can't handle what we want to do with node
6593 * groups, so roll our own. Now each node has its own list of groups which 6593 * groups, so roll our own. Now each node has its own list of groups which
6594 * gets dynamically allocated. 6594 * gets dynamically allocated.
6595 */ 6595 */
6596 static DEFINE_PER_CPU(struct static_sched_domain, node_domains); 6596 static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
6597 static struct sched_group ***sched_group_nodes_bycpu; 6597 static struct sched_group ***sched_group_nodes_bycpu;
6598 6598
6599 static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); 6599 static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
6600 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); 6600 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
6601 6601
6602 static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, 6602 static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
6603 struct sched_group **sg, 6603 struct sched_group **sg,
6604 struct cpumask *nodemask) 6604 struct cpumask *nodemask)
6605 { 6605 {
6606 int group; 6606 int group;
6607 6607
6608 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); 6608 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
6609 group = cpumask_first(nodemask); 6609 group = cpumask_first(nodemask);
6610 6610
6611 if (sg) 6611 if (sg)
6612 *sg = &per_cpu(sched_group_allnodes, group).sg; 6612 *sg = &per_cpu(sched_group_allnodes, group).sg;
6613 return group; 6613 return group;
6614 } 6614 }
6615 6615
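/*
 * Walk the circular list of NUMA groups headed by @group_head and, for each
 * group, sum the cpu_power of the physical-level groups it contains,
 * counting each physical package only once.
 */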
6616 static void init_numa_sched_groups_power(struct sched_group *group_head) 6616 static void init_numa_sched_groups_power(struct sched_group *group_head)
6617 { 6617 {
6618 struct sched_group *sg = group_head; 6618 struct sched_group *sg = group_head;
6619 int j; 6619 int j;
6620 6620
6621 if (!sg) 6621 if (!sg)
6622 return; 6622 return;
6623 do { 6623 do {
6624 for_each_cpu(j, sched_group_cpus(sg)) { 6624 for_each_cpu(j, sched_group_cpus(sg)) {
6625 struct sched_domain *sd; 6625 struct sched_domain *sd;
6626 6626
6627 sd = &per_cpu(phys_domains, j).sd; 6627 sd = &per_cpu(phys_domains, j).sd;
6628 if (j != group_first_cpu(sd->groups)) { 6628 if (j != group_first_cpu(sd->groups)) {
6629 /* 6629 /*
6630 * Only add "power" once for each 6630 * Only add "power" once for each
6631 * physical package. 6631 * physical package.
6632 */ 6632 */
6633 continue; 6633 continue;
6634 } 6634 }
6635 6635
6636 sg->cpu_power += sd->groups->cpu_power; 6636 sg->cpu_power += sd->groups->cpu_power;
6637 } 6637 }
6638 sg = sg->next; 6638 sg = sg->next;
6639 } while (sg != group_head); 6639 } while (sg != group_head);
6640 } 6640 }
6641 6641
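/*
 * Build the circular group list for NUMA node @num: the first group covers
 * the node's own CPUs, then one group is added for every other node that
 * falls inside this node's domain span.  Returns -ENOMEM on allocation
 * failure.
 */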
6642 static int build_numa_sched_groups(struct s_data *d, 6642 static int build_numa_sched_groups(struct s_data *d,
6643 const struct cpumask *cpu_map, int num) 6643 const struct cpumask *cpu_map, int num)
6644 { 6644 {
6645 struct sched_domain *sd; 6645 struct sched_domain *sd;
6646 struct sched_group *sg, *prev; 6646 struct sched_group *sg, *prev;
6647 int n, j; 6647 int n, j;
6648 6648
6649 cpumask_clear(d->covered); 6649 cpumask_clear(d->covered);
6650 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); 6650 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
6651 if (cpumask_empty(d->nodemask)) { 6651 if (cpumask_empty(d->nodemask)) {
6652 d->sched_group_nodes[num] = NULL; 6652 d->sched_group_nodes[num] = NULL;
6653 goto out; 6653 goto out;
6654 } 6654 }
6655 6655
6656 sched_domain_node_span(num, d->domainspan); 6656 sched_domain_node_span(num, d->domainspan);
6657 cpumask_and(d->domainspan, d->domainspan, cpu_map); 6657 cpumask_and(d->domainspan, d->domainspan, cpu_map);
6658 6658
6659 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 6659 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
6660 GFP_KERNEL, num); 6660 GFP_KERNEL, num);
6661 if (!sg) { 6661 if (!sg) {
6662 printk(KERN_WARNING "Can not alloc domain group for node %d\n", 6662 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
6663 num); 6663 num);
6664 return -ENOMEM; 6664 return -ENOMEM;
6665 } 6665 }
6666 d->sched_group_nodes[num] = sg; 6666 d->sched_group_nodes[num] = sg;
6667 6667
6668 for_each_cpu(j, d->nodemask) { 6668 for_each_cpu(j, d->nodemask) {
6669 sd = &per_cpu(node_domains, j).sd; 6669 sd = &per_cpu(node_domains, j).sd;
6670 sd->groups = sg; 6670 sd->groups = sg;
6671 } 6671 }
6672 6672
6673 sg->cpu_power = 0; 6673 sg->cpu_power = 0;
6674 cpumask_copy(sched_group_cpus(sg), d->nodemask); 6674 cpumask_copy(sched_group_cpus(sg), d->nodemask);
6675 sg->next = sg; 6675 sg->next = sg;
6676 cpumask_or(d->covered, d->covered, d->nodemask); 6676 cpumask_or(d->covered, d->covered, d->nodemask);
6677 6677
6678 prev = sg; 6678 prev = sg;
6679 for (j = 0; j < nr_node_ids; j++) { 6679 for (j = 0; j < nr_node_ids; j++) {
6680 n = (num + j) % nr_node_ids; 6680 n = (num + j) % nr_node_ids;
6681 cpumask_complement(d->notcovered, d->covered); 6681 cpumask_complement(d->notcovered, d->covered);
6682 cpumask_and(d->tmpmask, d->notcovered, cpu_map); 6682 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
6683 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); 6683 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
6684 if (cpumask_empty(d->tmpmask)) 6684 if (cpumask_empty(d->tmpmask))
6685 break; 6685 break;
6686 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); 6686 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
6687 if (cpumask_empty(d->tmpmask)) 6687 if (cpumask_empty(d->tmpmask))
6688 continue; 6688 continue;
6689 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 6689 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
6690 GFP_KERNEL, num); 6690 GFP_KERNEL, num);
6691 if (!sg) { 6691 if (!sg) {
6692 printk(KERN_WARNING 6692 printk(KERN_WARNING
6693 "Can not alloc domain group for node %d\n", j); 6693 "Can not alloc domain group for node %d\n", j);
6694 return -ENOMEM; 6694 return -ENOMEM;
6695 } 6695 }
6696 sg->cpu_power = 0; 6696 sg->cpu_power = 0;
6697 cpumask_copy(sched_group_cpus(sg), d->tmpmask); 6697 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
6698 sg->next = prev->next; 6698 sg->next = prev->next;
6699 cpumask_or(d->covered, d->covered, d->tmpmask); 6699 cpumask_or(d->covered, d->covered, d->tmpmask);
6700 prev->next = sg; 6700 prev->next = sg;
6701 prev = sg; 6701 prev = sg;
6702 } 6702 }
6703 out: 6703 out:
6704 return 0; 6704 return 0;
6705 } 6705 }
6706 #endif /* CONFIG_NUMA */ 6706 #endif /* CONFIG_NUMA */
6707 6707
6708 #ifdef CONFIG_NUMA 6708 #ifdef CONFIG_NUMA
6709 /* Free memory allocated for various sched_group structures */ 6709 /* Free memory allocated for various sched_group structures */
6710 static void free_sched_groups(const struct cpumask *cpu_map, 6710 static void free_sched_groups(const struct cpumask *cpu_map,
6711 struct cpumask *nodemask) 6711 struct cpumask *nodemask)
6712 { 6712 {
6713 int cpu, i; 6713 int cpu, i;
6714 6714
6715 for_each_cpu(cpu, cpu_map) { 6715 for_each_cpu(cpu, cpu_map) {
6716 struct sched_group **sched_group_nodes 6716 struct sched_group **sched_group_nodes
6717 = sched_group_nodes_bycpu[cpu]; 6717 = sched_group_nodes_bycpu[cpu];
6718 6718
6719 if (!sched_group_nodes) 6719 if (!sched_group_nodes)
6720 continue; 6720 continue;
6721 6721
6722 for (i = 0; i < nr_node_ids; i++) { 6722 for (i = 0; i < nr_node_ids; i++) {
6723 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 6723 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6724 6724
6725 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 6725 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
6726 if (cpumask_empty(nodemask)) 6726 if (cpumask_empty(nodemask))
6727 continue; 6727 continue;
6728 6728
6729 if (sg == NULL) 6729 if (sg == NULL)
6730 continue; 6730 continue;
6731 sg = sg->next; 6731 sg = sg->next;
6732 next_sg: 6732 next_sg:
6733 oldsg = sg; 6733 oldsg = sg;
6734 sg = sg->next; 6734 sg = sg->next;
6735 kfree(oldsg); 6735 kfree(oldsg);
6736 if (oldsg != sched_group_nodes[i]) 6736 if (oldsg != sched_group_nodes[i])
6737 goto next_sg; 6737 goto next_sg;
6738 } 6738 }
6739 kfree(sched_group_nodes); 6739 kfree(sched_group_nodes);
6740 sched_group_nodes_bycpu[cpu] = NULL; 6740 sched_group_nodes_bycpu[cpu] = NULL;
6741 } 6741 }
6742 } 6742 }
6743 #else /* !CONFIG_NUMA */ 6743 #else /* !CONFIG_NUMA */
6744 static void free_sched_groups(const struct cpumask *cpu_map, 6744 static void free_sched_groups(const struct cpumask *cpu_map,
6745 struct cpumask *nodemask) 6745 struct cpumask *nodemask)
6746 { 6746 {
6747 } 6747 }
6748 #endif /* CONFIG_NUMA */ 6748 #endif /* CONFIG_NUMA */
6749 6749
6750 /* 6750 /*
6751 * Initialize sched groups cpu_power. 6751 * Initialize sched groups cpu_power.
6752 * 6752 *
6753 * cpu_power indicates the capacity of a sched group, which is used while 6753 * cpu_power indicates the capacity of a sched group, which is used while
6754 * distributing the load between different sched groups in a sched domain. 6754 * distributing the load between different sched groups in a sched domain.
6755 * Typically cpu_power for all the groups in a sched domain will be the same 6755 * Typically cpu_power for all the groups in a sched domain will be the same
6756 * unless there are asymmetries in the topology. If there are asymmetries, the 6756 * unless there are asymmetries in the topology. If there are asymmetries, the
6757 * group having more cpu_power will pick up more load compared to the group 6757 * group having more cpu_power will pick up more load compared to the group
6758 * having less cpu_power. 6758 * having less cpu_power.
6759 */ 6759 */
6760 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 6760 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6761 { 6761 {
6762 struct sched_domain *child; 6762 struct sched_domain *child;
6763 struct sched_group *group; 6763 struct sched_group *group;
6764 long power; 6764 long power;
6765 int weight; 6765 int weight;
6766 6766
6767 WARN_ON(!sd || !sd->groups); 6767 WARN_ON(!sd || !sd->groups);
6768 6768
6769 if (cpu != group_first_cpu(sd->groups)) 6769 if (cpu != group_first_cpu(sd->groups))
6770 return; 6770 return;
6771 6771
6772 child = sd->child; 6772 child = sd->child;
6773 6773
6774 sd->groups->cpu_power = 0; 6774 sd->groups->cpu_power = 0;
6775 6775
6776 if (!child) { 6776 if (!child) {
6777 power = SCHED_LOAD_SCALE; 6777 power = SCHED_LOAD_SCALE;
6778 weight = cpumask_weight(sched_domain_span(sd)); 6778 weight = cpumask_weight(sched_domain_span(sd));
6779 /* 6779 /*
6780 * SMT siblings share the power of a single core. 6780 * SMT siblings share the power of a single core.
6781 * Usually multiple threads get a better yield out of 6781 * Usually multiple threads get a better yield out of
6782 * that one core than a single thread would have, 6782 * that one core than a single thread would have,
6783 * reflect that in sd->smt_gain. 6783 * reflect that in sd->smt_gain.
6784 */ 6784 */
6785 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 6785 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
6786 power *= sd->smt_gain; 6786 power *= sd->smt_gain;
6787 power /= weight; 6787 power /= weight;
6788 power >>= SCHED_LOAD_SHIFT; 6788 power >>= SCHED_LOAD_SHIFT;
6789 } 6789 }
6790 sd->groups->cpu_power += power; 6790 sd->groups->cpu_power += power;
6791 return; 6791 return;
6792 } 6792 }
6793 6793
6794 /* 6794 /*
6795 * Add cpu_power of each child group to this groups cpu_power. 6795 * Add cpu_power of each child group to this groups cpu_power.
6796 */ 6796 */
6797 group = child->groups; 6797 group = child->groups;
6798 do { 6798 do {
6799 sd->groups->cpu_power += group->cpu_power; 6799 sd->groups->cpu_power += group->cpu_power;
6800 group = group->next; 6800 group = group->next;
6801 } while (group != child->groups); 6801 } while (group != child->groups);
6802 } 6802 }
6803 6803
6804 /* 6804 /*
6805 * Initializers for schedule domains 6805 * Initializers for schedule domains
6806 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 6806 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
6807 */ 6807 */
6808 6808
6809 #ifdef CONFIG_SCHED_DEBUG 6809 #ifdef CONFIG_SCHED_DEBUG
6810 # define SD_INIT_NAME(sd, type) sd->name = #type 6810 # define SD_INIT_NAME(sd, type) sd->name = #type
6811 #else 6811 #else
6812 # define SD_INIT_NAME(sd, type) do { } while (0) 6812 # define SD_INIT_NAME(sd, type) do { } while (0)
6813 #endif 6813 #endif
6814 6814
6815 #define SD_INIT(sd, type) sd_init_##type(sd) 6815 #define SD_INIT(sd, type) sd_init_##type(sd)
6816 6816
6817 #define SD_INIT_FUNC(type) \ 6817 #define SD_INIT_FUNC(type) \
6818 static noinline void sd_init_##type(struct sched_domain *sd) \ 6818 static noinline void sd_init_##type(struct sched_domain *sd) \
6819 { \ 6819 { \
6820 memset(sd, 0, sizeof(*sd)); \ 6820 memset(sd, 0, sizeof(*sd)); \
6821 *sd = SD_##type##_INIT; \ 6821 *sd = SD_##type##_INIT; \
6822 sd->level = SD_LV_##type; \ 6822 sd->level = SD_LV_##type; \
6823 SD_INIT_NAME(sd, type); \ 6823 SD_INIT_NAME(sd, type); \
6824 } 6824 }
6825 6825
6826 SD_INIT_FUNC(CPU) 6826 SD_INIT_FUNC(CPU)
6827 #ifdef CONFIG_NUMA 6827 #ifdef CONFIG_NUMA
6828 SD_INIT_FUNC(ALLNODES) 6828 SD_INIT_FUNC(ALLNODES)
6829 SD_INIT_FUNC(NODE) 6829 SD_INIT_FUNC(NODE)
6830 #endif 6830 #endif
6831 #ifdef CONFIG_SCHED_SMT 6831 #ifdef CONFIG_SCHED_SMT
6832 SD_INIT_FUNC(SIBLING) 6832 SD_INIT_FUNC(SIBLING)
6833 #endif 6833 #endif
6834 #ifdef CONFIG_SCHED_MC 6834 #ifdef CONFIG_SCHED_MC
6835 SD_INIT_FUNC(MC) 6835 SD_INIT_FUNC(MC)
6836 #endif 6836 #endif
6837 6837
6838 static int default_relax_domain_level = -1; 6838 static int default_relax_domain_level = -1;
6839 6839
6840 static int __init setup_relax_domain_level(char *str) 6840 static int __init setup_relax_domain_level(char *str)
6841 { 6841 {
6842 unsigned long val; 6842 unsigned long val;
6843 6843
6844 val = simple_strtoul(str, NULL, 0); 6844 val = simple_strtoul(str, NULL, 0);
6845 if (val < SD_LV_MAX) 6845 if (val < SD_LV_MAX)
6846 default_relax_domain_level = val; 6846 default_relax_domain_level = val;
6847 6847
6848 return 1; 6848 return 1;
6849 } 6849 }
6850 __setup("relax_domain_level=", setup_relax_domain_level); 6850 __setup("relax_domain_level=", setup_relax_domain_level);
6851 6851
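/*
 * Apply the relax_domain_level request (from @attr, or the boot-time default
 * above): domains at or below the requested level get SD_BALANCE_WAKE and
 * SD_BALANCE_NEWIDLE enabled, higher levels get them cleared.
 */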
6852 static void set_domain_attribute(struct sched_domain *sd, 6852 static void set_domain_attribute(struct sched_domain *sd,
6853 struct sched_domain_attr *attr) 6853 struct sched_domain_attr *attr)
6854 { 6854 {
6855 int request; 6855 int request;
6856 6856
6857 if (!attr || attr->relax_domain_level < 0) { 6857 if (!attr || attr->relax_domain_level < 0) {
6858 if (default_relax_domain_level < 0) 6858 if (default_relax_domain_level < 0)
6859 return; 6859 return;
6860 else 6860 else
6861 request = default_relax_domain_level; 6861 request = default_relax_domain_level;
6862 } else 6862 } else
6863 request = attr->relax_domain_level; 6863 request = attr->relax_domain_level;
6864 if (request < sd->level) { 6864 if (request < sd->level) {
6865 /* turn off idle balance on this domain */ 6865 /* turn off idle balance on this domain */
6866 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 6866 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6867 } else { 6867 } else {
6868 /* turn on idle balance on this domain */ 6868 /* turn on idle balance on this domain */
6869 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 6869 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6870 } 6870 }
6871 } 6871 }
6872 6872
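/*
 * Free the allocations tracked in struct s_data.  @what names the last
 * allocation that succeeded; the switch falls through so everything up to
 * and including that point is released in reverse order.
 */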
6873 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 6873 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6874 const struct cpumask *cpu_map) 6874 const struct cpumask *cpu_map)
6875 { 6875 {
6876 switch (what) { 6876 switch (what) {
6877 case sa_sched_groups: 6877 case sa_sched_groups:
6878 free_sched_groups(cpu_map, d->tmpmask); /* fall through */ 6878 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
6879 d->sched_group_nodes = NULL; 6879 d->sched_group_nodes = NULL;
6880 case sa_rootdomain: 6880 case sa_rootdomain:
6881 free_rootdomain(d->rd); /* fall through */ 6881 free_rootdomain(d->rd); /* fall through */
6882 case sa_tmpmask: 6882 case sa_tmpmask:
6883 free_cpumask_var(d->tmpmask); /* fall through */ 6883 free_cpumask_var(d->tmpmask); /* fall through */
6884 case sa_send_covered: 6884 case sa_send_covered:
6885 free_cpumask_var(d->send_covered); /* fall through */ 6885 free_cpumask_var(d->send_covered); /* fall through */
6886 case sa_this_core_map: 6886 case sa_this_core_map:
6887 free_cpumask_var(d->this_core_map); /* fall through */ 6887 free_cpumask_var(d->this_core_map); /* fall through */
6888 case sa_this_sibling_map: 6888 case sa_this_sibling_map:
6889 free_cpumask_var(d->this_sibling_map); /* fall through */ 6889 free_cpumask_var(d->this_sibling_map); /* fall through */
6890 case sa_nodemask: 6890 case sa_nodemask:
6891 free_cpumask_var(d->nodemask); /* fall through */ 6891 free_cpumask_var(d->nodemask); /* fall through */
6892 case sa_sched_group_nodes: 6892 case sa_sched_group_nodes:
6893 #ifdef CONFIG_NUMA 6893 #ifdef CONFIG_NUMA
6894 kfree(d->sched_group_nodes); /* fall through */ 6894 kfree(d->sched_group_nodes); /* fall through */
6895 case sa_notcovered: 6895 case sa_notcovered:
6896 free_cpumask_var(d->notcovered); /* fall through */ 6896 free_cpumask_var(d->notcovered); /* fall through */
6897 case sa_covered: 6897 case sa_covered:
6898 free_cpumask_var(d->covered); /* fall through */ 6898 free_cpumask_var(d->covered); /* fall through */
6899 case sa_domainspan: 6899 case sa_domainspan:
6900 free_cpumask_var(d->domainspan); /* fall through */ 6900 free_cpumask_var(d->domainspan); /* fall through */
6901 #endif 6901 #endif
6902 case sa_none: 6902 case sa_none:
6903 break; 6903 break;
6904 } 6904 }
6905 } 6905 }
6906 6906
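/*
 * Allocate everything __build_sched_domains() needs: the temporary cpumasks,
 * the per-node group array (on NUMA) and the root domain.  Returns the
 * enum s_alloc value describing how far the allocations got, so the caller
 * can hand it back to __free_domain_allocs() on error.
 */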
6907 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 6907 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6908 const struct cpumask *cpu_map) 6908 const struct cpumask *cpu_map)
6909 { 6909 {
6910 #ifdef CONFIG_NUMA 6910 #ifdef CONFIG_NUMA
6911 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 6911 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
6912 return sa_none; 6912 return sa_none;
6913 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 6913 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
6914 return sa_domainspan; 6914 return sa_domainspan;
6915 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 6915 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
6916 return sa_covered; 6916 return sa_covered;
6917 /* Allocate the per-node list of sched groups */ 6917 /* Allocate the per-node list of sched groups */
6918 d->sched_group_nodes = kcalloc(nr_node_ids, 6918 d->sched_group_nodes = kcalloc(nr_node_ids,
6919 sizeof(struct sched_group *), GFP_KERNEL); 6919 sizeof(struct sched_group *), GFP_KERNEL);
6920 if (!d->sched_group_nodes) { 6920 if (!d->sched_group_nodes) {
6921 printk(KERN_WARNING "Can not alloc sched group node list\n"); 6921 printk(KERN_WARNING "Can not alloc sched group node list\n");
6922 return sa_notcovered; 6922 return sa_notcovered;
6923 } 6923 }
6924 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; 6924 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
6925 #endif 6925 #endif
6926 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) 6926 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
6927 return sa_sched_group_nodes; 6927 return sa_sched_group_nodes;
6928 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) 6928 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
6929 return sa_nodemask; 6929 return sa_nodemask;
6930 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) 6930 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
6931 return sa_this_sibling_map; 6931 return sa_this_sibling_map;
6932 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) 6932 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
6933 return sa_this_core_map; 6933 return sa_this_core_map;
6934 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) 6934 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
6935 return sa_send_covered; 6935 return sa_send_covered;
6936 d->rd = alloc_rootdomain(); 6936 d->rd = alloc_rootdomain();
6937 if (!d->rd) { 6937 if (!d->rd) {
6938 printk(KERN_WARNING "Cannot alloc root domain\n"); 6938 printk(KERN_WARNING "Cannot alloc root domain\n");
6939 return sa_tmpmask; 6939 return sa_tmpmask;
6940 } 6940 }
6941 return sa_rootdomain; 6941 return sa_rootdomain;
6942 } 6942 }
6943 6943
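/*
 * Set up the NUMA-level domains for CPU @i: an ALLNODES domain spanning the
 * whole cpu_map when the map is large enough to need it, with a NODE domain
 * (limited to this node's domain span) as its child.  Returns the lowest
 * domain built so far, for the CPU/MC/SMT levels below to hang off.
 */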
6944 static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 6944 static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
6945 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 6945 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
6946 { 6946 {
6947 struct sched_domain *sd = NULL; 6947 struct sched_domain *sd = NULL;
6948 #ifdef CONFIG_NUMA 6948 #ifdef CONFIG_NUMA
6949 struct sched_domain *parent; 6949 struct sched_domain *parent;
6950 6950
6951 d->sd_allnodes = 0; 6951 d->sd_allnodes = 0;
6952 if (cpumask_weight(cpu_map) > 6952 if (cpumask_weight(cpu_map) >
6953 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { 6953 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
6954 sd = &per_cpu(allnodes_domains, i).sd; 6954 sd = &per_cpu(allnodes_domains, i).sd;
6955 SD_INIT(sd, ALLNODES); 6955 SD_INIT(sd, ALLNODES);
6956 set_domain_attribute(sd, attr); 6956 set_domain_attribute(sd, attr);
6957 cpumask_copy(sched_domain_span(sd), cpu_map); 6957 cpumask_copy(sched_domain_span(sd), cpu_map);
6958 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); 6958 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
6959 d->sd_allnodes = 1; 6959 d->sd_allnodes = 1;
6960 } 6960 }
6961 parent = sd; 6961 parent = sd;
6962 6962
6963 sd = &per_cpu(node_domains, i).sd; 6963 sd = &per_cpu(node_domains, i).sd;
6964 SD_INIT(sd, NODE); 6964 SD_INIT(sd, NODE);
6965 set_domain_attribute(sd, attr); 6965 set_domain_attribute(sd, attr);
6966 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); 6966 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
6967 sd->parent = parent; 6967 sd->parent = parent;
6968 if (parent) 6968 if (parent)
6969 parent->child = sd; 6969 parent->child = sd;
6970 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); 6970 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
6971 #endif 6971 #endif
6972 return sd; 6972 return sd;
6973 } 6973 }
6974 6974
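/*
 * Each of the helpers below adds one more level (physical package,
 * multi-core, SMT sibling) underneath the domain built so far and returns
 * the lowest level built; levels that are not configured simply return
 * the parent unchanged.
 */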
6975 static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 6975 static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
6976 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6976 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6977 struct sched_domain *parent, int i) 6977 struct sched_domain *parent, int i)
6978 { 6978 {
6979 struct sched_domain *sd; 6979 struct sched_domain *sd;
6980 sd = &per_cpu(phys_domains, i).sd; 6980 sd = &per_cpu(phys_domains, i).sd;
6981 SD_INIT(sd, CPU); 6981 SD_INIT(sd, CPU);
6982 set_domain_attribute(sd, attr); 6982 set_domain_attribute(sd, attr);
6983 cpumask_copy(sched_domain_span(sd), d->nodemask); 6983 cpumask_copy(sched_domain_span(sd), d->nodemask);
6984 sd->parent = parent; 6984 sd->parent = parent;
6985 if (parent) 6985 if (parent)
6986 parent->child = sd; 6986 parent->child = sd;
6987 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); 6987 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
6988 return sd; 6988 return sd;
6989 } 6989 }
6990 6990
6991 static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 6991 static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
6992 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6992 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6993 struct sched_domain *parent, int i) 6993 struct sched_domain *parent, int i)
6994 { 6994 {
6995 struct sched_domain *sd = parent; 6995 struct sched_domain *sd = parent;
6996 #ifdef CONFIG_SCHED_MC 6996 #ifdef CONFIG_SCHED_MC
6997 sd = &per_cpu(core_domains, i).sd; 6997 sd = &per_cpu(core_domains, i).sd;
6998 SD_INIT(sd, MC); 6998 SD_INIT(sd, MC);
6999 set_domain_attribute(sd, attr); 6999 set_domain_attribute(sd, attr);
7000 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); 7000 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
7001 sd->parent = parent; 7001 sd->parent = parent;
7002 parent->child = sd; 7002 parent->child = sd;
7003 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); 7003 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
7004 #endif 7004 #endif
7005 return sd; 7005 return sd;
7006 } 7006 }
7007 7007
7008 static struct sched_domain *__build_smt_sched_domain(struct s_data *d, 7008 static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
7009 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7009 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7010 struct sched_domain *parent, int i) 7010 struct sched_domain *parent, int i)
7011 { 7011 {
7012 struct sched_domain *sd = parent; 7012 struct sched_domain *sd = parent;
7013 #ifdef CONFIG_SCHED_SMT 7013 #ifdef CONFIG_SCHED_SMT
7014 sd = &per_cpu(cpu_domains, i).sd; 7014 sd = &per_cpu(cpu_domains, i).sd;
7015 SD_INIT(sd, SIBLING); 7015 SD_INIT(sd, SIBLING);
7016 set_domain_attribute(sd, attr); 7016 set_domain_attribute(sd, attr);
7017 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); 7017 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
7018 sd->parent = parent; 7018 sd->parent = parent;
7019 parent->child = sd; 7019 parent->child = sd;
7020 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); 7020 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
7021 #endif 7021 #endif
7022 return sd; 7022 return sd;
7023 } 7023 }
7024 7024
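/*
 * Build the sched groups for one domain level: SMT sibling and MC groups
 * are built once per group (only by the first CPU in the mask), physical
 * groups once per node, and the NUMA allnodes level spans the whole
 * cpu_map.
 */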
7025 static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 7025 static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
7026 const struct cpumask *cpu_map, int cpu) 7026 const struct cpumask *cpu_map, int cpu)
7027 { 7027 {
7028 switch (l) { 7028 switch (l) {
7029 #ifdef CONFIG_SCHED_SMT 7029 #ifdef CONFIG_SCHED_SMT
7030 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 7030 case SD_LV_SIBLING: /* set up CPU (sibling) groups */
7031 cpumask_and(d->this_sibling_map, cpu_map, 7031 cpumask_and(d->this_sibling_map, cpu_map,
7032 topology_thread_cpumask(cpu)); 7032 topology_thread_cpumask(cpu));
7033 if (cpu == cpumask_first(d->this_sibling_map)) 7033 if (cpu == cpumask_first(d->this_sibling_map))
7034 init_sched_build_groups(d->this_sibling_map, cpu_map, 7034 init_sched_build_groups(d->this_sibling_map, cpu_map,
7035 &cpu_to_cpu_group, 7035 &cpu_to_cpu_group,
7036 d->send_covered, d->tmpmask); 7036 d->send_covered, d->tmpmask);
7037 break; 7037 break;
7038 #endif 7038 #endif
7039 #ifdef CONFIG_SCHED_MC 7039 #ifdef CONFIG_SCHED_MC
7040 case SD_LV_MC: /* set up multi-core groups */ 7040 case SD_LV_MC: /* set up multi-core groups */
7041 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); 7041 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
7042 if (cpu == cpumask_first(d->this_core_map)) 7042 if (cpu == cpumask_first(d->this_core_map))
7043 init_sched_build_groups(d->this_core_map, cpu_map, 7043 init_sched_build_groups(d->this_core_map, cpu_map,
7044 &cpu_to_core_group, 7044 &cpu_to_core_group,
7045 d->send_covered, d->tmpmask); 7045 d->send_covered, d->tmpmask);
7046 break; 7046 break;
7047 #endif 7047 #endif
7048 case SD_LV_CPU: /* set up physical groups */ 7048 case SD_LV_CPU: /* set up physical groups */
7049 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7049 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7050 if (!cpumask_empty(d->nodemask)) 7050 if (!cpumask_empty(d->nodemask))
7051 init_sched_build_groups(d->nodemask, cpu_map, 7051 init_sched_build_groups(d->nodemask, cpu_map,
7052 &cpu_to_phys_group, 7052 &cpu_to_phys_group,
7053 d->send_covered, d->tmpmask); 7053 d->send_covered, d->tmpmask);
7054 break; 7054 break;
7055 #ifdef CONFIG_NUMA 7055 #ifdef CONFIG_NUMA
7056 case SD_LV_ALLNODES: 7056 case SD_LV_ALLNODES:
7057 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7057 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
7058 d->send_covered, d->tmpmask); 7058 d->send_covered, d->tmpmask);
7059 break; 7059 break;
7060 #endif 7060 #endif
7061 default: 7061 default:
7062 break; 7062 break;
7063 } 7063 }
7064 } 7064 }
7065 7065
7066 /* 7066 /*
7067 * Build sched domains for a given set of cpus and attach the sched domains 7067 * Build sched domains for a given set of cpus and attach the sched domains
7068 * to the individual cpus 7068 * to the individual cpus
7069 */ 7069 */
7070 static int __build_sched_domains(const struct cpumask *cpu_map, 7070 static int __build_sched_domains(const struct cpumask *cpu_map,
7071 struct sched_domain_attr *attr) 7071 struct sched_domain_attr *attr)
7072 { 7072 {
7073 enum s_alloc alloc_state = sa_none; 7073 enum s_alloc alloc_state = sa_none;
7074 struct s_data d; 7074 struct s_data d;
7075 struct sched_domain *sd; 7075 struct sched_domain *sd;
7076 int i; 7076 int i;
7077 #ifdef CONFIG_NUMA 7077 #ifdef CONFIG_NUMA
7078 d.sd_allnodes = 0; 7078 d.sd_allnodes = 0;
7079 #endif 7079 #endif
7080 7080
7081 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7081 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7082 if (alloc_state != sa_rootdomain) 7082 if (alloc_state != sa_rootdomain)
7083 goto error; 7083 goto error;
7084 alloc_state = sa_sched_groups; 7084 alloc_state = sa_sched_groups;
7085 7085
7086 /* 7086 /*
7087 * Set up domains for cpus specified by the cpu_map. 7087 * Set up domains for cpus specified by the cpu_map.
7088 */ 7088 */
7089 for_each_cpu(i, cpu_map) { 7089 for_each_cpu(i, cpu_map) {
7090 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), 7090 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
7091 cpu_map); 7091 cpu_map);
7092 7092
7093 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7093 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
7094 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7094 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7095 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); 7095 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7096 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); 7096 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7097 } 7097 }
7098 7098
7099 for_each_cpu(i, cpu_map) { 7099 for_each_cpu(i, cpu_map) {
7100 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7100 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7101 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7101 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7102 } 7102 }
7103 7103
7104 /* Set up physical groups */ 7104 /* Set up physical groups */
7105 for (i = 0; i < nr_node_ids; i++) 7105 for (i = 0; i < nr_node_ids; i++)
7106 build_sched_groups(&d, SD_LV_CPU, cpu_map, i); 7106 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
7107 7107
7108 #ifdef CONFIG_NUMA 7108 #ifdef CONFIG_NUMA
7109 /* Set up node groups */ 7109 /* Set up node groups */
7110 if (d.sd_allnodes) 7110 if (d.sd_allnodes)
7111 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); 7111 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7112 7112
7113 for (i = 0; i < nr_node_ids; i++) 7113 for (i = 0; i < nr_node_ids; i++)
7114 if (build_numa_sched_groups(&d, cpu_map, i)) 7114 if (build_numa_sched_groups(&d, cpu_map, i))
7115 goto error; 7115 goto error;
7116 #endif 7116 #endif
7117 7117
7118 /* Calculate CPU power for physical packages and nodes */ 7118 /* Calculate CPU power for physical packages and nodes */
7119 #ifdef CONFIG_SCHED_SMT 7119 #ifdef CONFIG_SCHED_SMT
7120 for_each_cpu(i, cpu_map) { 7120 for_each_cpu(i, cpu_map) {
7121 sd = &per_cpu(cpu_domains, i).sd; 7121 sd = &per_cpu(cpu_domains, i).sd;
7122 init_sched_groups_power(i, sd); 7122 init_sched_groups_power(i, sd);
7123 } 7123 }
7124 #endif 7124 #endif
7125 #ifdef CONFIG_SCHED_MC 7125 #ifdef CONFIG_SCHED_MC
7126 for_each_cpu(i, cpu_map) { 7126 for_each_cpu(i, cpu_map) {
7127 sd = &per_cpu(core_domains, i).sd; 7127 sd = &per_cpu(core_domains, i).sd;
7128 init_sched_groups_power(i, sd); 7128 init_sched_groups_power(i, sd);
7129 } 7129 }
7130 #endif 7130 #endif
7131 7131
7132 for_each_cpu(i, cpu_map) { 7132 for_each_cpu(i, cpu_map) {
7133 sd = &per_cpu(phys_domains, i).sd; 7133 sd = &per_cpu(phys_domains, i).sd;
7134 init_sched_groups_power(i, sd); 7134 init_sched_groups_power(i, sd);
7135 } 7135 }
7136 7136
7137 #ifdef CONFIG_NUMA 7137 #ifdef CONFIG_NUMA
7138 for (i = 0; i < nr_node_ids; i++) 7138 for (i = 0; i < nr_node_ids; i++)
7139 init_numa_sched_groups_power(d.sched_group_nodes[i]); 7139 init_numa_sched_groups_power(d.sched_group_nodes[i]);
7140 7140
7141 if (d.sd_allnodes) { 7141 if (d.sd_allnodes) {
7142 struct sched_group *sg; 7142 struct sched_group *sg;
7143 7143
7144 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7144 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
7145 d.tmpmask); 7145 d.tmpmask);
7146 init_numa_sched_groups_power(sg); 7146 init_numa_sched_groups_power(sg);
7147 } 7147 }
7148 #endif 7148 #endif
7149 7149
7150 /* Attach the domains */ 7150 /* Attach the domains */
7151 for_each_cpu(i, cpu_map) { 7151 for_each_cpu(i, cpu_map) {
7152 #ifdef CONFIG_SCHED_SMT 7152 #ifdef CONFIG_SCHED_SMT
7153 sd = &per_cpu(cpu_domains, i).sd; 7153 sd = &per_cpu(cpu_domains, i).sd;
7154 #elif defined(CONFIG_SCHED_MC) 7154 #elif defined(CONFIG_SCHED_MC)
7155 sd = &per_cpu(core_domains, i).sd; 7155 sd = &per_cpu(core_domains, i).sd;
7156 #else 7156 #else
7157 sd = &per_cpu(phys_domains, i).sd; 7157 sd = &per_cpu(phys_domains, i).sd;
7158 #endif 7158 #endif
7159 cpu_attach_domain(sd, d.rd, i); 7159 cpu_attach_domain(sd, d.rd, i);
7160 } 7160 }
7161 7161
7162 d.sched_group_nodes = NULL; /* don't free this we still need it */ 7162 d.sched_group_nodes = NULL; /* don't free this we still need it */
7163 __free_domain_allocs(&d, sa_tmpmask, cpu_map); 7163 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7164 return 0; 7164 return 0;
7165 7165
7166 error: 7166 error:
7167 __free_domain_allocs(&d, alloc_state, cpu_map); 7167 __free_domain_allocs(&d, alloc_state, cpu_map);
7168 return -ENOMEM; 7168 return -ENOMEM;
7169 } 7169 }
7170 7170
7171 static int build_sched_domains(const struct cpumask *cpu_map) 7171 static int build_sched_domains(const struct cpumask *cpu_map)
7172 { 7172 {
7173 return __build_sched_domains(cpu_map, NULL); 7173 return __build_sched_domains(cpu_map, NULL);
7174 } 7174 }
7175 7175
7176 static cpumask_var_t *doms_cur; /* current sched domains */ 7176 static cpumask_var_t *doms_cur; /* current sched domains */
7177 static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7177 static int ndoms_cur; /* number of sched domains in 'doms_cur' */
7178 static struct sched_domain_attr *dattr_cur; 7178 static struct sched_domain_attr *dattr_cur;
7179 /* attributes of custom domains in 'doms_cur' */ 7179 /* attributes of custom domains in 'doms_cur' */
7180 7180
7181 /* 7181 /*
7182 * Special case: If a kmalloc of a doms_cur partition (array of 7182 * Special case: If a kmalloc of a doms_cur partition (array of
7183 * cpumask) fails, then fallback to a single sched domain, 7183 * cpumask) fails, then fallback to a single sched domain,
7184 * as determined by the single cpumask fallback_doms. 7184 * as determined by the single cpumask fallback_doms.
7185 */ 7185 */
7186 static cpumask_var_t fallback_doms; 7186 static cpumask_var_t fallback_doms;
7187 7187
7188 /* 7188 /*
7189 * arch_update_cpu_topology lets virtualized architectures update the 7189 * arch_update_cpu_topology lets virtualized architectures update the
7190 * cpu core maps. It is supposed to return 1 if the topology changed 7190 * cpu core maps. It is supposed to return 1 if the topology changed
7191 * or 0 if it stayed the same. 7191 * or 0 if it stayed the same.
7192 */ 7192 */
7193 int __attribute__((weak)) arch_update_cpu_topology(void) 7193 int __attribute__((weak)) arch_update_cpu_topology(void)
7194 { 7194 {
7195 return 0; 7195 return 0;
7196 } 7196 }
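/*
 * Editor's sketch, not part of the kernel source: roughly how a virtualized
 * architecture might override the weak hook above.  Both helpers called here
 * (hypervisor_topology_changed() and rebuild_core_maps()) are hypothetical
 * names used purely for illustration; the real s390 implementation differs.
 */
int arch_update_cpu_topology(void)
{
	if (!hypervisor_topology_changed())	/* hypothetical helper */
		return 0;			/* topology unchanged */

	rebuild_core_maps();			/* hypothetical helper */
	return 1;				/* ask the scheduler to rebuild domains */
}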
7197 7197
7198 cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 7198 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
7199 { 7199 {
7200 int i; 7200 int i;
7201 cpumask_var_t *doms; 7201 cpumask_var_t *doms;
7202 7202
7203 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 7203 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
7204 if (!doms) 7204 if (!doms)
7205 return NULL; 7205 return NULL;
7206 for (i = 0; i < ndoms; i++) { 7206 for (i = 0; i < ndoms; i++) {
7207 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 7207 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
7208 free_sched_domains(doms, i); 7208 free_sched_domains(doms, i);
7209 return NULL; 7209 return NULL;
7210 } 7210 }
7211 } 7211 }
7212 return doms; 7212 return doms;
7213 } 7213 }
7214 7214
7215 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 7215 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7216 { 7216 {
7217 unsigned int i; 7217 unsigned int i;
7218 for (i = 0; i < ndoms; i++) 7218 for (i = 0; i < ndoms; i++)
7219 free_cpumask_var(doms[i]); 7219 free_cpumask_var(doms[i]);
7220 kfree(doms); 7220 kfree(doms);
7221 } 7221 }
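/*
 * Editor's usage sketch, not part of the kernel source: the two helpers
 * above are meant to be used as a pair.  The mask contents below are
 * illustrative only; real callers (e.g. the cpuset code) compute them from
 * their own configuration.
 */
static int example_alloc_two_masks(void)
{
	cpumask_var_t *doms = alloc_sched_domains(2);

	if (!doms)
		return -ENOMEM;		/* partial allocations were already freed */

	cpumask_copy(doms[0], cpumask_of(0));			/* first partition: CPU 0 */
	cpumask_andnot(doms[1], cpu_online_mask, doms[0]);	/* second: everything else */

	/* ... either hand 'doms' to partition_sched_domains(), which takes
	 * ownership, or release it with the matching free helper: */
	free_sched_domains(doms, 2);
	return 0;
}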
7222 7222
7223 /* 7223 /*
7224 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 7224 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
7225 * For now this just excludes isolated cpus, but could be used to 7225 * For now this just excludes isolated cpus, but could be used to
7226 * exclude other special cases in the future. 7226 * exclude other special cases in the future.
7227 */ 7227 */
7228 static int arch_init_sched_domains(const struct cpumask *cpu_map) 7228 static int arch_init_sched_domains(const struct cpumask *cpu_map)
7229 { 7229 {
7230 int err; 7230 int err;
7231 7231
7232 arch_update_cpu_topology(); 7232 arch_update_cpu_topology();
7233 ndoms_cur = 1; 7233 ndoms_cur = 1;
7234 doms_cur = alloc_sched_domains(ndoms_cur); 7234 doms_cur = alloc_sched_domains(ndoms_cur);
7235 if (!doms_cur) 7235 if (!doms_cur)
7236 doms_cur = &fallback_doms; 7236 doms_cur = &fallback_doms;
7237 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7237 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7238 dattr_cur = NULL; 7238 dattr_cur = NULL;
7239 err = build_sched_domains(doms_cur[0]); 7239 err = build_sched_domains(doms_cur[0]);
7240 register_sched_domain_sysctl(); 7240 register_sched_domain_sysctl();
7241 7241
7242 return err; 7242 return err;
7243 } 7243 }
7244 7244
7245 static void arch_destroy_sched_domains(const struct cpumask *cpu_map, 7245 static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7246 struct cpumask *tmpmask) 7246 struct cpumask *tmpmask)
7247 { 7247 {
7248 free_sched_groups(cpu_map, tmpmask); 7248 free_sched_groups(cpu_map, tmpmask);
7249 } 7249 }
7250 7250
7251 /* 7251 /*
7252 * Detach sched domains from a group of cpus specified in cpu_map. 7252 * Detach sched domains from a group of cpus specified in cpu_map.
7253 * These cpus will now be attached to the NULL domain. 7253 * These cpus will now be attached to the NULL domain.
7254 */ 7254 */
7255 static void detach_destroy_domains(const struct cpumask *cpu_map) 7255 static void detach_destroy_domains(const struct cpumask *cpu_map)
7256 { 7256 {
7257 /* Static to save stack; safe because the hotplug lock is held. */ 7257 /* Static to save stack; safe because the hotplug lock is held. */
7258 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); 7258 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7259 int i; 7259 int i;
7260 7260
7261 for_each_cpu(i, cpu_map) 7261 for_each_cpu(i, cpu_map)
7262 cpu_attach_domain(NULL, &def_root_domain, i); 7262 cpu_attach_domain(NULL, &def_root_domain, i);
7263 synchronize_sched(); 7263 synchronize_sched();
7264 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); 7264 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7265 } 7265 }
7266 7266
7267 /* handle null as "default" */ 7267 /* handle null as "default" */
7268 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 7268 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7269 struct sched_domain_attr *new, int idx_new) 7269 struct sched_domain_attr *new, int idx_new)
7270 { 7270 {
7271 struct sched_domain_attr tmp; 7271 struct sched_domain_attr tmp;
7272 7272
7273 /* fast path */ 7273 /* fast path */
7274 if (!new && !cur) 7274 if (!new && !cur)
7275 return 1; 7275 return 1;
7276 7276
7277 tmp = SD_ATTR_INIT; 7277 tmp = SD_ATTR_INIT;
7278 return !memcmp(cur ? (cur + idx_cur) : &tmp, 7278 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7279 new ? (new + idx_new) : &tmp, 7279 new ? (new + idx_new) : &tmp,
7280 sizeof(struct sched_domain_attr)); 7280 sizeof(struct sched_domain_attr));
7281 } 7281 }
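/*
 * Editor's illustration, userspace C rather than kernel code: the same
 * "a NULL array means every entry is the default" comparison pattern that
 * dattrs_equal() uses above, reduced to a standalone program.  The struct
 * and field names are invented for the example.
 */
#include <stdio.h>
#include <string.h>

struct attr { int relax_level; };

static const struct attr attr_default = { .relax_level = -1 };

static int attrs_equal(const struct attr *cur, int i, const struct attr *new, int j)
{
	if (!cur && !new)
		return 1;			/* fast path: both are all-default */
	return !memcmp(cur ? cur + i : &attr_default,
		       new ? new + j : &attr_default, sizeof(struct attr));
}

int main(void)
{
	struct attr a[1] = { { .relax_level = -1 } };

	/* a[0] holds the default value, so it compares equal to a NULL array */
	printf("%d\n", attrs_equal(a, 0, NULL, 0));	/* prints 1 */
	return 0;
}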
7282 7282
7283 /* 7283 /*
7284 * Partition sched domains as specified by the 'ndoms_new' 7284 * Partition sched domains as specified by the 'ndoms_new'
7285 * cpumasks in the array doms_new[]. This compares 7285 * cpumasks in the array doms_new[]. This compares
7286 * doms_new[] to the current sched domain partitioning, doms_cur[]. 7286 * doms_new[] to the current sched domain partitioning, doms_cur[].
7287 * It destroys each deleted domain and builds each new domain. 7287 * It destroys each deleted domain and builds each new domain.
7288 * 7288 *
7289 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 7289 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
7290 * The masks don't intersect (don't overlap). We should set up one 7290 * The masks don't intersect (don't overlap). We should set up one
7291 * sched domain for each mask. CPUs not in any of the cpumasks will 7291 * sched domain for each mask. CPUs not in any of the cpumasks will
7292 * not be load balanced. If the same cpumask appears both in the 7292 * not be load balanced. If the same cpumask appears both in the
7293 * current 'doms_cur' domains and in the new 'doms_new', we can leave 7293 * current 'doms_cur' domains and in the new 'doms_new', we can leave
7294 * it as it is. 7294 * it as it is.
7295 * 7295 *
7296 * The passed in 'doms_new' should be allocated using 7296 * The passed in 'doms_new' should be allocated using
7297 * alloc_sched_domains. This routine takes ownership of it and will 7297 * alloc_sched_domains. This routine takes ownership of it and will
7298 * free_sched_domains it when done with it. If the caller failed the 7298 * free_sched_domains it when done with it. If the caller failed the
7299 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 7299 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
7300 * and partition_sched_domains() will fall back to the single partition 7300 * and partition_sched_domains() will fall back to the single partition
7301 * 'fallback_doms'; this also forces the domains to be rebuilt. 7301 * 'fallback_doms'; this also forces the domains to be rebuilt.
7302 * 7302 *
7303 * If doms_new == NULL it will be replaced with cpu_online_mask. 7303 * If doms_new == NULL it will be replaced with cpu_online_mask.
7304 * ndoms_new == 0 is a special case for destroying existing domains, 7304 * ndoms_new == 0 is a special case for destroying existing domains,
7305 * and it will not create the default domain. 7305 * and it will not create the default domain.
7306 * 7306 *
7307 * Call with hotplug lock held 7307 * Call with hotplug lock held
7308 */ 7308 */
7309 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 7309 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
7310 struct sched_domain_attr *dattr_new) 7310 struct sched_domain_attr *dattr_new)
7311 { 7311 {
7312 int i, j, n; 7312 int i, j, n;
7313 int new_topology; 7313 int new_topology;
7314 7314
7315 mutex_lock(&sched_domains_mutex); 7315 mutex_lock(&sched_domains_mutex);
7316 7316
7317 /* always unregister in case we don't destroy any domains */ 7317 /* always unregister in case we don't destroy any domains */
7318 unregister_sched_domain_sysctl(); 7318 unregister_sched_domain_sysctl();
7319 7319
7320 /* Let architecture update cpu core mappings. */ 7320 /* Let architecture update cpu core mappings. */
7321 new_topology = arch_update_cpu_topology(); 7321 new_topology = arch_update_cpu_topology();
7322 7322
7323 n = doms_new ? ndoms_new : 0; 7323 n = doms_new ? ndoms_new : 0;
7324 7324
7325 /* Destroy deleted domains */ 7325 /* Destroy deleted domains */
7326 for (i = 0; i < ndoms_cur; i++) { 7326 for (i = 0; i < ndoms_cur; i++) {
7327 for (j = 0; j < n && !new_topology; j++) { 7327 for (j = 0; j < n && !new_topology; j++) {
7328 if (cpumask_equal(doms_cur[i], doms_new[j]) 7328 if (cpumask_equal(doms_cur[i], doms_new[j])
7329 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7329 && dattrs_equal(dattr_cur, i, dattr_new, j))
7330 goto match1; 7330 goto match1;
7331 } 7331 }
7332 /* no match - a current sched domain not in new doms_new[] */ 7332 /* no match - a current sched domain not in new doms_new[] */
7333 detach_destroy_domains(doms_cur[i]); 7333 detach_destroy_domains(doms_cur[i]);
7334 match1: 7334 match1:
7335 ; 7335 ;
7336 } 7336 }
7337 7337
7338 if (doms_new == NULL) { 7338 if (doms_new == NULL) {
7339 ndoms_cur = 0; 7339 ndoms_cur = 0;
7340 doms_new = &fallback_doms; 7340 doms_new = &fallback_doms;
7341 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 7341 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
7342 WARN_ON_ONCE(dattr_new); 7342 WARN_ON_ONCE(dattr_new);
7343 } 7343 }
7344 7344
7345 /* Build new domains */ 7345 /* Build new domains */
7346 for (i = 0; i < ndoms_new; i++) { 7346 for (i = 0; i < ndoms_new; i++) {
7347 for (j = 0; j < ndoms_cur && !new_topology; j++) { 7347 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7348 if (cpumask_equal(doms_new[i], doms_cur[j]) 7348 if (cpumask_equal(doms_new[i], doms_cur[j])
7349 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7349 && dattrs_equal(dattr_new, i, dattr_cur, j))
7350 goto match2; 7350 goto match2;
7351 } 7351 }
7352 /* no match - add a new doms_new */ 7352 /* no match - add a new doms_new */
7353 __build_sched_domains(doms_new[i], 7353 __build_sched_domains(doms_new[i],
7354 dattr_new ? dattr_new + i : NULL); 7354 dattr_new ? dattr_new + i : NULL);
7355 match2: 7355 match2:
7356 ; 7356 ;
7357 } 7357 }
7358 7358
7359 /* Remember the new sched domains */ 7359 /* Remember the new sched domains */
7360 if (doms_cur != &fallback_doms) 7360 if (doms_cur != &fallback_doms)
7361 free_sched_domains(doms_cur, ndoms_cur); 7361 free_sched_domains(doms_cur, ndoms_cur);
7362 kfree(dattr_cur); /* kfree(NULL) is safe */ 7362 kfree(dattr_cur); /* kfree(NULL) is safe */
7363 doms_cur = doms_new; 7363 doms_cur = doms_new;
7364 dattr_cur = dattr_new; 7364 dattr_cur = dattr_new;
7365 ndoms_cur = ndoms_new; 7365 ndoms_cur = ndoms_new;
7366 7366
7367 register_sched_domain_sysctl(); 7367 register_sched_domain_sysctl();
7368 7368
7369 mutex_unlock(&sched_domains_mutex); 7369 mutex_unlock(&sched_domains_mutex);
7370 } 7370 }
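/*
 * Editor's caller sketch, not part of the kernel source: exercising the
 * contract documented above partition_sched_domains().  The two-way split
 * chosen here is illustrative; on success the function takes ownership of
 * 'doms', and a failed allocation uses the documented (1, NULL, NULL) form.
 */
static void example_repartition(void)
{
	cpumask_var_t *doms = alloc_sched_domains(2);

	get_online_cpus();			/* hotplug lock must be held */
	if (!doms) {
		/* fall back to one partition built from cpu_active_mask */
		partition_sched_domains(1, NULL, NULL);
	} else {
		cpumask_copy(doms[0], cpumask_of(0));
		cpumask_andnot(doms[1], cpu_active_mask, doms[0]);
		partition_sched_domains(2, doms, NULL);	/* takes ownership of doms */
	}
	put_online_cpus();
}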
7371 7371
7372 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7372 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7373 static void arch_reinit_sched_domains(void) 7373 static void arch_reinit_sched_domains(void)
7374 { 7374 {
7375 get_online_cpus(); 7375 get_online_cpus();
7376 7376
7377 /* Destroy domains first to force the rebuild */ 7377 /* Destroy domains first to force the rebuild */
7378 partition_sched_domains(0, NULL, NULL); 7378 partition_sched_domains(0, NULL, NULL);
7379 7379
7380 rebuild_sched_domains(); 7380 rebuild_sched_domains();
7381 put_online_cpus(); 7381 put_online_cpus();
7382 } 7382 }
7383 7383
7384 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 7384 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7385 { 7385 {
7386 unsigned int level = 0; 7386 unsigned int level = 0;
7387 7387
7388 if (sscanf(buf, "%u", &level) != 1) 7388 if (sscanf(buf, "%u", &level) != 1)
7389 return -EINVAL; 7389 return -EINVAL;
7390 7390
7391 /* 7391 /*
7392 * level is always positive, so don't check for 7392 * level is always positive, so don't check for
7393 * level < POWERSAVINGS_BALANCE_NONE (which is 0). 7393 * level < POWERSAVINGS_BALANCE_NONE (which is 0).
7394 * What happens on a 0 or 1 byte write: 7394 * What happens on a 0 or 1 byte write:
7395 * do we need to check count as well? 7395 * do we need to check count as well?
7396 */ 7396 */
7397 7397
7398 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) 7398 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
7399 return -EINVAL; 7399 return -EINVAL;
7400 7400
7401 if (smt) 7401 if (smt)
7402 sched_smt_power_savings = level; 7402 sched_smt_power_savings = level;
7403 else 7403 else
7404 sched_mc_power_savings = level; 7404 sched_mc_power_savings = level;
7405 7405
7406 arch_reinit_sched_domains(); 7406 arch_reinit_sched_domains();
7407 7407
7408 return count; 7408 return count;
7409 } 7409 }
7410 7410
7411 #ifdef CONFIG_SCHED_MC 7411 #ifdef CONFIG_SCHED_MC
7412 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 7412 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7413 struct sysdev_class_attribute *attr, 7413 struct sysdev_class_attribute *attr,
7414 char *page) 7414 char *page)
7415 { 7415 {
7416 return sprintf(page, "%u\n", sched_mc_power_savings); 7416 return sprintf(page, "%u\n", sched_mc_power_savings);
7417 } 7417 }
7418 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 7418 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7419 struct sysdev_class_attribute *attr, 7419 struct sysdev_class_attribute *attr,
7420 const char *buf, size_t count) 7420 const char *buf, size_t count)
7421 { 7421 {
7422 return sched_power_savings_store(buf, count, 0); 7422 return sched_power_savings_store(buf, count, 0);
7423 } 7423 }
7424 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, 7424 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
7425 sched_mc_power_savings_show, 7425 sched_mc_power_savings_show,
7426 sched_mc_power_savings_store); 7426 sched_mc_power_savings_store);
7427 #endif 7427 #endif
7428 7428
7429 #ifdef CONFIG_SCHED_SMT 7429 #ifdef CONFIG_SCHED_SMT
7430 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 7430 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7431 struct sysdev_class_attribute *attr, 7431 struct sysdev_class_attribute *attr,
7432 char *page) 7432 char *page)
7433 { 7433 {
7434 return sprintf(page, "%u\n", sched_smt_power_savings); 7434 return sprintf(page, "%u\n", sched_smt_power_savings);
7435 } 7435 }
7436 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 7436 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7437 struct sysdev_class_attribute *attr, 7437 struct sysdev_class_attribute *attr,
7438 const char *buf, size_t count) 7438 const char *buf, size_t count)
7439 { 7439 {
7440 return sched_power_savings_store(buf, count, 1); 7440 return sched_power_savings_store(buf, count, 1);
7441 } 7441 }
7442 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, 7442 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7443 sched_smt_power_savings_show, 7443 sched_smt_power_savings_show,
7444 sched_smt_power_savings_store); 7444 sched_smt_power_savings_store);
7445 #endif 7445 #endif
7446 7446
7447 int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 7447 int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7448 { 7448 {
7449 int err = 0; 7449 int err = 0;
7450 7450
7451 #ifdef CONFIG_SCHED_SMT 7451 #ifdef CONFIG_SCHED_SMT
7452 if (smt_capable()) 7452 if (smt_capable())
7453 err = sysfs_create_file(&cls->kset.kobj, 7453 err = sysfs_create_file(&cls->kset.kobj,
7454 &attr_sched_smt_power_savings.attr); 7454 &attr_sched_smt_power_savings.attr);
7455 #endif 7455 #endif
7456 #ifdef CONFIG_SCHED_MC 7456 #ifdef CONFIG_SCHED_MC
7457 if (!err && mc_capable()) 7457 if (!err && mc_capable())
7458 err = sysfs_create_file(&cls->kset.kobj, 7458 err = sysfs_create_file(&cls->kset.kobj,
7459 &attr_sched_mc_power_savings.attr); 7459 &attr_sched_mc_power_savings.attr);
7460 #endif 7460 #endif
7461 return err; 7461 return err;
7462 } 7462 }
7463 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7463 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
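/*
 * Editor's illustration, userspace C rather than kernel code: setting the
 * multi-core power-savings level through the sysfs attribute created above.
 * The path shown is where the sched_mc_power_savings class attribute is
 * normally exposed; it only exists with CONFIG_SCHED_MC on mc_capable()
 * hardware, and the value 1 is assumed to mean the basic balance level.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/devices/system/cpu/sched_mc_power_savings", "w");

	if (!f) {
		perror("sched_mc_power_savings");
		return 1;
	}
	fprintf(f, "1\n");		/* parsed by sched_power_savings_store() */
	fclose(f);
	return 0;
}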
7464 7464
7465 #ifndef CONFIG_CPUSETS 7465 #ifndef CONFIG_CPUSETS
7466 /* 7466 /*
7467 * Add online and remove offline CPUs from the scheduler domains. 7467 * Add online and remove offline CPUs from the scheduler domains.
7468 * When cpusets are enabled they take over this function. 7468 * When cpusets are enabled they take over this function.
7469 */ 7469 */
7470 static int update_sched_domains(struct notifier_block *nfb, 7470 static int update_sched_domains(struct notifier_block *nfb,
7471 unsigned long action, void *hcpu) 7471 unsigned long action, void *hcpu)
7472 { 7472 {
7473 switch (action) { 7473 switch (action) {
7474 case CPU_ONLINE: 7474 case CPU_ONLINE:
7475 case CPU_ONLINE_FROZEN: 7475 case CPU_ONLINE_FROZEN:
7476 case CPU_DOWN_PREPARE: 7476 case CPU_DOWN_PREPARE:
7477 case CPU_DOWN_PREPARE_FROZEN: 7477 case CPU_DOWN_PREPARE_FROZEN:
7478 case CPU_DOWN_FAILED: 7478 case CPU_DOWN_FAILED:
7479 case CPU_DOWN_FAILED_FROZEN: 7479 case CPU_DOWN_FAILED_FROZEN:
7480 partition_sched_domains(1, NULL, NULL); 7480 partition_sched_domains(1, NULL, NULL);
7481 return NOTIFY_OK; 7481 return NOTIFY_OK;
7482 7482
7483 default: 7483 default:
7484 return NOTIFY_DONE; 7484 return NOTIFY_DONE;
7485 } 7485 }
7486 } 7486 }
7487 #endif 7487 #endif
7488 7488
7489 static int update_runtime(struct notifier_block *nfb, 7489 static int update_runtime(struct notifier_block *nfb,
7490 unsigned long action, void *hcpu) 7490 unsigned long action, void *hcpu)
7491 { 7491 {
7492 int cpu = (int)(long)hcpu; 7492 int cpu = (int)(long)hcpu;
7493 7493
7494 switch (action) { 7494 switch (action) {
7495 case CPU_DOWN_PREPARE: 7495 case CPU_DOWN_PREPARE:
7496 case CPU_DOWN_PREPARE_FROZEN: 7496 case CPU_DOWN_PREPARE_FROZEN:
7497 disable_runtime(cpu_rq(cpu)); 7497 disable_runtime(cpu_rq(cpu));
7498 return NOTIFY_OK; 7498 return NOTIFY_OK;
7499 7499
7500 case CPU_DOWN_FAILED: 7500 case CPU_DOWN_FAILED:
7501 case CPU_DOWN_FAILED_FROZEN: 7501 case CPU_DOWN_FAILED_FROZEN:
7502 case CPU_ONLINE: 7502 case CPU_ONLINE:
7503 case CPU_ONLINE_FROZEN: 7503 case CPU_ONLINE_FROZEN:
7504 enable_runtime(cpu_rq(cpu)); 7504 enable_runtime(cpu_rq(cpu));
7505 return NOTIFY_OK; 7505 return NOTIFY_OK;
7506 7506
7507 default: 7507 default:
7508 return NOTIFY_DONE; 7508 return NOTIFY_DONE;
7509 } 7509 }
7510 } 7510 }
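/*
 * Editor's sketch, not part of the kernel source: update_runtime() above is
 * registered with hotcpu_notifier() in sched_init_smp() below.  A minimal
 * notifier using the same mechanism might look like this; the callback name
 * and the message are invented for the example.
 */
static int example_cpu_callback(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	switch (action) {
	case CPU_DOWN_PREPARE:
		printk(KERN_INFO "cpu %ld is about to go offline\n", (long)hcpu);
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

static int __init example_register_notifier(void)
{
	hotcpu_notifier(example_cpu_callback, 0);	/* priority 0 */
	return 0;
}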
7511 7511
7512 void __init sched_init_smp(void) 7512 void __init sched_init_smp(void)
7513 { 7513 {
7514 cpumask_var_t non_isolated_cpus; 7514 cpumask_var_t non_isolated_cpus;
7515 7515
7516 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7516 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7517 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7517 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7518 7518
7519 #if defined(CONFIG_NUMA) 7519 #if defined(CONFIG_NUMA)
7520 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 7520 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7521 GFP_KERNEL); 7521 GFP_KERNEL);
7522 BUG_ON(sched_group_nodes_bycpu == NULL); 7522 BUG_ON(sched_group_nodes_bycpu == NULL);
7523 #endif 7523 #endif
7524 get_online_cpus(); 7524 get_online_cpus();
7525 mutex_lock(&sched_domains_mutex); 7525 mutex_lock(&sched_domains_mutex);
7526 arch_init_sched_domains(cpu_active_mask); 7526 arch_init_sched_domains(cpu_active_mask);
7527 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7527 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7528 if (cpumask_empty(non_isolated_cpus)) 7528 if (cpumask_empty(non_isolated_cpus))
7529 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7529 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
7530 mutex_unlock(&sched_domains_mutex); 7530 mutex_unlock(&sched_domains_mutex);
7531 put_online_cpus(); 7531 put_online_cpus();
7532 7532
7533 #ifndef CONFIG_CPUSETS 7533 #ifndef CONFIG_CPUSETS
7534 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7534 /* XXX: Theoretical race here - CPU may be hotplugged now */
7535 hotcpu_notifier(update_sched_domains, 0); 7535 hotcpu_notifier(update_sched_domains, 0);
7536 #endif 7536 #endif
7537 7537
7538 /* RT runtime code needs to handle some hotplug events */ 7538 /* RT runtime code needs to handle some hotplug events */
7539 hotcpu_notifier(update_runtime, 0); 7539 hotcpu_notifier(update_runtime, 0);
7540 7540
7541 init_hrtick(); 7541 init_hrtick();
7542 7542
7543 /* Move init over to a non-isolated CPU */ 7543 /* Move init over to a non-isolated CPU */
7544 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 7544 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
7545 BUG(); 7545 BUG();
7546 sched_init_granularity(); 7546 sched_init_granularity();
7547 free_cpumask_var(non_isolated_cpus); 7547 free_cpumask_var(non_isolated_cpus);
7548 7548
7549 init_sched_rt_class(); 7549 init_sched_rt_class();
7550 } 7550 }
7551 #else 7551 #else
7552 void __init sched_init_smp(void) 7552 void __init sched_init_smp(void)
7553 { 7553 {
7554 sched_init_granularity(); 7554 sched_init_granularity();
7555 } 7555 }
7556 #endif /* CONFIG_SMP */ 7556 #endif /* CONFIG_SMP */
7557 7557
7558 const_debug unsigned int sysctl_timer_migration = 1; 7558 const_debug unsigned int sysctl_timer_migration = 1;
7559 7559
7560 int in_sched_functions(unsigned long addr) 7560 int in_sched_functions(unsigned long addr)
7561 { 7561 {
7562 return in_lock_functions(addr) || 7562 return in_lock_functions(addr) ||
7563 (addr >= (unsigned long)__sched_text_start 7563 (addr >= (unsigned long)__sched_text_start
7564 && addr < (unsigned long)__sched_text_end); 7564 && addr < (unsigned long)__sched_text_end);
7565 } 7565 }
7566 7566
7567 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 7567 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7568 { 7568 {
7569 cfs_rq->tasks_timeline = RB_ROOT; 7569 cfs_rq->tasks_timeline = RB_ROOT;
7570 INIT_LIST_HEAD(&cfs_rq->tasks); 7570 INIT_LIST_HEAD(&cfs_rq->tasks);
7571 #ifdef CONFIG_FAIR_GROUP_SCHED 7571 #ifdef CONFIG_FAIR_GROUP_SCHED
7572 cfs_rq->rq = rq; 7572 cfs_rq->rq = rq;
7573 #endif 7573 #endif
7574 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7574 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7575 } 7575 }
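/*
 * Editor's illustration, userspace C rather than kernel code: why starting
 * min_vruntime at (u64)(-(1LL << 20)) above is harmless.  The value sits
 * just below the 64-bit wrap point, so any wraparound bug shows up early,
 * while orderings computed from the signed difference stay correct across
 * the wrap.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t min_vruntime = (uint64_t)(-(1LL << 20));	/* near the wrap point */
	uint64_t later = min_vruntime + (1ULL << 21);		/* has already wrapped */

	/* the signed delta still orders the two values correctly */
	if ((int64_t)(later - min_vruntime) > 0)
		printf("ordering survives the u64 wraparound\n");
	return 0;
}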
7576 7576
7577 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 7577 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7578 { 7578 {
7579 struct rt_prio_array *array; 7579 struct rt_prio_array *array;
7580 int i; 7580 int i;
7581 7581
7582 array = &rt_rq->active; 7582 array = &rt_rq->active;
7583 for (i = 0; i < MAX_RT_PRIO; i++) { 7583 for (i = 0; i < MAX_RT_PRIO; i++) {
7584 INIT_LIST_HEAD(array->queue + i); 7584 INIT_LIST_HEAD(array->queue + i);
7585 __clear_bit(i, array->bitmap); 7585 __clear_bit(i, array->bitmap);
7586 } 7586 }
7587 /* delimiter for bitsearch: */ 7587 /* delimiter for bitsearch: */
7588 __set_bit(MAX_RT_PRIO, array->bitmap); 7588 __set_bit(MAX_RT_PRIO, array->bitmap);
7589 7589
7590 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 7590 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
7591 rt_rq->highest_prio.curr = MAX_RT_PRIO; 7591 rt_rq->highest_prio.curr = MAX_RT_PRIO;
7592 #ifdef CONFIG_SMP 7592 #ifdef CONFIG_SMP
7593 rt_rq->highest_prio.next = MAX_RT_PRIO; 7593 rt_rq->highest_prio.next = MAX_RT_PRIO;
7594 #endif 7594 #endif
7595 #endif 7595 #endif
7596 #ifdef CONFIG_SMP 7596 #ifdef CONFIG_SMP
7597 rt_rq->rt_nr_migratory = 0; 7597 rt_rq->rt_nr_migratory = 0;
7598 rt_rq->overloaded = 0; 7598 rt_rq->overloaded = 0;
7599 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); 7599 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
7600 #endif 7600 #endif
7601 7601
7602 rt_rq->rt_time = 0; 7602 rt_rq->rt_time = 0;
7603 rt_rq->rt_throttled = 0; 7603 rt_rq->rt_throttled = 0;
7604 rt_rq->rt_runtime = 0; 7604 rt_rq->rt_runtime = 0;
7605 raw_spin_lock_init(&rt_rq->rt_runtime_lock); 7605 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
7606 7606
7607 #ifdef CONFIG_RT_GROUP_SCHED 7607 #ifdef CONFIG_RT_GROUP_SCHED
7608 rt_rq->rt_nr_boosted = 0; 7608 rt_rq->rt_nr_boosted = 0;
7609 rt_rq->rq = rq; 7609 rt_rq->rq = rq;
7610 #endif 7610 #endif
7611 } 7611 }
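/*
 * Editor's illustration, userspace C rather than kernel code: the
 * "delimiter for bitsearch" trick used by init_rt_rq() above.  Setting one
 * extra bit just past the last valid priority guarantees that the search
 * for the first set bit terminates without a separate bounds check.
 * MAX_PRIO_DEMO stands in for MAX_RT_PRIO.
 */
#include <stdio.h>

#define MAX_PRIO_DEMO 8

int main(void)
{
	unsigned int bitmap = 1u << MAX_PRIO_DEMO;	/* the delimiter bit */
	int prio;

	bitmap |= 1u << 3;				/* one queued priority */

	for (prio = 0; !(bitmap & (1u << prio)); prio++)
		;	/* stops at bit 3; with an empty queue it stops at the delimiter */

	printf("highest runnable priority: %d\n", prio);
	return 0;
}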
7612 7612
7613 #ifdef CONFIG_FAIR_GROUP_SCHED 7613 #ifdef CONFIG_FAIR_GROUP_SCHED
7614 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 7614 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7615 struct sched_entity *se, int cpu, int add, 7615 struct sched_entity *se, int cpu, int add,
7616 struct sched_entity *parent) 7616 struct sched_entity *parent)
7617 { 7617 {
7618 struct rq *rq = cpu_rq(cpu); 7618 struct rq *rq = cpu_rq(cpu);
7619 tg->cfs_rq[cpu] = cfs_rq; 7619 tg->cfs_rq[cpu] = cfs_rq;
7620 init_cfs_rq(cfs_rq, rq); 7620 init_cfs_rq(cfs_rq, rq);
7621 cfs_rq->tg = tg; 7621 cfs_rq->tg = tg;
7622 if (add) 7622 if (add)
7623 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 7623 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7624 7624
7625 tg->se[cpu] = se; 7625 tg->se[cpu] = se;
7626 /* se could be NULL for init_task_group */ 7626 /* se could be NULL for init_task_group */
7627 if (!se) 7627 if (!se)
7628 return; 7628 return;
7629 7629
7630 if (!parent) 7630 if (!parent)
7631 se->cfs_rq = &rq->cfs; 7631 se->cfs_rq = &rq->cfs;
7632 else 7632 else
7633 se->cfs_rq = parent->my_q; 7633 se->cfs_rq = parent->my_q;
7634 7634
7635 se->my_q = cfs_rq; 7635 se->my_q = cfs_rq;
7636 se->load.weight = tg->shares; 7636 se->load.weight = tg->shares;
7637 se->load.inv_weight = 0; 7637 se->load.inv_weight = 0;
7638 se->parent = parent; 7638 se->parent = parent;
7639 } 7639 }
7640 #endif 7640 #endif
7641 7641
7642 #ifdef CONFIG_RT_GROUP_SCHED 7642 #ifdef CONFIG_RT_GROUP_SCHED
7643 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 7643 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7644 struct sched_rt_entity *rt_se, int cpu, int add, 7644 struct sched_rt_entity *rt_se, int cpu, int add,
7645 struct sched_rt_entity *parent) 7645 struct sched_rt_entity *parent)
7646 { 7646 {
7647 struct rq *rq = cpu_rq(cpu); 7647 struct rq *rq = cpu_rq(cpu);
7648 7648
7649 tg->rt_rq[cpu] = rt_rq; 7649 tg->rt_rq[cpu] = rt_rq;
7650 init_rt_rq(rt_rq, rq); 7650 init_rt_rq(rt_rq, rq);
7651 rt_rq->tg = tg; 7651 rt_rq->tg = tg;
7652 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7652 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7653 if (add) 7653 if (add)
7654 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7654 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7655 7655
7656 tg->rt_se[cpu] = rt_se; 7656 tg->rt_se[cpu] = rt_se;
7657 if (!rt_se) 7657 if (!rt_se)
7658 return; 7658 return;
7659 7659
7660 if (!parent) 7660 if (!parent)
7661 rt_se->rt_rq = &rq->rt; 7661 rt_se->rt_rq = &rq->rt;
7662 else 7662 else
7663 rt_se->rt_rq = parent->my_q; 7663 rt_se->rt_rq = parent->my_q;
7664 7664
7665 rt_se->my_q = rt_rq; 7665 rt_se->my_q = rt_rq;
7666 rt_se->parent = parent; 7666 rt_se->parent = parent;
7667 INIT_LIST_HEAD(&rt_se->run_list); 7667 INIT_LIST_HEAD(&rt_se->run_list);
7668 } 7668 }
7669 #endif 7669 #endif
7670 7670
7671 void __init sched_init(void) 7671 void __init sched_init(void)
7672 { 7672 {
7673 int i, j; 7673 int i, j;
7674 unsigned long alloc_size = 0, ptr; 7674 unsigned long alloc_size = 0, ptr;
7675 7675
7676 #ifdef CONFIG_FAIR_GROUP_SCHED 7676 #ifdef CONFIG_FAIR_GROUP_SCHED
7677 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7677 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7678 #endif 7678 #endif
7679 #ifdef CONFIG_RT_GROUP_SCHED 7679 #ifdef CONFIG_RT_GROUP_SCHED
7680 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7680 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7681 #endif 7681 #endif
7682 #ifdef CONFIG_CPUMASK_OFFSTACK 7682 #ifdef CONFIG_CPUMASK_OFFSTACK
7683 alloc_size += num_possible_cpus() * cpumask_size(); 7683 alloc_size += num_possible_cpus() * cpumask_size();
7684 #endif 7684 #endif
7685 if (alloc_size) { 7685 if (alloc_size) {
7686 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7686 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7687 7687
7688 #ifdef CONFIG_FAIR_GROUP_SCHED 7688 #ifdef CONFIG_FAIR_GROUP_SCHED
7689 init_task_group.se = (struct sched_entity **)ptr; 7689 init_task_group.se = (struct sched_entity **)ptr;
7690 ptr += nr_cpu_ids * sizeof(void **); 7690 ptr += nr_cpu_ids * sizeof(void **);
7691 7691
7692 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7692 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
7693 ptr += nr_cpu_ids * sizeof(void **); 7693 ptr += nr_cpu_ids * sizeof(void **);
7694 7694
7695 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7695 #endif /* CONFIG_FAIR_GROUP_SCHED */
7696 #ifdef CONFIG_RT_GROUP_SCHED 7696 #ifdef CONFIG_RT_GROUP_SCHED
7697 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7697 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
7698 ptr += nr_cpu_ids * sizeof(void **); 7698 ptr += nr_cpu_ids * sizeof(void **);
7699 7699
7700 init_task_group.rt_rq = (struct rt_rq **)ptr; 7700 init_task_group.rt_rq = (struct rt_rq **)ptr;
7701 ptr += nr_cpu_ids * sizeof(void **); 7701 ptr += nr_cpu_ids * sizeof(void **);
7702 7702
7703 #endif /* CONFIG_RT_GROUP_SCHED */ 7703 #endif /* CONFIG_RT_GROUP_SCHED */
7704 #ifdef CONFIG_CPUMASK_OFFSTACK 7704 #ifdef CONFIG_CPUMASK_OFFSTACK
7705 for_each_possible_cpu(i) { 7705 for_each_possible_cpu(i) {
7706 per_cpu(load_balance_tmpmask, i) = (void *)ptr; 7706 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
7707 ptr += cpumask_size(); 7707 ptr += cpumask_size();
7708 } 7708 }
7709 #endif /* CONFIG_CPUMASK_OFFSTACK */ 7709 #endif /* CONFIG_CPUMASK_OFFSTACK */
7710 } 7710 }
7711 7711
7712 #ifdef CONFIG_SMP 7712 #ifdef CONFIG_SMP
7713 init_defrootdomain(); 7713 init_defrootdomain();
7714 #endif 7714 #endif
7715 7715
7716 init_rt_bandwidth(&def_rt_bandwidth, 7716 init_rt_bandwidth(&def_rt_bandwidth,
7717 global_rt_period(), global_rt_runtime()); 7717 global_rt_period(), global_rt_runtime());
7718 7718
7719 #ifdef CONFIG_RT_GROUP_SCHED 7719 #ifdef CONFIG_RT_GROUP_SCHED
7720 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7720 init_rt_bandwidth(&init_task_group.rt_bandwidth,
7721 global_rt_period(), global_rt_runtime()); 7721 global_rt_period(), global_rt_runtime());
7722 #endif /* CONFIG_RT_GROUP_SCHED */ 7722 #endif /* CONFIG_RT_GROUP_SCHED */
7723 7723
7724 #ifdef CONFIG_CGROUP_SCHED 7724 #ifdef CONFIG_CGROUP_SCHED
7725 list_add(&init_task_group.list, &task_groups); 7725 list_add(&init_task_group.list, &task_groups);
7726 INIT_LIST_HEAD(&init_task_group.children); 7726 INIT_LIST_HEAD(&init_task_group.children);
7727 7727
7728 #endif /* CONFIG_CGROUP_SCHED */ 7728 #endif /* CONFIG_CGROUP_SCHED */
7729 7729
7730 #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7730 #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7731 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7731 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
7732 __alignof__(unsigned long)); 7732 __alignof__(unsigned long));
7733 #endif 7733 #endif
7734 for_each_possible_cpu(i) { 7734 for_each_possible_cpu(i) {
7735 struct rq *rq; 7735 struct rq *rq;
7736 7736
7737 rq = cpu_rq(i); 7737 rq = cpu_rq(i);
7738 raw_spin_lock_init(&rq->lock); 7738 raw_spin_lock_init(&rq->lock);
7739 rq->nr_running = 0; 7739 rq->nr_running = 0;
7740 rq->calc_load_active = 0; 7740 rq->calc_load_active = 0;
7741 rq->calc_load_update = jiffies + LOAD_FREQ; 7741 rq->calc_load_update = jiffies + LOAD_FREQ;
7742 init_cfs_rq(&rq->cfs, rq); 7742 init_cfs_rq(&rq->cfs, rq);
7743 init_rt_rq(&rq->rt, rq); 7743 init_rt_rq(&rq->rt, rq);
7744 #ifdef CONFIG_FAIR_GROUP_SCHED 7744 #ifdef CONFIG_FAIR_GROUP_SCHED
7745 init_task_group.shares = init_task_group_load; 7745 init_task_group.shares = init_task_group_load;
7746 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7746 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7747 #ifdef CONFIG_CGROUP_SCHED 7747 #ifdef CONFIG_CGROUP_SCHED
7748 /* 7748 /*
7749 * How much cpu bandwidth does init_task_group get? 7749 * How much cpu bandwidth does init_task_group get?
7750 * 7750 *
7751 * In case of task-groups formed through the cgroup filesystem, it 7751 * In case of task-groups formed through the cgroup filesystem, it
7752 * gets 100% of the cpu resources in the system. This overall 7752 * gets 100% of the cpu resources in the system. This overall
7753 * system cpu resource is divided among the tasks of 7753 * system cpu resource is divided among the tasks of
7754 * init_task_group and its child task-groups in a fair manner, 7754 * init_task_group and its child task-groups in a fair manner,
7755 * based on each entity's (task or task-group's) weight 7755 * based on each entity's (task or task-group's) weight
7756 * (se->load.weight). 7756 * (se->load.weight).
7757 * 7757 *
7758 * In other words, if init_task_group has 10 tasks of weight 7758 * In other words, if init_task_group has 10 tasks of weight
7759 * 1024 and two child groups A0 and A1 (of weight 1024 each), 7759 * 1024 and two child groups A0 and A1 (of weight 1024 each),
7760 * then A0's share of the cpu resource is: 7760 * then A0's share of the cpu resource is:
7761 * 7761 *
7762 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 7762 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
7763 * 7763 *
7764 * We achieve this by letting init_task_group's tasks sit 7764 * We achieve this by letting init_task_group's tasks sit
7765 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7765 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
7766 */ 7766 */
7767 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7767 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
7768 #endif 7768 #endif
7769 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7769 #endif /* CONFIG_FAIR_GROUP_SCHED */
7770 7770
7771 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7771 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7772 #ifdef CONFIG_RT_GROUP_SCHED 7772 #ifdef CONFIG_RT_GROUP_SCHED
7773 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7773 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7774 #ifdef CONFIG_CGROUP_SCHED 7774 #ifdef CONFIG_CGROUP_SCHED
7775 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7775 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
7776 #endif 7776 #endif
7777 #endif 7777 #endif
7778 7778
7779 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7779 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7780 rq->cpu_load[j] = 0; 7780 rq->cpu_load[j] = 0;
7781 #ifdef CONFIG_SMP 7781 #ifdef CONFIG_SMP
7782 rq->sd = NULL; 7782 rq->sd = NULL;
7783 rq->rd = NULL; 7783 rq->rd = NULL;
7784 rq->post_schedule = 0; 7784 rq->post_schedule = 0;
7785 rq->active_balance = 0; 7785 rq->active_balance = 0;
7786 rq->next_balance = jiffies; 7786 rq->next_balance = jiffies;
7787 rq->push_cpu = 0; 7787 rq->push_cpu = 0;
7788 rq->cpu = i; 7788 rq->cpu = i;
7789 rq->online = 0; 7789 rq->online = 0;
7790 rq->migration_thread = NULL; 7790 rq->migration_thread = NULL;
7791 rq->idle_stamp = 0; 7791 rq->idle_stamp = 0;
7792 rq->avg_idle = 2*sysctl_sched_migration_cost; 7792 rq->avg_idle = 2*sysctl_sched_migration_cost;
7793 INIT_LIST_HEAD(&rq->migration_queue); 7793 INIT_LIST_HEAD(&rq->migration_queue);
7794 rq_attach_root(rq, &def_root_domain); 7794 rq_attach_root(rq, &def_root_domain);
7795 #endif 7795 #endif
7796 init_rq_hrtick(rq); 7796 init_rq_hrtick(rq);
7797 atomic_set(&rq->nr_iowait, 0); 7797 atomic_set(&rq->nr_iowait, 0);
7798 } 7798 }
7799 7799
7800 set_load_weight(&init_task); 7800 set_load_weight(&init_task);
7801 7801
7802 #ifdef CONFIG_PREEMPT_NOTIFIERS 7802 #ifdef CONFIG_PREEMPT_NOTIFIERS
7803 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 7803 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
7804 #endif 7804 #endif
7805 7805
7806 #ifdef CONFIG_SMP 7806 #ifdef CONFIG_SMP
7807 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 7807 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
7808 #endif 7808 #endif
7809 7809
7810 #ifdef CONFIG_RT_MUTEXES 7810 #ifdef CONFIG_RT_MUTEXES
7811 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); 7811 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
7812 #endif 7812 #endif
7813 7813
7814 /* 7814 /*
7815 * The boot idle thread does lazy MMU switching as well: 7815 * The boot idle thread does lazy MMU switching as well:
7816 */ 7816 */
7817 atomic_inc(&init_mm.mm_count); 7817 atomic_inc(&init_mm.mm_count);
7818 enter_lazy_tlb(&init_mm, current); 7818 enter_lazy_tlb(&init_mm, current);
7819 7819
7820 /* 7820 /*
7821 * Make us the idle thread. Technically, schedule() should not be 7821 * Make us the idle thread. Technically, schedule() should not be
7822 * called from this thread, however somewhere below it might be, 7822 * called from this thread, however somewhere below it might be,
7823 * but because we are the idle thread, we just pick up running again 7823 * but because we are the idle thread, we just pick up running again
7824 * when this runqueue becomes "idle". 7824 * when this runqueue becomes "idle".
7825 */ 7825 */
7826 init_idle(current, smp_processor_id()); 7826 init_idle(current, smp_processor_id());
7827 7827
7828 calc_load_update = jiffies + LOAD_FREQ; 7828 calc_load_update = jiffies + LOAD_FREQ;
7829 7829
7830 /* 7830 /*
7831 * During early bootup we pretend to be a normal task: 7831 * During early bootup we pretend to be a normal task:
7832 */ 7832 */
7833 current->sched_class = &fair_sched_class; 7833 current->sched_class = &fair_sched_class;
7834 7834
7835 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 7835 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
7836 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7836 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
7837 #ifdef CONFIG_SMP 7837 #ifdef CONFIG_SMP
7838 #ifdef CONFIG_NO_HZ 7838 #ifdef CONFIG_NO_HZ
7839 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 7839 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
7840 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 7840 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
7841 #endif 7841 #endif
7842 /* May be allocated at isolcpus cmdline parse time */ 7842 /* May be allocated at isolcpus cmdline parse time */
7843 if (cpu_isolated_map == NULL) 7843 if (cpu_isolated_map == NULL)
7844 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7844 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7845 #endif /* SMP */ 7845 #endif /* SMP */
7846 7846
7847 perf_event_init(); 7847 perf_event_init();
7848 7848
7849 scheduler_running = 1; 7849 scheduler_running = 1;
7850 } 7850 }
7851 7851
7852 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 7852 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
7853 static inline int preempt_count_equals(int preempt_offset) 7853 static inline int preempt_count_equals(int preempt_offset)
7854 { 7854 {
7855 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 7855 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
7856 7856
7857 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7857 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
7858 } 7858 }
7859 7859
7860 void __might_sleep(const char *file, int line, int preempt_offset) 7860 void __might_sleep(const char *file, int line, int preempt_offset)
7861 { 7861 {
7862 #ifdef in_atomic 7862 #ifdef in_atomic
7863 static unsigned long prev_jiffy; /* ratelimiting */ 7863 static unsigned long prev_jiffy; /* ratelimiting */
7864 7864
7865 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 7865 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
7866 system_state != SYSTEM_RUNNING || oops_in_progress) 7866 system_state != SYSTEM_RUNNING || oops_in_progress)
7867 return; 7867 return;
7868 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 7868 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7869 return; 7869 return;
7870 prev_jiffy = jiffies; 7870 prev_jiffy = jiffies;
7871 7871
7872 printk(KERN_ERR 7872 printk(KERN_ERR
7873 "BUG: sleeping function called from invalid context at %s:%d\n", 7873 "BUG: sleeping function called from invalid context at %s:%d\n",
7874 file, line); 7874 file, line);
7875 printk(KERN_ERR 7875 printk(KERN_ERR
7876 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 7876 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7877 in_atomic(), irqs_disabled(), 7877 in_atomic(), irqs_disabled(),
7878 current->pid, current->comm); 7878 current->pid, current->comm);
7879 7879
7880 debug_show_held_locks(current); 7880 debug_show_held_locks(current);
7881 if (irqs_disabled()) 7881 if (irqs_disabled())
7882 print_irqtrace_events(current); 7882 print_irqtrace_events(current);
7883 dump_stack(); 7883 dump_stack();
7884 #endif 7884 #endif
7885 } 7885 }
7886 EXPORT_SYMBOL(__might_sleep); 7886 EXPORT_SYMBOL(__might_sleep);
7887 #endif 7887 #endif
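/*
 * Editor's sketch, not part of the kernel source: how a function that may
 * block advertises that fact.  The might_sleep() annotation funnels into
 * __might_sleep() above when sleep-in-atomic debugging is enabled, so a
 * caller holding a spinlock is reported even on runs where the allocation
 * happens not to sleep.  The function name is invented for the example.
 */
static void *example_alloc_buffer(size_t len)
{
	might_sleep();				/* debug annotation only */
	return kmalloc(len, GFP_KERNEL);	/* GFP_KERNEL may block */
}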
7888 7888
7889 #ifdef CONFIG_MAGIC_SYSRQ 7889 #ifdef CONFIG_MAGIC_SYSRQ
7890 static void normalize_task(struct rq *rq, struct task_struct *p) 7890 static void normalize_task(struct rq *rq, struct task_struct *p)
7891 { 7891 {
7892 int on_rq; 7892 int on_rq;
7893 7893
7894 update_rq_clock(rq); 7894 update_rq_clock(rq);
7895 on_rq = p->se.on_rq; 7895 on_rq = p->se.on_rq;
7896 if (on_rq) 7896 if (on_rq)
7897 deactivate_task(rq, p, 0); 7897 deactivate_task(rq, p, 0);
7898 __setscheduler(rq, p, SCHED_NORMAL, 0); 7898 __setscheduler(rq, p, SCHED_NORMAL, 0);
7899 if (on_rq) { 7899 if (on_rq) {
7900 activate_task(rq, p, 0); 7900 activate_task(rq, p, 0);
7901 resched_task(rq->curr); 7901 resched_task(rq->curr);
7902 } 7902 }
7903 } 7903 }
7904 7904
7905 void normalize_rt_tasks(void) 7905 void normalize_rt_tasks(void)
7906 { 7906 {
7907 struct task_struct *g, *p; 7907 struct task_struct *g, *p;
7908 unsigned long flags; 7908 unsigned long flags;
7909 struct rq *rq; 7909 struct rq *rq;
7910 7910
7911 read_lock_irqsave(&tasklist_lock, flags); 7911 read_lock_irqsave(&tasklist_lock, flags);
7912 do_each_thread(g, p) { 7912 do_each_thread(g, p) {
7913 /* 7913 /*
7914 * Only normalize user tasks: 7914 * Only normalize user tasks:
7915 */ 7915 */
7916 if (!p->mm) 7916 if (!p->mm)
7917 continue; 7917 continue;
7918 7918
7919 p->se.exec_start = 0; 7919 p->se.exec_start = 0;
7920 #ifdef CONFIG_SCHEDSTATS 7920 #ifdef CONFIG_SCHEDSTATS
7921 p->se.wait_start = 0; 7921 p->se.wait_start = 0;
7922 p->se.sleep_start = 0; 7922 p->se.sleep_start = 0;
7923 p->se.block_start = 0; 7923 p->se.block_start = 0;
7924 #endif 7924 #endif
7925 7925
7926 if (!rt_task(p)) { 7926 if (!rt_task(p)) {
7927 /* 7927 /*
7928 * Renice negative nice level userspace 7928 * Renice negative nice level userspace
7929 * tasks back to 0: 7929 * tasks back to 0:
7930 */ 7930 */
7931 if (TASK_NICE(p) < 0 && p->mm) 7931 if (TASK_NICE(p) < 0 && p->mm)
7932 set_user_nice(p, 0); 7932 set_user_nice(p, 0);
7933 continue; 7933 continue;
7934 } 7934 }
7935 7935
7936 raw_spin_lock(&p->pi_lock); 7936 raw_spin_lock(&p->pi_lock);
7937 rq = __task_rq_lock(p); 7937 rq = __task_rq_lock(p);
7938 7938
7939 normalize_task(rq, p); 7939 normalize_task(rq, p);
7940 7940
7941 __task_rq_unlock(rq); 7941 __task_rq_unlock(rq);
7942 raw_spin_unlock(&p->pi_lock); 7942 raw_spin_unlock(&p->pi_lock);
7943 } while_each_thread(g, p); 7943 } while_each_thread(g, p);
7944 7944
7945 read_unlock_irqrestore(&tasklist_lock, flags); 7945 read_unlock_irqrestore(&tasklist_lock, flags);
7946 } 7946 }
7947 7947
7948 #endif /* CONFIG_MAGIC_SYSRQ */ 7948 #endif /* CONFIG_MAGIC_SYSRQ */
7949 7949
7950 #ifdef CONFIG_IA64 7950 #ifdef CONFIG_IA64
7951 /* 7951 /*
7952 * These functions are only useful for the IA64 MCA handling. 7952 * These functions are only useful for the IA64 MCA handling.
7953 * 7953 *
7954 * They can only be called when the whole system has been 7954 * They can only be called when the whole system has been
7955 * stopped - every CPU needs to be quiescent, and no scheduling 7955 * stopped - every CPU needs to be quiescent, and no scheduling
7956 * activity can take place. Using them for anything else would 7956 * activity can take place. Using them for anything else would
7957 * be a serious bug, and as a result, they aren't even visible 7957 * be a serious bug, and as a result, they aren't even visible
7958 * under any other configuration. 7958 * under any other configuration.
7959 */ 7959 */
7960 7960
7961 /** 7961 /**
7962 * curr_task - return the current task for a given cpu. 7962 * curr_task - return the current task for a given cpu.
7963 * @cpu: the processor in question. 7963 * @cpu: the processor in question.
7964 * 7964 *
7965 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7965 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7966 */ 7966 */
7967 struct task_struct *curr_task(int cpu) 7967 struct task_struct *curr_task(int cpu)
7968 { 7968 {
7969 return cpu_curr(cpu); 7969 return cpu_curr(cpu);
7970 } 7970 }
7971 7971
7972 /** 7972 /**
7973 * set_curr_task - set the current task for a given cpu. 7973 * set_curr_task - set the current task for a given cpu.
7974 * @cpu: the processor in question. 7974 * @cpu: the processor in question.
7975 * @p: the task pointer to set. 7975 * @p: the task pointer to set.
7976 * 7976 *
7977 * Description: This function must only be used when non-maskable interrupts 7977 * Description: This function must only be used when non-maskable interrupts
7978 * are serviced on a separate stack. It allows the architecture to switch the 7978 * are serviced on a separate stack. It allows the architecture to switch the
7979 * notion of the current task on a cpu in a non-blocking manner. This function 7979 * notion of the current task on a cpu in a non-blocking manner. This function
7980 * must be called with all CPUs synchronized and interrupts disabled; the 7980 * must be called with all CPUs synchronized and interrupts disabled; the
7981 * caller must save the original value of the current task (see 7981 * caller must save the original value of the current task (see
7982 * curr_task() above) and restore that value before reenabling interrupts and 7982 * curr_task() above) and restore that value before reenabling interrupts and
7983 * re-starting the system. 7983 * re-starting the system.
7984 * 7984 *
7985 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7985 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7986 */ 7986 */
7987 void set_curr_task(int cpu, struct task_struct *p) 7987 void set_curr_task(int cpu, struct task_struct *p)
7988 { 7988 {
7989 cpu_curr(cpu) = p; 7989 cpu_curr(cpu) = p;
7990 } 7990 }
7991 7991
7992 #endif 7992 #endif
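/*
 * Editor's sketch, not part of the kernel source: the save/switch/restore
 * pattern that the comments above require from an IA64 MCA handler.  Only
 * curr_task() and set_curr_task() come from this file (and only under
 * CONFIG_IA64); do_mca_recovery() is a hypothetical stand-in, and the whole
 * system must be stopped while this runs.
 */
static void example_mca_switch(int cpu, struct task_struct *mca_task)
{
	struct task_struct *saved = curr_task(cpu);	/* save the original task */

	set_curr_task(cpu, mca_task);			/* switch to the MCA task */
	do_mca_recovery();				/* hypothetical recovery work */
	set_curr_task(cpu, saved);			/* restore before re-enabling interrupts */
}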
7993 7993
7994 #ifdef CONFIG_FAIR_GROUP_SCHED 7994 #ifdef CONFIG_FAIR_GROUP_SCHED
7995 static void free_fair_sched_group(struct task_group *tg) 7995 static void free_fair_sched_group(struct task_group *tg)
7996 { 7996 {
7997 int i; 7997 int i;
7998 7998
7999 for_each_possible_cpu(i) { 7999 for_each_possible_cpu(i) {
8000 if (tg->cfs_rq) 8000 if (tg->cfs_rq)
8001 kfree(tg->cfs_rq[i]); 8001 kfree(tg->cfs_rq[i]);
8002 if (tg->se) 8002 if (tg->se)
8003 kfree(tg->se[i]); 8003 kfree(tg->se[i]);
8004 } 8004 }
8005 8005
8006 kfree(tg->cfs_rq); 8006 kfree(tg->cfs_rq);
8007 kfree(tg->se); 8007 kfree(tg->se);
8008 } 8008 }
8009 8009
8010 static 8010 static
8011 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8011 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8012 { 8012 {
8013 struct cfs_rq *cfs_rq; 8013 struct cfs_rq *cfs_rq;
8014 struct sched_entity *se; 8014 struct sched_entity *se;
8015 struct rq *rq; 8015 struct rq *rq;
8016 int i; 8016 int i;
8017 8017
8018 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8018 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8019 if (!tg->cfs_rq) 8019 if (!tg->cfs_rq)
8020 goto err; 8020 goto err;
8021 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); 8021 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8022 if (!tg->se) 8022 if (!tg->se)
8023 goto err; 8023 goto err;
8024 8024
8025 tg->shares = NICE_0_LOAD; 8025 tg->shares = NICE_0_LOAD;
8026 8026
8027 for_each_possible_cpu(i) { 8027 for_each_possible_cpu(i) {
8028 rq = cpu_rq(i); 8028 rq = cpu_rq(i);
8029 8029
8030 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8030 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8031 GFP_KERNEL, cpu_to_node(i)); 8031 GFP_KERNEL, cpu_to_node(i));
8032 if (!cfs_rq) 8032 if (!cfs_rq)
8033 goto err; 8033 goto err;
8034 8034
8035 se = kzalloc_node(sizeof(struct sched_entity), 8035 se = kzalloc_node(sizeof(struct sched_entity),
8036 GFP_KERNEL, cpu_to_node(i)); 8036 GFP_KERNEL, cpu_to_node(i));
8037 if (!se) 8037 if (!se)
8038 goto err_free_rq; 8038 goto err_free_rq;
8039 8039
8040 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8040 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
8041 } 8041 }
8042 8042
8043 return 1; 8043 return 1;
8044 8044
8045 err_free_rq: 8045 err_free_rq:
8046 kfree(cfs_rq); 8046 kfree(cfs_rq);
8047 err: 8047 err:
8048 return 0; 8048 return 0;
8049 } 8049 }
8050 8050
8051 static inline void register_fair_sched_group(struct task_group *tg, int cpu) 8051 static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8052 { 8052 {
8053 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, 8053 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8054 &cpu_rq(cpu)->leaf_cfs_rq_list); 8054 &cpu_rq(cpu)->leaf_cfs_rq_list);
8055 } 8055 }
8056 8056
8057 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8057 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8058 { 8058 {
8059 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8059 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
8060 } 8060 }
8061 #else /* !CONFIG_FAIR_GROUP_SCHED */ 8061 #else /* !CONFIG_FAIR_GROUP_SCHED */
8062 static inline void free_fair_sched_group(struct task_group *tg) 8062 static inline void free_fair_sched_group(struct task_group *tg)
8063 { 8063 {
8064 } 8064 }
8065 8065
8066 static inline 8066 static inline
8067 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8067 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8068 { 8068 {
8069 return 1; 8069 return 1;
8070 } 8070 }
8071 8071
8072 static inline void register_fair_sched_group(struct task_group *tg, int cpu) 8072 static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8073 { 8073 {
8074 } 8074 }
8075 8075
8076 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8076 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8077 { 8077 {
8078 } 8078 }
8079 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8079 #endif /* CONFIG_FAIR_GROUP_SCHED */
8080 8080
8081 #ifdef CONFIG_RT_GROUP_SCHED 8081 #ifdef CONFIG_RT_GROUP_SCHED
8082 static void free_rt_sched_group(struct task_group *tg) 8082 static void free_rt_sched_group(struct task_group *tg)
8083 { 8083 {
8084 int i; 8084 int i;
8085 8085
8086 destroy_rt_bandwidth(&tg->rt_bandwidth); 8086 destroy_rt_bandwidth(&tg->rt_bandwidth);
8087 8087
8088 for_each_possible_cpu(i) { 8088 for_each_possible_cpu(i) {
8089 if (tg->rt_rq) 8089 if (tg->rt_rq)
8090 kfree(tg->rt_rq[i]); 8090 kfree(tg->rt_rq[i]);
8091 if (tg->rt_se) 8091 if (tg->rt_se)
8092 kfree(tg->rt_se[i]); 8092 kfree(tg->rt_se[i]);
8093 } 8093 }
8094 8094
8095 kfree(tg->rt_rq); 8095 kfree(tg->rt_rq);
8096 kfree(tg->rt_se); 8096 kfree(tg->rt_se);
8097 } 8097 }
8098 8098
8099 static 8099 static
8100 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8100 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8101 { 8101 {
8102 struct rt_rq *rt_rq; 8102 struct rt_rq *rt_rq;
8103 struct sched_rt_entity *rt_se; 8103 struct sched_rt_entity *rt_se;
8104 struct rq *rq; 8104 struct rq *rq;
8105 int i; 8105 int i;
8106 8106
8107 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8107 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8108 if (!tg->rt_rq) 8108 if (!tg->rt_rq)
8109 goto err; 8109 goto err;
8110 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); 8110 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8111 if (!tg->rt_se) 8111 if (!tg->rt_se)
8112 goto err; 8112 goto err;
8113 8113
8114 init_rt_bandwidth(&tg->rt_bandwidth, 8114 init_rt_bandwidth(&tg->rt_bandwidth,
8115 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8115 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8116 8116
8117 for_each_possible_cpu(i) { 8117 for_each_possible_cpu(i) {
8118 rq = cpu_rq(i); 8118 rq = cpu_rq(i);
8119 8119
8120 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8120 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8121 GFP_KERNEL, cpu_to_node(i)); 8121 GFP_KERNEL, cpu_to_node(i));
8122 if (!rt_rq) 8122 if (!rt_rq)
8123 goto err; 8123 goto err;
8124 8124
8125 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 8125 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8126 GFP_KERNEL, cpu_to_node(i)); 8126 GFP_KERNEL, cpu_to_node(i));
8127 if (!rt_se) 8127 if (!rt_se)
8128 goto err_free_rq; 8128 goto err_free_rq;
8129 8129
8130 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8130 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
8131 } 8131 }
8132 8132
8133 return 1; 8133 return 1;
8134 8134
8135 err_free_rq: 8135 err_free_rq:
8136 kfree(rt_rq); 8136 kfree(rt_rq);
8137 err: 8137 err:
8138 return 0; 8138 return 0;
8139 } 8139 }
8140 8140
8141 static inline void register_rt_sched_group(struct task_group *tg, int cpu) 8141 static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8142 { 8142 {
8143 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, 8143 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8144 &cpu_rq(cpu)->leaf_rt_rq_list); 8144 &cpu_rq(cpu)->leaf_rt_rq_list);
8145 } 8145 }
8146 8146
8147 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8147 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8148 { 8148 {
8149 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); 8149 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8150 } 8150 }
8151 #else /* !CONFIG_RT_GROUP_SCHED */ 8151 #else /* !CONFIG_RT_GROUP_SCHED */
8152 static inline void free_rt_sched_group(struct task_group *tg) 8152 static inline void free_rt_sched_group(struct task_group *tg)
8153 { 8153 {
8154 } 8154 }
8155 8155
8156 static inline 8156 static inline
8157 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8157 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8158 { 8158 {
8159 return 1; 8159 return 1;
8160 } 8160 }
8161 8161
8162 static inline void register_rt_sched_group(struct task_group *tg, int cpu) 8162 static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8163 { 8163 {
8164 } 8164 }
8165 8165
8166 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8166 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8167 { 8167 {
8168 } 8168 }
8169 #endif /* CONFIG_RT_GROUP_SCHED */ 8169 #endif /* CONFIG_RT_GROUP_SCHED */
8170 8170
8171 #ifdef CONFIG_CGROUP_SCHED 8171 #ifdef CONFIG_CGROUP_SCHED
8172 static void free_sched_group(struct task_group *tg) 8172 static void free_sched_group(struct task_group *tg)
8173 { 8173 {
8174 free_fair_sched_group(tg); 8174 free_fair_sched_group(tg);
8175 free_rt_sched_group(tg); 8175 free_rt_sched_group(tg);
8176 kfree(tg); 8176 kfree(tg);
8177 } 8177 }
8178 8178
8179 /* allocate runqueue etc for a new task group */ 8179 /* allocate runqueue etc for a new task group */
8180 struct task_group *sched_create_group(struct task_group *parent) 8180 struct task_group *sched_create_group(struct task_group *parent)
8181 { 8181 {
8182 struct task_group *tg; 8182 struct task_group *tg;
8183 unsigned long flags; 8183 unsigned long flags;
8184 int i; 8184 int i;
8185 8185
8186 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8186 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8187 if (!tg) 8187 if (!tg)
8188 return ERR_PTR(-ENOMEM); 8188 return ERR_PTR(-ENOMEM);
8189 8189
8190 if (!alloc_fair_sched_group(tg, parent)) 8190 if (!alloc_fair_sched_group(tg, parent))
8191 goto err; 8191 goto err;
8192 8192
8193 if (!alloc_rt_sched_group(tg, parent)) 8193 if (!alloc_rt_sched_group(tg, parent))
8194 goto err; 8194 goto err;
8195 8195
8196 spin_lock_irqsave(&task_group_lock, flags); 8196 spin_lock_irqsave(&task_group_lock, flags);
8197 for_each_possible_cpu(i) { 8197 for_each_possible_cpu(i) {
8198 register_fair_sched_group(tg, i); 8198 register_fair_sched_group(tg, i);
8199 register_rt_sched_group(tg, i); 8199 register_rt_sched_group(tg, i);
8200 } 8200 }
8201 list_add_rcu(&tg->list, &task_groups); 8201 list_add_rcu(&tg->list, &task_groups);
8202 8202
8203 WARN_ON(!parent); /* root should already exist */ 8203 WARN_ON(!parent); /* root should already exist */
8204 8204
8205 tg->parent = parent; 8205 tg->parent = parent;
8206 INIT_LIST_HEAD(&tg->children); 8206 INIT_LIST_HEAD(&tg->children);
8207 list_add_rcu(&tg->siblings, &parent->children); 8207 list_add_rcu(&tg->siblings, &parent->children);
8208 spin_unlock_irqrestore(&task_group_lock, flags); 8208 spin_unlock_irqrestore(&task_group_lock, flags);
8209 8209
8210 return tg; 8210 return tg;
8211 8211
8212 err: 8212 err:
8213 free_sched_group(tg); 8213 free_sched_group(tg);
8214 return ERR_PTR(-ENOMEM); 8214 return ERR_PTR(-ENOMEM);
8215 } 8215 }
8216 8216
8217 /* rcu callback to free various structures associated with a task group */ 8217 /* rcu callback to free various structures associated with a task group */
8218 static void free_sched_group_rcu(struct rcu_head *rhp) 8218 static void free_sched_group_rcu(struct rcu_head *rhp)
8219 { 8219 {
8220 /* now it should be safe to free those cfs_rqs */ 8220 /* now it should be safe to free those cfs_rqs */
8221 free_sched_group(container_of(rhp, struct task_group, rcu)); 8221 free_sched_group(container_of(rhp, struct task_group, rcu));
8222 } 8222 }
8223 8223
8224 /* Destroy runqueue etc associated with a task group */ 8224 /* Destroy runqueue etc associated with a task group */
8225 void sched_destroy_group(struct task_group *tg) 8225 void sched_destroy_group(struct task_group *tg)
8226 { 8226 {
8227 unsigned long flags; 8227 unsigned long flags;
8228 int i; 8228 int i;
8229 8229
8230 spin_lock_irqsave(&task_group_lock, flags); 8230 spin_lock_irqsave(&task_group_lock, flags);
8231 for_each_possible_cpu(i) { 8231 for_each_possible_cpu(i) {
8232 unregister_fair_sched_group(tg, i); 8232 unregister_fair_sched_group(tg, i);
8233 unregister_rt_sched_group(tg, i); 8233 unregister_rt_sched_group(tg, i);
8234 } 8234 }
8235 list_del_rcu(&tg->list); 8235 list_del_rcu(&tg->list);
8236 list_del_rcu(&tg->siblings); 8236 list_del_rcu(&tg->siblings);
8237 spin_unlock_irqrestore(&task_group_lock, flags); 8237 spin_unlock_irqrestore(&task_group_lock, flags);
8238 8238
8239 /* wait for possible concurrent references to cfs_rqs to complete */ 8239 /* wait for possible concurrent references to cfs_rqs to complete */
8240 call_rcu(&tg->rcu, free_sched_group_rcu); 8240 call_rcu(&tg->rcu, free_sched_group_rcu);
8241 } 8241 }
8242 8242
8243 /* change task's runqueue when it moves between groups. 8243 /* change task's runqueue when it moves between groups.
8244 * The caller of this function should have put the task in its new group 8244 * The caller of this function should have put the task in its new group
8245 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 8245 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
8246 * reflect its new group. 8246 * reflect its new group.
8247 */ 8247 */
8248 void sched_move_task(struct task_struct *tsk) 8248 void sched_move_task(struct task_struct *tsk)
8249 { 8249 {
8250 int on_rq, running; 8250 int on_rq, running;
8251 unsigned long flags; 8251 unsigned long flags;
8252 struct rq *rq; 8252 struct rq *rq;
8253 8253
8254 rq = task_rq_lock(tsk, &flags); 8254 rq = task_rq_lock(tsk, &flags);
8255 8255
8256 update_rq_clock(rq); 8256 update_rq_clock(rq);
8257 8257
8258 running = task_current(rq, tsk); 8258 running = task_current(rq, tsk);
8259 on_rq = tsk->se.on_rq; 8259 on_rq = tsk->se.on_rq;
8260 8260
8261 if (on_rq) 8261 if (on_rq)
8262 dequeue_task(rq, tsk, 0); 8262 dequeue_task(rq, tsk, 0);
8263 if (unlikely(running)) 8263 if (unlikely(running))
8264 tsk->sched_class->put_prev_task(rq, tsk); 8264 tsk->sched_class->put_prev_task(rq, tsk);
8265 8265
8266 set_task_rq(tsk, task_cpu(tsk)); 8266 set_task_rq(tsk, task_cpu(tsk));
8267 8267
8268 #ifdef CONFIG_FAIR_GROUP_SCHED 8268 #ifdef CONFIG_FAIR_GROUP_SCHED
8269 if (tsk->sched_class->moved_group) 8269 if (tsk->sched_class->moved_group)
8270 tsk->sched_class->moved_group(tsk, on_rq); 8270 tsk->sched_class->moved_group(tsk, on_rq);
8271 #endif 8271 #endif
8272 8272
8273 if (unlikely(running)) 8273 if (unlikely(running))
8274 tsk->sched_class->set_curr_task(rq); 8274 tsk->sched_class->set_curr_task(rq);
8275 if (on_rq) 8275 if (on_rq)
8276 enqueue_task(rq, tsk, 0, false); 8276 enqueue_task(rq, tsk, 0, false);
8277 8277
8278 task_rq_unlock(rq, &flags); 8278 task_rq_unlock(rq, &flags);
8279 } 8279 }
8280 #endif /* CONFIG_CGROUP_SCHED */ 8280 #endif /* CONFIG_CGROUP_SCHED */
8281 8281
8282 #ifdef CONFIG_FAIR_GROUP_SCHED 8282 #ifdef CONFIG_FAIR_GROUP_SCHED
8283 static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8283 static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8284 { 8284 {
8285 struct cfs_rq *cfs_rq = se->cfs_rq; 8285 struct cfs_rq *cfs_rq = se->cfs_rq;
8286 int on_rq; 8286 int on_rq;
8287 8287
8288 on_rq = se->on_rq; 8288 on_rq = se->on_rq;
8289 if (on_rq) 8289 if (on_rq)
8290 dequeue_entity(cfs_rq, se, 0); 8290 dequeue_entity(cfs_rq, se, 0);
8291 8291
8292 se->load.weight = shares; 8292 se->load.weight = shares;
8293 se->load.inv_weight = 0; 8293 se->load.inv_weight = 0;
8294 8294
8295 if (on_rq) 8295 if (on_rq)
8296 enqueue_entity(cfs_rq, se, 0); 8296 enqueue_entity(cfs_rq, se, 0);
8297 } 8297 }
8298 8298
8299 static void set_se_shares(struct sched_entity *se, unsigned long shares) 8299 static void set_se_shares(struct sched_entity *se, unsigned long shares)
8300 { 8300 {
8301 struct cfs_rq *cfs_rq = se->cfs_rq; 8301 struct cfs_rq *cfs_rq = se->cfs_rq;
8302 struct rq *rq = cfs_rq->rq; 8302 struct rq *rq = cfs_rq->rq;
8303 unsigned long flags; 8303 unsigned long flags;
8304 8304
8305 raw_spin_lock_irqsave(&rq->lock, flags); 8305 raw_spin_lock_irqsave(&rq->lock, flags);
8306 __set_se_shares(se, shares); 8306 __set_se_shares(se, shares);
8307 raw_spin_unlock_irqrestore(&rq->lock, flags); 8307 raw_spin_unlock_irqrestore(&rq->lock, flags);
8308 } 8308 }
8309 8309
8310 static DEFINE_MUTEX(shares_mutex); 8310 static DEFINE_MUTEX(shares_mutex);
8311 8311
8312 int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8312 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8313 { 8313 {
8314 int i; 8314 int i;
8315 unsigned long flags; 8315 unsigned long flags;
8316 8316
8317 /* 8317 /*
8318 * We can't change the weight of the root cgroup. 8318 * We can't change the weight of the root cgroup.
8319 */ 8319 */
8320 if (!tg->se[0]) 8320 if (!tg->se[0])
8321 return -EINVAL; 8321 return -EINVAL;
8322 8322
8323 if (shares < MIN_SHARES) 8323 if (shares < MIN_SHARES)
8324 shares = MIN_SHARES; 8324 shares = MIN_SHARES;
8325 else if (shares > MAX_SHARES) 8325 else if (shares > MAX_SHARES)
8326 shares = MAX_SHARES; 8326 shares = MAX_SHARES;
8327 8327
8328 mutex_lock(&shares_mutex); 8328 mutex_lock(&shares_mutex);
8329 if (tg->shares == shares) 8329 if (tg->shares == shares)
8330 goto done; 8330 goto done;
8331 8331
8332 spin_lock_irqsave(&task_group_lock, flags); 8332 spin_lock_irqsave(&task_group_lock, flags);
8333 for_each_possible_cpu(i) 8333 for_each_possible_cpu(i)
8334 unregister_fair_sched_group(tg, i); 8334 unregister_fair_sched_group(tg, i);
8335 list_del_rcu(&tg->siblings); 8335 list_del_rcu(&tg->siblings);
8336 spin_unlock_irqrestore(&task_group_lock, flags); 8336 spin_unlock_irqrestore(&task_group_lock, flags);
8337 8337
8338 /* wait for any ongoing reference to this group to finish */ 8338 /* wait for any ongoing reference to this group to finish */
8339 synchronize_sched(); 8339 synchronize_sched();
8340 8340
8341 /* 8341 /*
8342 * Now we are free to modify the group's share on each cpu 8342 * Now we are free to modify the group's share on each cpu
8343 * w/o tripping rebalance_share or load_balance_fair. 8343 * w/o tripping rebalance_share or load_balance_fair.
8344 */ 8344 */
8345 tg->shares = shares; 8345 tg->shares = shares;
8346 for_each_possible_cpu(i) { 8346 for_each_possible_cpu(i) {
8347 /* 8347 /*
8348 * force a rebalance 8348 * force a rebalance
8349 */ 8349 */
8350 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8350 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8351 set_se_shares(tg->se[i], shares); 8351 set_se_shares(tg->se[i], shares);
8352 } 8352 }
8353 8353
8354 /* 8354 /*
8355 * Enable load balance activity on this group, by inserting it back on 8355 * Enable load balance activity on this group, by inserting it back on
8356 * each cpu's rq->leaf_cfs_rq_list. 8356 * each cpu's rq->leaf_cfs_rq_list.
8357 */ 8357 */
8358 spin_lock_irqsave(&task_group_lock, flags); 8358 spin_lock_irqsave(&task_group_lock, flags);
8359 for_each_possible_cpu(i) 8359 for_each_possible_cpu(i)
8360 register_fair_sched_group(tg, i); 8360 register_fair_sched_group(tg, i);
8361 list_add_rcu(&tg->siblings, &tg->parent->children); 8361 list_add_rcu(&tg->siblings, &tg->parent->children);
8362 spin_unlock_irqrestore(&task_group_lock, flags); 8362 spin_unlock_irqrestore(&task_group_lock, flags);
8363 done: 8363 done:
8364 mutex_unlock(&shares_mutex); 8364 mutex_unlock(&shares_mutex);
8365 return 0; 8365 return 0;
8366 } 8366 }
8367 8367
8368 unsigned long sched_group_shares(struct task_group *tg) 8368 unsigned long sched_group_shares(struct task_group *tg)
8369 { 8369 {
8370 return tg->shares; 8370 return tg->shares;
8371 } 8371 }
8372 #endif 8372 #endif
8373 8373
8374 #ifdef CONFIG_RT_GROUP_SCHED 8374 #ifdef CONFIG_RT_GROUP_SCHED
8375 /* 8375 /*
8376 * Ensure that the real time constraints are schedulable. 8376 * Ensure that the real time constraints are schedulable.
8377 */ 8377 */
8378 static DEFINE_MUTEX(rt_constraints_mutex); 8378 static DEFINE_MUTEX(rt_constraints_mutex);
8379 8379
8380 static unsigned long to_ratio(u64 period, u64 runtime) 8380 static unsigned long to_ratio(u64 period, u64 runtime)
8381 { 8381 {
8382 if (runtime == RUNTIME_INF) 8382 if (runtime == RUNTIME_INF)
8383 return 1ULL << 20; 8383 return 1ULL << 20;
8384 8384
8385 return div64_u64(runtime << 20, period); 8385 return div64_u64(runtime << 20, period);
8386 } 8386 }
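
/*
 * Editor's note -- illustrative sketch, not part of kernel/sched.c:
 * to_ratio() expresses runtime/period as a 20-bit fixed-point fraction,
 * so full utilisation maps to 1 << 20 (1048576).  With the default
 * global limits of a 1s period and 950ms of runtime:
 *
 *	to_ratio(1000000000, 950000000)
 *		= (950000000 << 20) / 1000000000
 *		= 996147		(roughly 0.95 * 2^20)
 *
 * RUNTIME_INF short-circuits to exactly 1 << 20, i.e. "may consume the
 * whole period".
 */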
8387 8387
8388 /* Must be called with tasklist_lock held */ 8388 /* Must be called with tasklist_lock held */
8389 static inline int tg_has_rt_tasks(struct task_group *tg) 8389 static inline int tg_has_rt_tasks(struct task_group *tg)
8390 { 8390 {
8391 struct task_struct *g, *p; 8391 struct task_struct *g, *p;
8392 8392
8393 do_each_thread(g, p) { 8393 do_each_thread(g, p) {
8394 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 8394 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8395 return 1; 8395 return 1;
8396 } while_each_thread(g, p); 8396 } while_each_thread(g, p);
8397 8397
8398 return 0; 8398 return 0;
8399 } 8399 }
8400 8400
8401 struct rt_schedulable_data { 8401 struct rt_schedulable_data {
8402 struct task_group *tg; 8402 struct task_group *tg;
8403 u64 rt_period; 8403 u64 rt_period;
8404 u64 rt_runtime; 8404 u64 rt_runtime;
8405 }; 8405 };
8406 8406
8407 static int tg_schedulable(struct task_group *tg, void *data) 8407 static int tg_schedulable(struct task_group *tg, void *data)
8408 { 8408 {
8409 struct rt_schedulable_data *d = data; 8409 struct rt_schedulable_data *d = data;
8410 struct task_group *child; 8410 struct task_group *child;
8411 unsigned long total, sum = 0; 8411 unsigned long total, sum = 0;
8412 u64 period, runtime; 8412 u64 period, runtime;
8413 8413
8414 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 8414 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8415 runtime = tg->rt_bandwidth.rt_runtime; 8415 runtime = tg->rt_bandwidth.rt_runtime;
8416 8416
8417 if (tg == d->tg) { 8417 if (tg == d->tg) {
8418 period = d->rt_period; 8418 period = d->rt_period;
8419 runtime = d->rt_runtime; 8419 runtime = d->rt_runtime;
8420 } 8420 }
8421 8421
8422 /* 8422 /*
8423 * Cannot have more runtime than the period. 8423 * Cannot have more runtime than the period.
8424 */ 8424 */
8425 if (runtime > period && runtime != RUNTIME_INF) 8425 if (runtime > period && runtime != RUNTIME_INF)
8426 return -EINVAL; 8426 return -EINVAL;
8427 8427
8428 /* 8428 /*
8429 * Ensure we don't starve existing RT tasks. 8429 * Ensure we don't starve existing RT tasks.
8430 */ 8430 */
8431 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 8431 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8432 return -EBUSY; 8432 return -EBUSY;
8433 8433
8434 total = to_ratio(period, runtime); 8434 total = to_ratio(period, runtime);
8435 8435
8436 /* 8436 /*
8437 * Nobody can have more than the global setting allows. 8437 * Nobody can have more than the global setting allows.
8438 */ 8438 */
8439 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 8439 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8440 return -EINVAL; 8440 return -EINVAL;
8441 8441
8442 /* 8442 /*
8443 * The sum of our children's runtime should not exceed our own. 8443 * The sum of our children's runtime should not exceed our own.
8444 */ 8444 */
8445 list_for_each_entry_rcu(child, &tg->children, siblings) { 8445 list_for_each_entry_rcu(child, &tg->children, siblings) {
8446 period = ktime_to_ns(child->rt_bandwidth.rt_period); 8446 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8447 runtime = child->rt_bandwidth.rt_runtime; 8447 runtime = child->rt_bandwidth.rt_runtime;
8448 8448
8449 if (child == d->tg) { 8449 if (child == d->tg) {
8450 period = d->rt_period; 8450 period = d->rt_period;
8451 runtime = d->rt_runtime; 8451 runtime = d->rt_runtime;
8452 } 8452 }
8453 8453
8454 sum += to_ratio(period, runtime); 8454 sum += to_ratio(period, runtime);
8455 } 8455 }
8456 8456
8457 if (sum > total) 8457 if (sum > total)
8458 return -EINVAL; 8458 return -EINVAL;
8459 8459
8460 return 0; 8460 return 0;
8461 } 8461 }
8462 8462
8463 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8463 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8464 { 8464 {
8465 struct rt_schedulable_data data = { 8465 struct rt_schedulable_data data = {
8466 .tg = tg, 8466 .tg = tg,
8467 .rt_period = period, 8467 .rt_period = period,
8468 .rt_runtime = runtime, 8468 .rt_runtime = runtime,
8469 }; 8469 };
8470 8470
8471 return walk_tg_tree(tg_schedulable, tg_nop, &data); 8471 return walk_tg_tree(tg_schedulable, tg_nop, &data);
8472 } 8472 }
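
/*
 * Editor's note -- illustrative sketch, not part of kernel/sched.c: the
 * walk above enforces that no group's children together claim more
 * bandwidth than the group itself, and that nobody exceeds the global
 * limit.  For example, with a 1s period everywhere, a parent limited to
 * 500ms of RT runtime cannot host two children asking for 300ms each:
 * sum = 2 * to_ratio(1s, 300ms) ~= 0.6 * 2^20 exceeds the parent's
 * total of ~0.5 * 2^20, so tg_schedulable() returns -EINVAL and the
 * requested change is rejected by __rt_schedulable().
 */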
8473 8473
8474 static int tg_set_bandwidth(struct task_group *tg, 8474 static int tg_set_bandwidth(struct task_group *tg,
8475 u64 rt_period, u64 rt_runtime) 8475 u64 rt_period, u64 rt_runtime)
8476 { 8476 {
8477 int i, err = 0; 8477 int i, err = 0;
8478 8478
8479 mutex_lock(&rt_constraints_mutex); 8479 mutex_lock(&rt_constraints_mutex);
8480 read_lock(&tasklist_lock); 8480 read_lock(&tasklist_lock);
8481 err = __rt_schedulable(tg, rt_period, rt_runtime); 8481 err = __rt_schedulable(tg, rt_period, rt_runtime);
8482 if (err) 8482 if (err)
8483 goto unlock; 8483 goto unlock;
8484 8484
8485 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8485 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8486 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8486 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
8487 tg->rt_bandwidth.rt_runtime = rt_runtime; 8487 tg->rt_bandwidth.rt_runtime = rt_runtime;
8488 8488
8489 for_each_possible_cpu(i) { 8489 for_each_possible_cpu(i) {
8490 struct rt_rq *rt_rq = tg->rt_rq[i]; 8490 struct rt_rq *rt_rq = tg->rt_rq[i];
8491 8491
8492 raw_spin_lock(&rt_rq->rt_runtime_lock); 8492 raw_spin_lock(&rt_rq->rt_runtime_lock);
8493 rt_rq->rt_runtime = rt_runtime; 8493 rt_rq->rt_runtime = rt_runtime;
8494 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8494 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8495 } 8495 }
8496 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8496 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8497 unlock: 8497 unlock:
8498 read_unlock(&tasklist_lock); 8498 read_unlock(&tasklist_lock);
8499 mutex_unlock(&rt_constraints_mutex); 8499 mutex_unlock(&rt_constraints_mutex);
8500 8500
8501 return err; 8501 return err;
8502 } 8502 }
8503 8503
8504 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 8504 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8505 { 8505 {
8506 u64 rt_runtime, rt_period; 8506 u64 rt_runtime, rt_period;
8507 8507
8508 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 8508 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8509 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 8509 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8510 if (rt_runtime_us < 0) 8510 if (rt_runtime_us < 0)
8511 rt_runtime = RUNTIME_INF; 8511 rt_runtime = RUNTIME_INF;
8512 8512
8513 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8513 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8514 } 8514 }
8515 8515
8516 long sched_group_rt_runtime(struct task_group *tg) 8516 long sched_group_rt_runtime(struct task_group *tg)
8517 { 8517 {
8518 u64 rt_runtime_us; 8518 u64 rt_runtime_us;
8519 8519
8520 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 8520 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
8521 return -1; 8521 return -1;
8522 8522
8523 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 8523 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
8524 do_div(rt_runtime_us, NSEC_PER_USEC); 8524 do_div(rt_runtime_us, NSEC_PER_USEC);
8525 return rt_runtime_us; 8525 return rt_runtime_us;
8526 } 8526 }
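
/*
 * Editor's note: a minimal in-kernel sketch of how the two helpers above
 * round-trip, assuming a valid task_group *tg; the setter's error return
 * is checked but otherwise ignored.  Illustrative only, not part of
 * kernel/sched.c.
 */
static void __maybe_unused rt_runtime_roundtrip_example(struct task_group *tg)
{
	/* 950000us of RT runtime per period, stored internally in ns */
	if (!sched_group_set_rt_runtime(tg, 950000))
		WARN_ON(sched_group_rt_runtime(tg) != 950000);

	/* any negative value is treated as "unlimited" (RUNTIME_INF) */
	if (!sched_group_set_rt_runtime(tg, -1))
		WARN_ON(sched_group_rt_runtime(tg) != -1);
}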
8527 8527
8528 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 8528 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8529 { 8529 {
8530 u64 rt_runtime, rt_period; 8530 u64 rt_runtime, rt_period;
8531 8531
8532 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 8532 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8533 rt_runtime = tg->rt_bandwidth.rt_runtime; 8533 rt_runtime = tg->rt_bandwidth.rt_runtime;
8534 8534
8535 if (rt_period == 0) 8535 if (rt_period == 0)
8536 return -EINVAL; 8536 return -EINVAL;
8537 8537
8538 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8538 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8539 } 8539 }
8540 8540
8541 long sched_group_rt_period(struct task_group *tg) 8541 long sched_group_rt_period(struct task_group *tg)
8542 { 8542 {
8543 u64 rt_period_us; 8543 u64 rt_period_us;
8544 8544
8545 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 8545 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
8546 do_div(rt_period_us, NSEC_PER_USEC); 8546 do_div(rt_period_us, NSEC_PER_USEC);
8547 return rt_period_us; 8547 return rt_period_us;
8548 } 8548 }
8549 8549
8550 static int sched_rt_global_constraints(void) 8550 static int sched_rt_global_constraints(void)
8551 { 8551 {
8552 u64 runtime, period; 8552 u64 runtime, period;
8553 int ret = 0; 8553 int ret = 0;
8554 8554
8555 if (sysctl_sched_rt_period <= 0) 8555 if (sysctl_sched_rt_period <= 0)
8556 return -EINVAL; 8556 return -EINVAL;
8557 8557
8558 runtime = global_rt_runtime(); 8558 runtime = global_rt_runtime();
8559 period = global_rt_period(); 8559 period = global_rt_period();
8560 8560
8561 /* 8561 /*
8562 * Sanity check on the sysctl variables. 8562 * Sanity check on the sysctl variables.
8563 */ 8563 */
8564 if (runtime > period && runtime != RUNTIME_INF) 8564 if (runtime > period && runtime != RUNTIME_INF)
8565 return -EINVAL; 8565 return -EINVAL;
8566 8566
8567 mutex_lock(&rt_constraints_mutex); 8567 mutex_lock(&rt_constraints_mutex);
8568 read_lock(&tasklist_lock); 8568 read_lock(&tasklist_lock);
8569 ret = __rt_schedulable(NULL, 0, 0); 8569 ret = __rt_schedulable(NULL, 0, 0);
8570 read_unlock(&tasklist_lock); 8570 read_unlock(&tasklist_lock);
8571 mutex_unlock(&rt_constraints_mutex); 8571 mutex_unlock(&rt_constraints_mutex);
8572 8572
8573 return ret; 8573 return ret;
8574 } 8574 }
8575 8575
8576 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 8576 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
8577 { 8577 {
8578 /* Don't accept realtime tasks when there is no way for them to run */ 8578 /* Don't accept realtime tasks when there is no way for them to run */
8579 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 8579 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
8580 return 0; 8580 return 0;
8581 8581
8582 return 1; 8582 return 1;
8583 } 8583 }
8584 8584
8585 #else /* !CONFIG_RT_GROUP_SCHED */ 8585 #else /* !CONFIG_RT_GROUP_SCHED */
8586 static int sched_rt_global_constraints(void) 8586 static int sched_rt_global_constraints(void)
8587 { 8587 {
8588 unsigned long flags; 8588 unsigned long flags;
8589 int i; 8589 int i;
8590 8590
8591 if (sysctl_sched_rt_period <= 0) 8591 if (sysctl_sched_rt_period <= 0)
8592 return -EINVAL; 8592 return -EINVAL;
8593 8593
8594 /* 8594 /*
8595 * There are always some RT tasks in the root group 8595 * There are always some RT tasks in the root group
8596 * -- migration, kstopmachine etc. 8596 * -- migration, kstopmachine etc.
8597 */ 8597 */
8598 if (sysctl_sched_rt_runtime == 0) 8598 if (sysctl_sched_rt_runtime == 0)
8599 return -EBUSY; 8599 return -EBUSY;
8600 8600
8601 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 8601 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8602 for_each_possible_cpu(i) { 8602 for_each_possible_cpu(i) {
8603 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 8603 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
8604 8604
8605 raw_spin_lock(&rt_rq->rt_runtime_lock); 8605 raw_spin_lock(&rt_rq->rt_runtime_lock);
8606 rt_rq->rt_runtime = global_rt_runtime(); 8606 rt_rq->rt_runtime = global_rt_runtime();
8607 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8607 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8608 } 8608 }
8609 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 8609 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
8610 8610
8611 return 0; 8611 return 0;
8612 } 8612 }
8613 #endif /* CONFIG_RT_GROUP_SCHED */ 8613 #endif /* CONFIG_RT_GROUP_SCHED */
8614 8614
8615 int sched_rt_handler(struct ctl_table *table, int write, 8615 int sched_rt_handler(struct ctl_table *table, int write,
8616 void __user *buffer, size_t *lenp, 8616 void __user *buffer, size_t *lenp,
8617 loff_t *ppos) 8617 loff_t *ppos)
8618 { 8618 {
8619 int ret; 8619 int ret;
8620 int old_period, old_runtime; 8620 int old_period, old_runtime;
8621 static DEFINE_MUTEX(mutex); 8621 static DEFINE_MUTEX(mutex);
8622 8622
8623 mutex_lock(&mutex); 8623 mutex_lock(&mutex);
8624 old_period = sysctl_sched_rt_period; 8624 old_period = sysctl_sched_rt_period;
8625 old_runtime = sysctl_sched_rt_runtime; 8625 old_runtime = sysctl_sched_rt_runtime;
8626 8626
8627 ret = proc_dointvec(table, write, buffer, lenp, ppos); 8627 ret = proc_dointvec(table, write, buffer, lenp, ppos);
8628 8628
8629 if (!ret && write) { 8629 if (!ret && write) {
8630 ret = sched_rt_global_constraints(); 8630 ret = sched_rt_global_constraints();
8631 if (ret) { 8631 if (ret) {
8632 sysctl_sched_rt_period = old_period; 8632 sysctl_sched_rt_period = old_period;
8633 sysctl_sched_rt_runtime = old_runtime; 8633 sysctl_sched_rt_runtime = old_runtime;
8634 } else { 8634 } else {
8635 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 8635 def_rt_bandwidth.rt_runtime = global_rt_runtime();
8636 def_rt_bandwidth.rt_period = 8636 def_rt_bandwidth.rt_period =
8637 ns_to_ktime(global_rt_period()); 8637 ns_to_ktime(global_rt_period());
8638 } 8638 }
8639 } 8639 }
8640 mutex_unlock(&mutex); 8640 mutex_unlock(&mutex);
8641 8641
8642 return ret; 8642 return ret;
8643 } 8643 }
8644 8644
8645 #ifdef CONFIG_CGROUP_SCHED 8645 #ifdef CONFIG_CGROUP_SCHED
8646 8646
8647 /* return corresponding task_group object of a cgroup */ 8647 /* return corresponding task_group object of a cgroup */
8648 static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 8648 static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
8649 { 8649 {
8650 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 8650 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
8651 struct task_group, css); 8651 struct task_group, css);
8652 } 8652 }
8653 8653
8654 static struct cgroup_subsys_state * 8654 static struct cgroup_subsys_state *
8655 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) 8655 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8656 { 8656 {
8657 struct task_group *tg, *parent; 8657 struct task_group *tg, *parent;
8658 8658
8659 if (!cgrp->parent) { 8659 if (!cgrp->parent) {
8660 /* This is early initialization for the top cgroup */ 8660 /* This is early initialization for the top cgroup */
8661 return &init_task_group.css; 8661 return &init_task_group.css;
8662 } 8662 }
8663 8663
8664 parent = cgroup_tg(cgrp->parent); 8664 parent = cgroup_tg(cgrp->parent);
8665 tg = sched_create_group(parent); 8665 tg = sched_create_group(parent);
8666 if (IS_ERR(tg)) 8666 if (IS_ERR(tg))
8667 return ERR_PTR(-ENOMEM); 8667 return ERR_PTR(-ENOMEM);
8668 8668
8669 return &tg->css; 8669 return &tg->css;
8670 } 8670 }
8671 8671
8672 static void 8672 static void
8673 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 8673 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
8674 { 8674 {
8675 struct task_group *tg = cgroup_tg(cgrp); 8675 struct task_group *tg = cgroup_tg(cgrp);
8676 8676
8677 sched_destroy_group(tg); 8677 sched_destroy_group(tg);
8678 } 8678 }
8679 8679
8680 static int 8680 static int
8681 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 8681 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8682 { 8682 {
8683 #ifdef CONFIG_RT_GROUP_SCHED 8683 #ifdef CONFIG_RT_GROUP_SCHED
8684 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 8684 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
8685 return -EINVAL; 8685 return -EINVAL;
8686 #else 8686 #else
8687 /* We don't support RT-tasks being in separate groups */ 8687 /* We don't support RT-tasks being in separate groups */
8688 if (tsk->sched_class != &fair_sched_class) 8688 if (tsk->sched_class != &fair_sched_class)
8689 return -EINVAL; 8689 return -EINVAL;
8690 #endif 8690 #endif
8691 return 0; 8691 return 0;
8692 } 8692 }
8693 8693
8694 static int 8694 static int
8695 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 8695 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8696 struct task_struct *tsk, bool threadgroup) 8696 struct task_struct *tsk, bool threadgroup)
8697 { 8697 {
8698 int retval = cpu_cgroup_can_attach_task(cgrp, tsk); 8698 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
8699 if (retval) 8699 if (retval)
8700 return retval; 8700 return retval;
8701 if (threadgroup) { 8701 if (threadgroup) {
8702 struct task_struct *c; 8702 struct task_struct *c;
8703 rcu_read_lock(); 8703 rcu_read_lock();
8704 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 8704 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8705 retval = cpu_cgroup_can_attach_task(cgrp, c); 8705 retval = cpu_cgroup_can_attach_task(cgrp, c);
8706 if (retval) { 8706 if (retval) {
8707 rcu_read_unlock(); 8707 rcu_read_unlock();
8708 return retval; 8708 return retval;
8709 } 8709 }
8710 } 8710 }
8711 rcu_read_unlock(); 8711 rcu_read_unlock();
8712 } 8712 }
8713 return 0; 8713 return 0;
8714 } 8714 }
8715 8715
8716 static void 8716 static void
8717 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 8717 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8718 struct cgroup *old_cont, struct task_struct *tsk, 8718 struct cgroup *old_cont, struct task_struct *tsk,
8719 bool threadgroup) 8719 bool threadgroup)
8720 { 8720 {
8721 sched_move_task(tsk); 8721 sched_move_task(tsk);
8722 if (threadgroup) { 8722 if (threadgroup) {
8723 struct task_struct *c; 8723 struct task_struct *c;
8724 rcu_read_lock(); 8724 rcu_read_lock();
8725 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 8725 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8726 sched_move_task(c); 8726 sched_move_task(c);
8727 } 8727 }
8728 rcu_read_unlock(); 8728 rcu_read_unlock();
8729 } 8729 }
8730 } 8730 }
8731 8731
8732 #ifdef CONFIG_FAIR_GROUP_SCHED 8732 #ifdef CONFIG_FAIR_GROUP_SCHED
8733 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 8733 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8734 u64 shareval) 8734 u64 shareval)
8735 { 8735 {
8736 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 8736 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
8737 } 8737 }
8738 8738
8739 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 8739 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8740 { 8740 {
8741 struct task_group *tg = cgroup_tg(cgrp); 8741 struct task_group *tg = cgroup_tg(cgrp);
8742 8742
8743 return (u64) tg->shares; 8743 return (u64) tg->shares;
8744 } 8744 }
8745 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8745 #endif /* CONFIG_FAIR_GROUP_SCHED */
8746 8746
8747 #ifdef CONFIG_RT_GROUP_SCHED 8747 #ifdef CONFIG_RT_GROUP_SCHED
8748 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 8748 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
8749 s64 val) 8749 s64 val)
8750 { 8750 {
8751 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 8751 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
8752 } 8752 }
8753 8753
8754 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 8754 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
8755 { 8755 {
8756 return sched_group_rt_runtime(cgroup_tg(cgrp)); 8756 return sched_group_rt_runtime(cgroup_tg(cgrp));
8757 } 8757 }
8758 8758
8759 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 8759 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
8760 u64 rt_period_us) 8760 u64 rt_period_us)
8761 { 8761 {
8762 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 8762 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
8763 } 8763 }
8764 8764
8765 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 8765 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
8766 { 8766 {
8767 return sched_group_rt_period(cgroup_tg(cgrp)); 8767 return sched_group_rt_period(cgroup_tg(cgrp));
8768 } 8768 }
8769 #endif /* CONFIG_RT_GROUP_SCHED */ 8769 #endif /* CONFIG_RT_GROUP_SCHED */
8770 8770
8771 static struct cftype cpu_files[] = { 8771 static struct cftype cpu_files[] = {
8772 #ifdef CONFIG_FAIR_GROUP_SCHED 8772 #ifdef CONFIG_FAIR_GROUP_SCHED
8773 { 8773 {
8774 .name = "shares", 8774 .name = "shares",
8775 .read_u64 = cpu_shares_read_u64, 8775 .read_u64 = cpu_shares_read_u64,
8776 .write_u64 = cpu_shares_write_u64, 8776 .write_u64 = cpu_shares_write_u64,
8777 }, 8777 },
8778 #endif 8778 #endif
8779 #ifdef CONFIG_RT_GROUP_SCHED 8779 #ifdef CONFIG_RT_GROUP_SCHED
8780 { 8780 {
8781 .name = "rt_runtime_us", 8781 .name = "rt_runtime_us",
8782 .read_s64 = cpu_rt_runtime_read, 8782 .read_s64 = cpu_rt_runtime_read,
8783 .write_s64 = cpu_rt_runtime_write, 8783 .write_s64 = cpu_rt_runtime_write,
8784 }, 8784 },
8785 { 8785 {
8786 .name = "rt_period_us", 8786 .name = "rt_period_us",
8787 .read_u64 = cpu_rt_period_read_uint, 8787 .read_u64 = cpu_rt_period_read_uint,
8788 .write_u64 = cpu_rt_period_write_uint, 8788 .write_u64 = cpu_rt_period_write_uint,
8789 }, 8789 },
8790 #endif 8790 #endif
8791 }; 8791 };
8792 8792
8793 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 8793 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
8794 { 8794 {
8795 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); 8795 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
8796 } 8796 }
8797 8797
8798 struct cgroup_subsys cpu_cgroup_subsys = { 8798 struct cgroup_subsys cpu_cgroup_subsys = {
8799 .name = "cpu", 8799 .name = "cpu",
8800 .create = cpu_cgroup_create, 8800 .create = cpu_cgroup_create,
8801 .destroy = cpu_cgroup_destroy, 8801 .destroy = cpu_cgroup_destroy,
8802 .can_attach = cpu_cgroup_can_attach, 8802 .can_attach = cpu_cgroup_can_attach,
8803 .attach = cpu_cgroup_attach, 8803 .attach = cpu_cgroup_attach,
8804 .populate = cpu_cgroup_populate, 8804 .populate = cpu_cgroup_populate,
8805 .subsys_id = cpu_cgroup_subsys_id, 8805 .subsys_id = cpu_cgroup_subsys_id,
8806 .early_init = 1, 8806 .early_init = 1,
8807 }; 8807 };
8808 8808
8809 #endif /* CONFIG_CGROUP_SCHED */ 8809 #endif /* CONFIG_CGROUP_SCHED */
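
/*
 * Editor's note: a minimal userspace sketch (not part of kernel/sched.c)
 * showing how the cftype entries registered above appear as per-cgroup
 * files.  It assumes the cpu controller is mounted at /cgroup/cpu and
 * that a child group "mygroup" has been created; the paths and values
 * are illustrative only.
 */
#include <stdio.h>

/* write a single value to a cgroup control file */
static int write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	/* double this group's CFS weight; the default is NICE_0_LOAD (1024) */
	write_knob("/cgroup/cpu/mygroup/cpu.shares", "2048");

	/* allow 950ms of RT runtime per 1s period (see tg_set_bandwidth()) */
	write_knob("/cgroup/cpu/mygroup/cpu.rt_period_us", "1000000");
	write_knob("/cgroup/cpu/mygroup/cpu.rt_runtime_us", "950000");

	return 0;
}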
8810 8810
8811 #ifdef CONFIG_CGROUP_CPUACCT 8811 #ifdef CONFIG_CGROUP_CPUACCT
8812 8812
8813 /* 8813 /*
8814 * CPU accounting code for task groups. 8814 * CPU accounting code for task groups.
8815 * 8815 *
8816 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 8816 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
8817 * (balbir@in.ibm.com). 8817 * (balbir@in.ibm.com).
8818 */ 8818 */
8819 8819
8820 /* track cpu usage of a group of tasks and its child groups */ 8820 /* track cpu usage of a group of tasks and its child groups */
8821 struct cpuacct { 8821 struct cpuacct {
8822 struct cgroup_subsys_state css; 8822 struct cgroup_subsys_state css;
8823 /* cpuusage holds pointer to a u64-type object on every cpu */ 8823 /* cpuusage holds pointer to a u64-type object on every cpu */
8824 u64 __percpu *cpuusage; 8824 u64 __percpu *cpuusage;
8825 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 8825 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
8826 struct cpuacct *parent; 8826 struct cpuacct *parent;
8827 }; 8827 };
8828 8828
8829 struct cgroup_subsys cpuacct_subsys; 8829 struct cgroup_subsys cpuacct_subsys;
8830 8830
8831 /* return cpu accounting group corresponding to this container */ 8831 /* return cpu accounting group corresponding to this container */
8832 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) 8832 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
8833 { 8833 {
8834 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), 8834 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
8835 struct cpuacct, css); 8835 struct cpuacct, css);
8836 } 8836 }
8837 8837
8838 /* return cpu accounting group to which this task belongs */ 8838 /* return cpu accounting group to which this task belongs */
8839 static inline struct cpuacct *task_ca(struct task_struct *tsk) 8839 static inline struct cpuacct *task_ca(struct task_struct *tsk)
8840 { 8840 {
8841 return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 8841 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
8842 struct cpuacct, css); 8842 struct cpuacct, css);
8843 } 8843 }
8844 8844
8845 /* create a new cpu accounting group */ 8845 /* create a new cpu accounting group */
8846 static struct cgroup_subsys_state *cpuacct_create( 8846 static struct cgroup_subsys_state *cpuacct_create(
8847 struct cgroup_subsys *ss, struct cgroup *cgrp) 8847 struct cgroup_subsys *ss, struct cgroup *cgrp)
8848 { 8848 {
8849 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 8849 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
8850 int i; 8850 int i;
8851 8851
8852 if (!ca) 8852 if (!ca)
8853 goto out; 8853 goto out;
8854 8854
8855 ca->cpuusage = alloc_percpu(u64); 8855 ca->cpuusage = alloc_percpu(u64);
8856 if (!ca->cpuusage) 8856 if (!ca->cpuusage)
8857 goto out_free_ca; 8857 goto out_free_ca;
8858 8858
8859 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 8859 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
8860 if (percpu_counter_init(&ca->cpustat[i], 0)) 8860 if (percpu_counter_init(&ca->cpustat[i], 0))
8861 goto out_free_counters; 8861 goto out_free_counters;
8862 8862
8863 if (cgrp->parent) 8863 if (cgrp->parent)
8864 ca->parent = cgroup_ca(cgrp->parent); 8864 ca->parent = cgroup_ca(cgrp->parent);
8865 8865
8866 return &ca->css; 8866 return &ca->css;
8867 8867
8868 out_free_counters: 8868 out_free_counters:
8869 while (--i >= 0) 8869 while (--i >= 0)
8870 percpu_counter_destroy(&ca->cpustat[i]); 8870 percpu_counter_destroy(&ca->cpustat[i]);
8871 free_percpu(ca->cpuusage); 8871 free_percpu(ca->cpuusage);
8872 out_free_ca: 8872 out_free_ca:
8873 kfree(ca); 8873 kfree(ca);
8874 out: 8874 out:
8875 return ERR_PTR(-ENOMEM); 8875 return ERR_PTR(-ENOMEM);
8876 } 8876 }
8877 8877
8878 /* destroy an existing cpu accounting group */ 8878 /* destroy an existing cpu accounting group */
8879 static void 8879 static void
8880 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 8880 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
8881 { 8881 {
8882 struct cpuacct *ca = cgroup_ca(cgrp); 8882 struct cpuacct *ca = cgroup_ca(cgrp);
8883 int i; 8883 int i;
8884 8884
8885 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 8885 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
8886 percpu_counter_destroy(&ca->cpustat[i]); 8886 percpu_counter_destroy(&ca->cpustat[i]);
8887 free_percpu(ca->cpuusage); 8887 free_percpu(ca->cpuusage);
8888 kfree(ca); 8888 kfree(ca);
8889 } 8889 }
8890 8890
8891 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 8891 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
8892 { 8892 {
8893 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8893 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8894 u64 data; 8894 u64 data;
8895 8895
8896 #ifndef CONFIG_64BIT 8896 #ifndef CONFIG_64BIT
8897 /* 8897 /*
8898 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 8898 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
8899 */ 8899 */
8900 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 8900 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8901 data = *cpuusage; 8901 data = *cpuusage;
8902 raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 8902 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8903 #else 8903 #else
8904 data = *cpuusage; 8904 data = *cpuusage;
8905 #endif 8905 #endif
8906 8906
8907 return data; 8907 return data;
8908 } 8908 }
8909 8909
8910 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 8910 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
8911 { 8911 {
8912 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8912 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8913 8913
8914 #ifndef CONFIG_64BIT 8914 #ifndef CONFIG_64BIT
8915 /* 8915 /*
8916 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 8916 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
8917 */ 8917 */
8918 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 8918 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8919 *cpuusage = val; 8919 *cpuusage = val;
8920 raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 8920 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8921 #else 8921 #else
8922 *cpuusage = val; 8922 *cpuusage = val;
8923 #endif 8923 #endif
8924 } 8924 }
8925 8925
8926 /* return total cpu usage (in nanoseconds) of a group */ 8926 /* return total cpu usage (in nanoseconds) of a group */
8927 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 8927 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8928 { 8928 {
8929 struct cpuacct *ca = cgroup_ca(cgrp); 8929 struct cpuacct *ca = cgroup_ca(cgrp);
8930 u64 totalcpuusage = 0; 8930 u64 totalcpuusage = 0;
8931 int i; 8931 int i;
8932 8932
8933 for_each_present_cpu(i) 8933 for_each_present_cpu(i)
8934 totalcpuusage += cpuacct_cpuusage_read(ca, i); 8934 totalcpuusage += cpuacct_cpuusage_read(ca, i);
8935 8935
8936 return totalcpuusage; 8936 return totalcpuusage;
8937 } 8937 }
8938 8938
8939 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 8939 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
8940 u64 reset) 8940 u64 reset)
8941 { 8941 {
8942 struct cpuacct *ca = cgroup_ca(cgrp); 8942 struct cpuacct *ca = cgroup_ca(cgrp);
8943 int err = 0; 8943 int err = 0;
8944 int i; 8944 int i;
8945 8945
8946 if (reset) { 8946 if (reset) {
8947 err = -EINVAL; 8947 err = -EINVAL;
8948 goto out; 8948 goto out;
8949 } 8949 }
8950 8950
8951 for_each_present_cpu(i) 8951 for_each_present_cpu(i)
8952 cpuacct_cpuusage_write(ca, i, 0); 8952 cpuacct_cpuusage_write(ca, i, 0);
8953 8953
8954 out: 8954 out:
8955 return err; 8955 return err;
8956 } 8956 }
8957 8957
8958 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 8958 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
8959 struct seq_file *m) 8959 struct seq_file *m)
8960 { 8960 {
8961 struct cpuacct *ca = cgroup_ca(cgroup); 8961 struct cpuacct *ca = cgroup_ca(cgroup);
8962 u64 percpu; 8962 u64 percpu;
8963 int i; 8963 int i;
8964 8964
8965 for_each_present_cpu(i) { 8965 for_each_present_cpu(i) {
8966 percpu = cpuacct_cpuusage_read(ca, i); 8966 percpu = cpuacct_cpuusage_read(ca, i);
8967 seq_printf(m, "%llu ", (unsigned long long) percpu); 8967 seq_printf(m, "%llu ", (unsigned long long) percpu);
8968 } 8968 }
8969 seq_printf(m, "\n"); 8969 seq_printf(m, "\n");
8970 return 0; 8970 return 0;
8971 } 8971 }
8972 8972
8973 static const char *cpuacct_stat_desc[] = { 8973 static const char *cpuacct_stat_desc[] = {
8974 [CPUACCT_STAT_USER] = "user", 8974 [CPUACCT_STAT_USER] = "user",
8975 [CPUACCT_STAT_SYSTEM] = "system", 8975 [CPUACCT_STAT_SYSTEM] = "system",
8976 }; 8976 };
8977 8977
8978 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 8978 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
8979 struct cgroup_map_cb *cb) 8979 struct cgroup_map_cb *cb)
8980 { 8980 {
8981 struct cpuacct *ca = cgroup_ca(cgrp); 8981 struct cpuacct *ca = cgroup_ca(cgrp);
8982 int i; 8982 int i;
8983 8983
8984 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { 8984 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
8985 s64 val = percpu_counter_read(&ca->cpustat[i]); 8985 s64 val = percpu_counter_read(&ca->cpustat[i]);
8986 val = cputime64_to_clock_t(val); 8986 val = cputime64_to_clock_t(val);
8987 cb->fill(cb, cpuacct_stat_desc[i], val); 8987 cb->fill(cb, cpuacct_stat_desc[i], val);
8988 } 8988 }
8989 return 0; 8989 return 0;
8990 } 8990 }
8991 8991
8992 static struct cftype files[] = { 8992 static struct cftype files[] = {
8993 { 8993 {
8994 .name = "usage", 8994 .name = "usage",
8995 .read_u64 = cpuusage_read, 8995 .read_u64 = cpuusage_read,
8996 .write_u64 = cpuusage_write, 8996 .write_u64 = cpuusage_write,
8997 }, 8997 },
8998 { 8998 {
8999 .name = "usage_percpu", 8999 .name = "usage_percpu",
9000 .read_seq_string = cpuacct_percpu_seq_read, 9000 .read_seq_string = cpuacct_percpu_seq_read,
9001 }, 9001 },
9002 { 9002 {
9003 .name = "stat", 9003 .name = "stat",
9004 .read_map = cpuacct_stats_show, 9004 .read_map = cpuacct_stats_show,
9005 }, 9005 },
9006 }; 9006 };
9007 9007
9008 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 9008 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9009 { 9009 {
9010 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); 9010 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
9011 } 9011 }
9012 9012
9013 /* 9013 /*
9014 * charge this task's execution time to its accounting group. 9014 * charge this task's execution time to its accounting group.
9015 * 9015 *
9016 * called with rq->lock held. 9016 * called with rq->lock held.
9017 */ 9017 */
9018 static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 9018 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9019 { 9019 {
9020 struct cpuacct *ca; 9020 struct cpuacct *ca;
9021 int cpu; 9021 int cpu;
9022 9022
9023 if (unlikely(!cpuacct_subsys.active)) 9023 if (unlikely(!cpuacct_subsys.active))
9024 return; 9024 return;
9025 9025
9026 cpu = task_cpu(tsk); 9026 cpu = task_cpu(tsk);
9027 9027
9028 rcu_read_lock(); 9028 rcu_read_lock();
9029 9029
9030 ca = task_ca(tsk); 9030 ca = task_ca(tsk);
9031 9031
9032 for (; ca; ca = ca->parent) { 9032 for (; ca; ca = ca->parent) {
9033 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 9033 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9034 *cpuusage += cputime; 9034 *cpuusage += cputime;
9035 } 9035 }
9036 9036
9037 rcu_read_unlock(); 9037 rcu_read_unlock();
9038 } 9038 }
9039 9039
9040 /* 9040 /*
9041 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large 9041 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9042 * in cputime_t units. As a result, cpuacct_update_stats calls 9042 * in cputime_t units. As a result, cpuacct_update_stats calls
9043 * percpu_counter_add with values large enough to always overflow the 9043 * percpu_counter_add with values large enough to always overflow the
9044 * per-cpu batch limit, causing bad SMP scalability. 9044 * per-cpu batch limit, causing bad SMP scalability.
9045 * 9045 *
9046 * To fix this, we scale percpu_counter_batch by cputime_one_jiffy so we 9046 * To fix this, we scale percpu_counter_batch by cputime_one_jiffy so we
9047 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled 9047 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9048 * and enabled. We cap it at INT_MAX, the largest allowed batch value. 9048 * and enabled. We cap it at INT_MAX, the largest allowed batch value.
9049 */ 9049 */
9050 #ifdef CONFIG_SMP 9050 #ifdef CONFIG_SMP
9051 #define CPUACCT_BATCH \ 9051 #define CPUACCT_BATCH \
9052 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) 9052 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9053 #else 9053 #else
9054 #define CPUACCT_BATCH 0 9054 #define CPUACCT_BATCH 0
9055 #endif 9055 #endif
9056 9056
9057 /* 9057 /*
9058 * Charge the system/user time to the task's accounting group. 9058 * Charge the system/user time to the task's accounting group.
9059 */ 9059 */
9060 static void cpuacct_update_stats(struct task_struct *tsk, 9060 static void cpuacct_update_stats(struct task_struct *tsk,
9061 enum cpuacct_stat_index idx, cputime_t val) 9061 enum cpuacct_stat_index idx, cputime_t val)
9062 { 9062 {
9063 struct cpuacct *ca; 9063 struct cpuacct *ca;
9064 int batch = CPUACCT_BATCH; 9064 int batch = CPUACCT_BATCH;
9065 9065
9066 if (unlikely(!cpuacct_subsys.active)) 9066 if (unlikely(!cpuacct_subsys.active))
9067 return; 9067 return;
9068 9068
9069 rcu_read_lock(); 9069 rcu_read_lock();
9070 ca = task_ca(tsk); 9070 ca = task_ca(tsk);
9071 9071
9072 do { 9072 do {
9073 __percpu_counter_add(&ca->cpustat[idx], val, batch); 9073 __percpu_counter_add(&ca->cpustat[idx], val, batch);
9074 ca = ca->parent; 9074 ca = ca->parent;
9075 } while (ca); 9075 } while (ca);
9076 rcu_read_unlock(); 9076 rcu_read_unlock();
9077 } 9077 }
9078 9078
9079 struct cgroup_subsys cpuacct_subsys = { 9079 struct cgroup_subsys cpuacct_subsys = {
9080 .name = "cpuacct", 9080 .name = "cpuacct",
9081 .create = cpuacct_create, 9081 .create = cpuacct_create,
9082 .destroy = cpuacct_destroy, 9082 .destroy = cpuacct_destroy,
9083 .populate = cpuacct_populate, 9083 .populate = cpuacct_populate,
9084 .subsys_id = cpuacct_subsys_id, 9084 .subsys_id = cpuacct_subsys_id,
9085 }; 9085 };
9086 #endif /* CONFIG_CGROUP_CPUACCT */ 9086 #endif /* CONFIG_CGROUP_CPUACCT */
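
/*
 * Editor's note: a small userspace sketch (not part of kernel/sched.c)
 * reading the files registered by cpuacct_populate() above, assuming the
 * cpuacct controller is mounted at /cgroup/cpuacct and a group "mygroup"
 * exists; the paths are illustrative only.
 */
#include <stdio.h>

static void dump_file(const char *path)
{
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f)
		return;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
}

int main(void)
{
	/* total accumulated CPU time of the group, in nanoseconds */
	dump_file("/cgroup/cpuacct/mygroup/cpuacct.usage");

	/* one space-separated nanosecond counter per present CPU */
	dump_file("/cgroup/cpuacct/mygroup/cpuacct.usage_percpu");

	/* "user <ticks>" and "system <ticks>" in clock_t units */
	dump_file("/cgroup/cpuacct/mygroup/cpuacct.stat");

	return 0;
}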
9087 9087
9088 #ifndef CONFIG_SMP 9088 #ifndef CONFIG_SMP
9089 9089
9090 int rcu_expedited_torture_stats(char *page) 9090 int rcu_expedited_torture_stats(char *page)
9091 { 9091 {
9092 return 0; 9092 return 0;
9093 } 9093 }
9094 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); 9094 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9095 9095
9096 void synchronize_sched_expedited(void) 9096 void synchronize_sched_expedited(void)
9097 { 9097 {
9098 } 9098 }
9099 EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 9099 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9100 9100
9101 #else /* #ifndef CONFIG_SMP */ 9101 #else /* #ifndef CONFIG_SMP */
9102 9102
9103 static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); 9103 static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
9104 static DEFINE_MUTEX(rcu_sched_expedited_mutex); 9104 static DEFINE_MUTEX(rcu_sched_expedited_mutex);
9105 9105
9106 #define RCU_EXPEDITED_STATE_POST -2 9106 #define RCU_EXPEDITED_STATE_POST -2
9107 #define RCU_EXPEDITED_STATE_IDLE -1 9107 #define RCU_EXPEDITED_STATE_IDLE -1
9108 9108
9109 static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 9109 static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9110 9110
9111 int rcu_expedited_torture_stats(char *page) 9111 int rcu_expedited_torture_stats(char *page)
9112 { 9112 {
9113 int cnt = 0; 9113 int cnt = 0;
9114 int cpu; 9114 int cpu;
9115 9115
9116 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); 9116 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
9117 for_each_online_cpu(cpu) { 9117 for_each_online_cpu(cpu) {
9118 cnt += sprintf(&page[cnt], " %d:%d", 9118 cnt += sprintf(&page[cnt], " %d:%d",
9119 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); 9119 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
9120 } 9120 }
9121 cnt += sprintf(&page[cnt], "\n"); 9121 cnt += sprintf(&page[cnt], "\n");
9122 return cnt; 9122 return cnt;
9123 } 9123 }
9124 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); 9124 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9125 9125
9126 static long synchronize_sched_expedited_count; 9126 static long synchronize_sched_expedited_count;
9127 9127
9128 /* 9128 /*
9129 * Wait for an rcu-sched grace period to elapse, but use "big hammer" 9129 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
9130 * approach to force grace period to end quickly. This consumes 9130 * approach to force grace period to end quickly. This consumes
9131 * significant time on all CPUs, and is thus not recommended for 9131 * significant time on all CPUs, and is thus not recommended for
9132 * any sort of common-case code. 9132 * any sort of common-case code.
9133 * 9133 *
9134 * Note that it is illegal to call this function while holding any 9134 * Note that it is illegal to call this function while holding any
9135 * lock that is acquired by a CPU-hotplug notifier. Failing to 9135 * lock that is acquired by a CPU-hotplug notifier. Failing to
9136 * observe this restriction will result in deadlock. 9136 * observe this restriction will result in deadlock.
9137 */ 9137 */
9138 void synchronize_sched_expedited(void) 9138 void synchronize_sched_expedited(void)
9139 { 9139 {
9140 int cpu; 9140 int cpu;
9141 unsigned long flags; 9141 unsigned long flags;
9142 bool need_full_sync = 0; 9142 bool need_full_sync = 0;
9143 struct rq *rq; 9143 struct rq *rq;
9144 struct migration_req *req; 9144 struct migration_req *req;
9145 long snap; 9145 long snap;
9146 int trycount = 0; 9146 int trycount = 0;
9147 9147
9148 smp_mb(); /* ensure prior mod happens before capturing snap. */ 9148 smp_mb(); /* ensure prior mod happens before capturing snap. */
9149 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; 9149 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
9150 get_online_cpus(); 9150 get_online_cpus();
9151 while (!mutex_trylock(&rcu_sched_expedited_mutex)) { 9151 while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
9152 put_online_cpus(); 9152 put_online_cpus();
9153 if (trycount++ < 10) 9153 if (trycount++ < 10)
9154 udelay(trycount * num_online_cpus()); 9154 udelay(trycount * num_online_cpus());
9155 else { 9155 else {
9156 synchronize_sched(); 9156 synchronize_sched();
9157 return; 9157 return;
9158 } 9158 }
9159 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { 9159 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
9160 smp_mb(); /* ensure test happens before caller kfree */ 9160 smp_mb(); /* ensure test happens before caller kfree */
9161 return; 9161 return;
9162 } 9162 }
9163 get_online_cpus(); 9163 get_online_cpus();
9164 } 9164 }
9165 rcu_expedited_state = RCU_EXPEDITED_STATE_POST; 9165 rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
9166 for_each_online_cpu(cpu) { 9166 for_each_online_cpu(cpu) {
9167 rq = cpu_rq(cpu); 9167 rq = cpu_rq(cpu);
9168 req = &per_cpu(rcu_migration_req, cpu); 9168 req = &per_cpu(rcu_migration_req, cpu);
9169 init_completion(&req->done); 9169 init_completion(&req->done);
9170 req->task = NULL; 9170 req->task = NULL;
9171 req->dest_cpu = RCU_MIGRATION_NEED_QS; 9171 req->dest_cpu = RCU_MIGRATION_NEED_QS;
9172 raw_spin_lock_irqsave(&rq->lock, flags); 9172 raw_spin_lock_irqsave(&rq->lock, flags);
9173 list_add(&req->list, &rq->migration_queue); 9173 list_add(&req->list, &rq->migration_queue);
9174 raw_spin_unlock_irqrestore(&rq->lock, flags); 9174 raw_spin_unlock_irqrestore(&rq->lock, flags);
9175 wake_up_process(rq->migration_thread); 9175 wake_up_process(rq->migration_thread);
9176 } 9176 }
9177 for_each_online_cpu(cpu) { 9177 for_each_online_cpu(cpu) {
9178 rcu_expedited_state = cpu; 9178 rcu_expedited_state = cpu;
9179 req = &per_cpu(rcu_migration_req, cpu); 9179 req = &per_cpu(rcu_migration_req, cpu);
9180 rq = cpu_rq(cpu); 9180 rq = cpu_rq(cpu);
9181 wait_for_completion(&req->done); 9181 wait_for_completion(&req->done);
9182 raw_spin_lock_irqsave(&rq->lock, flags); 9182 raw_spin_lock_irqsave(&rq->lock, flags);
9183 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) 9183 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
9184 need_full_sync = 1; 9184 need_full_sync = 1;
9185 req->dest_cpu = RCU_MIGRATION_IDLE; 9185 req->dest_cpu = RCU_MIGRATION_IDLE;
9186 raw_spin_unlock_irqrestore(&rq->lock, flags); 9186 raw_spin_unlock_irqrestore(&rq->lock, flags);
9187 } 9187 }
9188 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 9188 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9189 synchronize_sched_expedited_count++; 9189 synchronize_sched_expedited_count++;
9190 mutex_unlock(&rcu_sched_expedited_mutex); 9190 mutex_unlock(&rcu_sched_expedited_mutex);
9191 put_online_cpus(); 9191 put_online_cpus();
9192 if (need_full_sync) 9192 if (need_full_sync)
9193 synchronize_sched(); 9193 synchronize_sched();
9194 } 9194 }
9195 EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 9195 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9196 9196
9197 #endif /* #else #ifndef CONFIG_SMP */ 9197 #endif /* #else #ifndef CONFIG_SMP */
9198 9198
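The comment ahead of synchronize_sched_expedited() above states two constraints: the call disturbs every online CPU, and it must not be made while holding any lock that a CPU-hotplug notifier also acquires. A minimal caller sketch under those constraints might look like the following; struct foo, global_foo and replace_foo() are invented for illustration and are not part of this commit.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int val;
};

static struct foo *global_foo;	/* hypothetical; readers dereference it with preemption disabled */

/* Single updater assumed, so the old pointer can be read directly. */
static void replace_foo(struct foo *new)
{
	struct foo *old = global_foo;

	rcu_assign_pointer(global_foo, new);

	/*
	 * Expedited rcu-sched grace period: returns quickly, but queues
	 * work on every online CPU.  Per the restriction documented in
	 * the block comment above, no lock acquired by a CPU-hotplug
	 * notifier may be held at this point.
	 */
	synchronize_sched_expedited();

	kfree(old);	/* no rcu-sched reader can still hold a reference */
}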
kernel/sched_debug.c
1 /* 1 /*
 2 * kernel/sched_debug.c 2 * kernel/sched_debug.c
3 * 3 *
4 * Print the CFS rbtree 4 * Print the CFS rbtree
5 * 5 *
6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar 6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
7 * 7 *
8 * This program is free software; you can redistribute it and/or modify 8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation. 10 * published by the Free Software Foundation.
11 */ 11 */
12 12
13 #include <linux/proc_fs.h> 13 #include <linux/proc_fs.h>
14 #include <linux/sched.h> 14 #include <linux/sched.h>
15 #include <linux/seq_file.h> 15 #include <linux/seq_file.h>
16 #include <linux/kallsyms.h> 16 #include <linux/kallsyms.h>
17 #include <linux/utsname.h> 17 #include <linux/utsname.h>
18 18
19 /* 19 /*
20 * This allows printing both to /proc/sched_debug and 20 * This allows printing both to /proc/sched_debug and
21 * to the console 21 * to the console
22 */ 22 */
23 #define SEQ_printf(m, x...) \ 23 #define SEQ_printf(m, x...) \
24 do { \ 24 do { \
25 if (m) \ 25 if (m) \
26 seq_printf(m, x); \ 26 seq_printf(m, x); \
27 else \ 27 else \
28 printk(x); \ 28 printk(x); \
29 } while (0) 29 } while (0)
30 30
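/*
 * Editor's sketch, not part of this file: one call site serves both
 * sinks.  sched_debug_show() passes its seq_file, while the sysrq
 * path below calls sched_debug_show(NULL, NULL), so the same format
 * string falls through to printk():
 *
 *	SEQ_printf(m,    "cpu#%d\n", cpu);	// -> /proc/sched_debug
 *	SEQ_printf(NULL, "cpu#%d\n", cpu);	// -> console via printk()
 */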
31 /* 31 /*
32 * Ease the printing of nsec fields: 32 * Ease the printing of nsec fields:
33 */ 33 */
34 static long long nsec_high(unsigned long long nsec) 34 static long long nsec_high(unsigned long long nsec)
35 { 35 {
36 if ((long long)nsec < 0) { 36 if ((long long)nsec < 0) {
37 nsec = -nsec; 37 nsec = -nsec;
38 do_div(nsec, 1000000); 38 do_div(nsec, 1000000);
39 return -nsec; 39 return -nsec;
40 } 40 }
41 do_div(nsec, 1000000); 41 do_div(nsec, 1000000);
42 42
43 return nsec; 43 return nsec;
44 } 44 }
45 45
46 static unsigned long nsec_low(unsigned long long nsec) 46 static unsigned long nsec_low(unsigned long long nsec)
47 { 47 {
48 if ((long long)nsec < 0) 48 if ((long long)nsec < 0)
49 nsec = -nsec; 49 nsec = -nsec;
50 50
51 return do_div(nsec, 1000000); 51 return do_div(nsec, 1000000);
52 } 52 }
53 53
54 #define SPLIT_NS(x) nsec_high(x), nsec_low(x) 54 #define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55 55
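/*
 * Editor's sketch, not part of this file: SPLIT_NS() expands to the
 * two arguments a "%Ld.%06ld" conversion expects -- whole milliseconds
 * from nsec_high() and the six-digit sub-millisecond remainder from
 * nsec_low().  For example, 1234567890 ns prints as "1234.567890":
 *
 *	SEQ_printf(m, "%Ld.%06ld\n", SPLIT_NS(1234567890ULL));
 */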
56 #ifdef CONFIG_FAIR_GROUP_SCHED 56 #ifdef CONFIG_FAIR_GROUP_SCHED
57 static void print_cfs_group_stats(struct seq_file *m, int cpu, 57 static void print_cfs_group_stats(struct seq_file *m, int cpu,
58 struct task_group *tg) 58 struct task_group *tg)
59 { 59 {
60 struct sched_entity *se = tg->se[cpu]; 60 struct sched_entity *se = tg->se[cpu];
61 if (!se) 61 if (!se)
62 return; 62 return;
63 63
64 #define P(F) \ 64 #define P(F) \
65 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) 65 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
66 #define PN(F) \ 66 #define PN(F) \
67 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) 67 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
68 68
69 PN(se->exec_start); 69 PN(se->exec_start);
70 PN(se->vruntime); 70 PN(se->vruntime);
71 PN(se->sum_exec_runtime); 71 PN(se->sum_exec_runtime);
72 #ifdef CONFIG_SCHEDSTATS 72 #ifdef CONFIG_SCHEDSTATS
73 PN(se->wait_start); 73 PN(se->wait_start);
74 PN(se->sleep_start); 74 PN(se->sleep_start);
75 PN(se->block_start); 75 PN(se->block_start);
76 PN(se->sleep_max); 76 PN(se->sleep_max);
77 PN(se->block_max); 77 PN(se->block_max);
78 PN(se->exec_max); 78 PN(se->exec_max);
79 PN(se->slice_max); 79 PN(se->slice_max);
80 PN(se->wait_max); 80 PN(se->wait_max);
81 PN(se->wait_sum); 81 PN(se->wait_sum);
82 P(se->wait_count); 82 P(se->wait_count);
83 #endif 83 #endif
84 P(se->load.weight); 84 P(se->load.weight);
85 #undef PN 85 #undef PN
86 #undef P 86 #undef P
87 } 87 }
88 #endif 88 #endif
89 89
90 static void 90 static void
91 print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 91 print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
92 { 92 {
93 if (rq->curr == p) 93 if (rq->curr == p)
94 SEQ_printf(m, "R"); 94 SEQ_printf(m, "R");
95 else 95 else
96 SEQ_printf(m, " "); 96 SEQ_printf(m, " ");
97 97
98 SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", 98 SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
99 p->comm, p->pid, 99 p->comm, p->pid,
100 SPLIT_NS(p->se.vruntime), 100 SPLIT_NS(p->se.vruntime),
101 (long long)(p->nvcsw + p->nivcsw), 101 (long long)(p->nvcsw + p->nivcsw),
102 p->prio); 102 p->prio);
103 #ifdef CONFIG_SCHEDSTATS 103 #ifdef CONFIG_SCHEDSTATS
104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
105 SPLIT_NS(p->se.vruntime), 105 SPLIT_NS(p->se.vruntime),
106 SPLIT_NS(p->se.sum_exec_runtime), 106 SPLIT_NS(p->se.sum_exec_runtime),
107 SPLIT_NS(p->se.sum_sleep_runtime)); 107 SPLIT_NS(p->se.sum_sleep_runtime));
108 #else 108 #else
109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
111 #endif 111 #endif
112 112
113 #ifdef CONFIG_CGROUP_SCHED 113 #ifdef CONFIG_CGROUP_SCHED
114 { 114 {
115 char path[64]; 115 char path[64];
116 116
117 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); 117 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
118 SEQ_printf(m, " %s", path); 118 SEQ_printf(m, " %s", path);
119 } 119 }
120 #endif 120 #endif
121 SEQ_printf(m, "\n"); 121 SEQ_printf(m, "\n");
122 } 122 }
123 123
124 static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) 124 static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
125 { 125 {
126 struct task_struct *g, *p; 126 struct task_struct *g, *p;
127 unsigned long flags; 127 unsigned long flags;
128 128
129 SEQ_printf(m, 129 SEQ_printf(m,
130 "\nrunnable tasks:\n" 130 "\nrunnable tasks:\n"
131 " task PID tree-key switches prio" 131 " task PID tree-key switches prio"
132 " exec-runtime sum-exec sum-sleep\n" 132 " exec-runtime sum-exec sum-sleep\n"
133 "------------------------------------------------------" 133 "------------------------------------------------------"
134 "----------------------------------------------------\n"); 134 "----------------------------------------------------\n");
135 135
136 read_lock_irqsave(&tasklist_lock, flags); 136 read_lock_irqsave(&tasklist_lock, flags);
137 137
138 do_each_thread(g, p) { 138 do_each_thread(g, p) {
139 if (!p->se.on_rq || task_cpu(p) != rq_cpu) 139 if (!p->se.on_rq || task_cpu(p) != rq_cpu)
140 continue; 140 continue;
141 141
142 print_task(m, rq, p); 142 print_task(m, rq, p);
143 } while_each_thread(g, p); 143 } while_each_thread(g, p);
144 144
145 read_unlock_irqrestore(&tasklist_lock, flags); 145 read_unlock_irqrestore(&tasklist_lock, flags);
146 } 146 }
147 147
148 #if defined(CONFIG_CGROUP_SCHED) && \ 148 #if defined(CONFIG_CGROUP_SCHED) && \
149 (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)) 149 (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
150 static void task_group_path(struct task_group *tg, char *buf, int buflen) 150 static void task_group_path(struct task_group *tg, char *buf, int buflen)
151 { 151 {
152 /* may be NULL if the underlying cgroup isn't fully-created yet */ 152 /* may be NULL if the underlying cgroup isn't fully-created yet */
153 if (!tg->css.cgroup) { 153 if (!tg->css.cgroup) {
154 buf[0] = '\0'; 154 buf[0] = '\0';
155 return; 155 return;
156 } 156 }
157 cgroup_path(tg->css.cgroup, buf, buflen); 157 cgroup_path(tg->css.cgroup, buf, buflen);
158 } 158 }
159 #endif 159 #endif
160 160
161 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 161 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
162 { 162 {
163 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 163 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
164 spread, rq0_min_vruntime, spread0; 164 spread, rq0_min_vruntime, spread0;
165 struct rq *rq = cpu_rq(cpu); 165 struct rq *rq = cpu_rq(cpu);
166 struct sched_entity *last; 166 struct sched_entity *last;
167 unsigned long flags; 167 unsigned long flags;
168 168
169 #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) 169 #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
170 char path[128]; 170 char path[128];
171 struct task_group *tg = cfs_rq->tg; 171 struct task_group *tg = cfs_rq->tg;
172 172
173 task_group_path(tg, path, sizeof(path)); 173 task_group_path(tg, path, sizeof(path));
174 174
175 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 175 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
176 #elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) 176 #elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
177 { 177 {
178 uid_t uid = cfs_rq->tg->uid; 178 uid_t uid = cfs_rq->tg->uid;
179 SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid); 179 SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
180 } 180 }
181 #else 181 #else
182 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 182 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
183 #endif 183 #endif
184 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 184 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
185 SPLIT_NS(cfs_rq->exec_clock)); 185 SPLIT_NS(cfs_rq->exec_clock));
186 186
187 raw_spin_lock_irqsave(&rq->lock, flags); 187 raw_spin_lock_irqsave(&rq->lock, flags);
188 if (cfs_rq->rb_leftmost) 188 if (cfs_rq->rb_leftmost)
189 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; 189 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
190 last = __pick_last_entity(cfs_rq); 190 last = __pick_last_entity(cfs_rq);
191 if (last) 191 if (last)
192 max_vruntime = last->vruntime; 192 max_vruntime = last->vruntime;
193 min_vruntime = cfs_rq->min_vruntime; 193 min_vruntime = cfs_rq->min_vruntime;
194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; 194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
195 raw_spin_unlock_irqrestore(&rq->lock, flags); 195 raw_spin_unlock_irqrestore(&rq->lock, flags);
196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", 196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
197 SPLIT_NS(MIN_vruntime)); 197 SPLIT_NS(MIN_vruntime));
198 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", 198 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
199 SPLIT_NS(min_vruntime)); 199 SPLIT_NS(min_vruntime));
200 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", 200 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime",
201 SPLIT_NS(max_vruntime)); 201 SPLIT_NS(max_vruntime));
202 spread = max_vruntime - MIN_vruntime; 202 spread = max_vruntime - MIN_vruntime;
203 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", 203 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread",
204 SPLIT_NS(spread)); 204 SPLIT_NS(spread));
205 spread0 = min_vruntime - rq0_min_vruntime; 205 spread0 = min_vruntime - rq0_min_vruntime;
206 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", 206 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
207 SPLIT_NS(spread0)); 207 SPLIT_NS(spread0));
208 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); 208 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
209 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 209 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
210 210
211 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 211 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
212 cfs_rq->nr_spread_over); 212 cfs_rq->nr_spread_over);
213 #ifdef CONFIG_FAIR_GROUP_SCHED 213 #ifdef CONFIG_FAIR_GROUP_SCHED
214 #ifdef CONFIG_SMP 214 #ifdef CONFIG_SMP
215 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 215 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
216 #endif 216 #endif
217 print_cfs_group_stats(m, cpu, cfs_rq->tg); 217 print_cfs_group_stats(m, cpu, cfs_rq->tg);
218 #endif 218 #endif
219 } 219 }
220 220
221 void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) 221 void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
222 { 222 {
223 #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) 223 #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
224 char path[128]; 224 char path[128];
225 struct task_group *tg = rt_rq->tg; 225 struct task_group *tg = rt_rq->tg;
226 226
227 task_group_path(tg, path, sizeof(path)); 227 task_group_path(tg, path, sizeof(path));
228 228
229 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); 229 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
230 #else 230 #else
231 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); 231 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
232 #endif 232 #endif
233 233
234 234
235 #define P(x) \ 235 #define P(x) \
236 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) 236 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
237 #define PN(x) \ 237 #define PN(x) \
238 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) 238 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
239 239
240 P(rt_nr_running); 240 P(rt_nr_running);
241 P(rt_throttled); 241 P(rt_throttled);
242 PN(rt_time); 242 PN(rt_time);
243 PN(rt_runtime); 243 PN(rt_runtime);
244 244
245 #undef PN 245 #undef PN
246 #undef P 246 #undef P
247 } 247 }
248 248
249 static void print_cpu(struct seq_file *m, int cpu) 249 static void print_cpu(struct seq_file *m, int cpu)
250 { 250 {
251 struct rq *rq = cpu_rq(cpu); 251 struct rq *rq = cpu_rq(cpu);
252 252
253 #ifdef CONFIG_X86 253 #ifdef CONFIG_X86
254 { 254 {
255 unsigned int freq = cpu_khz ? : 1; 255 unsigned int freq = cpu_khz ? : 1;
256 256
257 SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", 257 SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
258 cpu, freq / 1000, (freq % 1000)); 258 cpu, freq / 1000, (freq % 1000));
259 } 259 }
260 #else 260 #else
261 SEQ_printf(m, "\ncpu#%d\n", cpu); 261 SEQ_printf(m, "\ncpu#%d\n", cpu);
262 #endif 262 #endif
263 263
264 #define P(x) \ 264 #define P(x) \
265 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) 265 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x))
266 #define PN(x) \ 266 #define PN(x) \
267 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) 267 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
268 268
269 P(nr_running); 269 P(nr_running);
270 SEQ_printf(m, " .%-30s: %lu\n", "load", 270 SEQ_printf(m, " .%-30s: %lu\n", "load",
271 rq->load.weight); 271 rq->load.weight);
272 P(nr_switches); 272 P(nr_switches);
273 P(nr_load_updates); 273 P(nr_load_updates);
274 P(nr_uninterruptible); 274 P(nr_uninterruptible);
275 PN(next_balance); 275 PN(next_balance);
276 P(curr->pid); 276 P(curr->pid);
277 PN(clock); 277 PN(clock);
278 P(cpu_load[0]); 278 P(cpu_load[0]);
279 P(cpu_load[1]); 279 P(cpu_load[1]);
280 P(cpu_load[2]); 280 P(cpu_load[2]);
281 P(cpu_load[3]); 281 P(cpu_load[3]);
282 P(cpu_load[4]); 282 P(cpu_load[4]);
283 #undef P 283 #undef P
284 #undef PN 284 #undef PN
285 285
286 #ifdef CONFIG_SCHEDSTATS 286 #ifdef CONFIG_SCHEDSTATS
287 #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); 287 #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
288 #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); 288 #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
289 289
290 P(yld_count); 290 P(yld_count);
291 291
292 P(sched_switch); 292 P(sched_switch);
293 P(sched_count); 293 P(sched_count);
294 P(sched_goidle); 294 P(sched_goidle);
295 #ifdef CONFIG_SMP 295 #ifdef CONFIG_SMP
296 P64(avg_idle); 296 P64(avg_idle);
297 #endif 297 #endif
298 298
299 P(ttwu_count); 299 P(ttwu_count);
300 P(ttwu_local); 300 P(ttwu_local);
301 301
302 P(bkl_count); 302 P(bkl_count);
303 303
304 #undef P 304 #undef P
305 #endif 305 #endif
306 print_cfs_stats(m, cpu); 306 print_cfs_stats(m, cpu);
307 print_rt_stats(m, cpu); 307 print_rt_stats(m, cpu);
308 308
309 print_rq(m, rq, cpu); 309 print_rq(m, rq, cpu);
310 } 310 }
311 311
312 static const char *sched_tunable_scaling_names[] = { 312 static const char *sched_tunable_scaling_names[] = {
313 "none", 313 "none",
314 "logaritmic", 314 "logaritmic",
315 "linear" 315 "linear"
316 }; 316 };
317 317
318 static int sched_debug_show(struct seq_file *m, void *v) 318 static int sched_debug_show(struct seq_file *m, void *v)
319 { 319 {
320 u64 now = ktime_to_ns(ktime_get()); 320 u64 now = ktime_to_ns(ktime_get());
321 int cpu; 321 int cpu;
322 322
323 SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", 323 SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n",
324 init_utsname()->release, 324 init_utsname()->release,
325 (int)strcspn(init_utsname()->version, " "), 325 (int)strcspn(init_utsname()->version, " "),
326 init_utsname()->version); 326 init_utsname()->version);
327 327
328 SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); 328 SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now));
329 329
330 #define P(x) \ 330 #define P(x) \
331 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) 331 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
332 #define PN(x) \ 332 #define PN(x) \
333 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) 333 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
334 P(jiffies); 334 P(jiffies);
335 PN(sysctl_sched_latency); 335 PN(sysctl_sched_latency);
336 PN(sysctl_sched_min_granularity); 336 PN(sysctl_sched_min_granularity);
337 PN(sysctl_sched_wakeup_granularity); 337 PN(sysctl_sched_wakeup_granularity);
338 PN(sysctl_sched_child_runs_first); 338 PN(sysctl_sched_child_runs_first);
339 P(sysctl_sched_features); 339 P(sysctl_sched_features);
340 #undef PN 340 #undef PN
341 #undef P 341 #undef P
342 342
343 SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", 343 SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
344 sysctl_sched_tunable_scaling, 344 sysctl_sched_tunable_scaling,
345 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); 345 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
346 346
347 for_each_online_cpu(cpu) 347 for_each_online_cpu(cpu)
348 print_cpu(m, cpu); 348 print_cpu(m, cpu);
349 349
350 SEQ_printf(m, "\n"); 350 SEQ_printf(m, "\n");
351 351
352 return 0; 352 return 0;
353 } 353 }
354 354
355 static void sysrq_sched_debug_show(void) 355 static void sysrq_sched_debug_show(void)
356 { 356 {
357 sched_debug_show(NULL, NULL); 357 sched_debug_show(NULL, NULL);
358 } 358 }
359 359
360 static int sched_debug_open(struct inode *inode, struct file *filp) 360 static int sched_debug_open(struct inode *inode, struct file *filp)
361 { 361 {
362 return single_open(filp, sched_debug_show, NULL); 362 return single_open(filp, sched_debug_show, NULL);
363 } 363 }
364 364
365 static const struct file_operations sched_debug_fops = { 365 static const struct file_operations sched_debug_fops = {
366 .open = sched_debug_open, 366 .open = sched_debug_open,
367 .read = seq_read, 367 .read = seq_read,
368 .llseek = seq_lseek, 368 .llseek = seq_lseek,
369 .release = single_release, 369 .release = single_release,
370 }; 370 };
371 371
372 static int __init init_sched_debug_procfs(void) 372 static int __init init_sched_debug_procfs(void)
373 { 373 {
374 struct proc_dir_entry *pe; 374 struct proc_dir_entry *pe;
375 375
376 pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops); 376 pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops);
377 if (!pe) 377 if (!pe)
378 return -ENOMEM; 378 return -ENOMEM;
379 return 0; 379 return 0;
380 } 380 }
381 381
382 __initcall(init_sched_debug_procfs); 382 __initcall(init_sched_debug_procfs);
383 383
384 void proc_sched_show_task(struct task_struct *p, struct seq_file *m) 384 void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
385 { 385 {
386 unsigned long nr_switches; 386 unsigned long nr_switches;
387 unsigned long flags; 387 unsigned long flags;
388 int num_threads = 1; 388 int num_threads = 1;
389 389
390 if (lock_task_sighand(p, &flags)) { 390 if (lock_task_sighand(p, &flags)) {
391 num_threads = atomic_read(&p->signal->count); 391 num_threads = atomic_read(&p->signal->count);
392 unlock_task_sighand(p, &flags); 392 unlock_task_sighand(p, &flags);
393 } 393 }
394 394
395 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); 395 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
396 SEQ_printf(m, 396 SEQ_printf(m,
397 "---------------------------------------------------------\n"); 397 "---------------------------------------------------------\n");
398 #define __P(F) \ 398 #define __P(F) \
399 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) 399 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F)
400 #define P(F) \ 400 #define P(F) \
401 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) 401 SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F)
402 #define __PN(F) \ 402 #define __PN(F) \
403 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) 403 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
404 #define PN(F) \ 404 #define PN(F) \
405 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) 405 SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
406 406
407 PN(se.exec_start); 407 PN(se.exec_start);
408 PN(se.vruntime); 408 PN(se.vruntime);
409 PN(se.sum_exec_runtime); 409 PN(se.sum_exec_runtime);
410 PN(se.avg_overlap); 410 PN(se.avg_overlap);
411 PN(se.avg_wakeup); 411 PN(se.avg_wakeup);
412 412
413 nr_switches = p->nvcsw + p->nivcsw; 413 nr_switches = p->nvcsw + p->nivcsw;
414 414
415 #ifdef CONFIG_SCHEDSTATS 415 #ifdef CONFIG_SCHEDSTATS
416 PN(se.wait_start); 416 PN(se.wait_start);
417 PN(se.sleep_start); 417 PN(se.sleep_start);
418 PN(se.block_start); 418 PN(se.block_start);
419 PN(se.sleep_max); 419 PN(se.sleep_max);
420 PN(se.block_max); 420 PN(se.block_max);
421 PN(se.exec_max); 421 PN(se.exec_max);
422 PN(se.slice_max); 422 PN(se.slice_max);
423 PN(se.wait_max); 423 PN(se.wait_max);
424 PN(se.wait_sum); 424 PN(se.wait_sum);
425 P(se.wait_count); 425 P(se.wait_count);
426 PN(se.iowait_sum); 426 PN(se.iowait_sum);
427 P(se.iowait_count); 427 P(se.iowait_count);
428 P(sched_info.bkl_count); 428 P(sched_info.bkl_count);
429 P(se.nr_migrations); 429 P(se.nr_migrations);
430 P(se.nr_migrations_cold); 430 P(se.nr_migrations_cold);
431 P(se.nr_failed_migrations_affine); 431 P(se.nr_failed_migrations_affine);
432 P(se.nr_failed_migrations_running); 432 P(se.nr_failed_migrations_running);
433 P(se.nr_failed_migrations_hot); 433 P(se.nr_failed_migrations_hot);
434 P(se.nr_forced_migrations); 434 P(se.nr_forced_migrations);
435 P(se.nr_wakeups); 435 P(se.nr_wakeups);
436 P(se.nr_wakeups_sync); 436 P(se.nr_wakeups_sync);
437 P(se.nr_wakeups_migrate); 437 P(se.nr_wakeups_migrate);
438 P(se.nr_wakeups_local); 438 P(se.nr_wakeups_local);
439 P(se.nr_wakeups_remote); 439 P(se.nr_wakeups_remote);
440 P(se.nr_wakeups_affine); 440 P(se.nr_wakeups_affine);
441 P(se.nr_wakeups_affine_attempts); 441 P(se.nr_wakeups_affine_attempts);
442 P(se.nr_wakeups_passive); 442 P(se.nr_wakeups_passive);
443 P(se.nr_wakeups_idle); 443 P(se.nr_wakeups_idle);
444 444
445 { 445 {
446 u64 avg_atom, avg_per_cpu; 446 u64 avg_atom, avg_per_cpu;
447 447
448 avg_atom = p->se.sum_exec_runtime; 448 avg_atom = p->se.sum_exec_runtime;
449 if (nr_switches) 449 if (nr_switches)
450 do_div(avg_atom, nr_switches); 450 do_div(avg_atom, nr_switches);
451 else 451 else
452 avg_atom = -1LL; 452 avg_atom = -1LL;
453 453
454 avg_per_cpu = p->se.sum_exec_runtime; 454 avg_per_cpu = p->se.sum_exec_runtime;
455 if (p->se.nr_migrations) { 455 if (p->se.nr_migrations) {
456 avg_per_cpu = div64_u64(avg_per_cpu, 456 avg_per_cpu = div64_u64(avg_per_cpu,
457 p->se.nr_migrations); 457 p->se.nr_migrations);
458 } else { 458 } else {
459 avg_per_cpu = -1LL; 459 avg_per_cpu = -1LL;
460 } 460 }
461 461
462 __PN(avg_atom); 462 __PN(avg_atom);
463 __PN(avg_per_cpu); 463 __PN(avg_per_cpu);
464 } 464 }
465 #endif 465 #endif
466 __P(nr_switches); 466 __P(nr_switches);
467 SEQ_printf(m, "%-35s:%21Ld\n", 467 SEQ_printf(m, "%-35s:%21Ld\n",
468 "nr_voluntary_switches", (long long)p->nvcsw); 468 "nr_voluntary_switches", (long long)p->nvcsw);
469 SEQ_printf(m, "%-35s:%21Ld\n", 469 SEQ_printf(m, "%-35s:%21Ld\n",
470 "nr_involuntary_switches", (long long)p->nivcsw); 470 "nr_involuntary_switches", (long long)p->nivcsw);
471 471
472 P(se.load.weight); 472 P(se.load.weight);
473 P(policy); 473 P(policy);
474 P(prio); 474 P(prio);
475 #undef PN 475 #undef PN
476 #undef __PN 476 #undef __PN
477 #undef P 477 #undef P
478 #undef __P 478 #undef __P
479 479
480 { 480 {
481 unsigned int this_cpu = raw_smp_processor_id(); 481 unsigned int this_cpu = raw_smp_processor_id();
482 u64 t0, t1; 482 u64 t0, t1;
483 483
484 t0 = cpu_clock(this_cpu); 484 t0 = cpu_clock(this_cpu);
485 t1 = cpu_clock(this_cpu); 485 t1 = cpu_clock(this_cpu);
486 SEQ_printf(m, "%-35s:%21Ld\n", 486 SEQ_printf(m, "%-35s:%21Ld\n",
487 "clock-delta", (long long)(t1-t0)); 487 "clock-delta", (long long)(t1-t0));
488 } 488 }
489 } 489 }
490 490
491 void proc_sched_set_task(struct task_struct *p) 491 void proc_sched_set_task(struct task_struct *p)
492 { 492 {
493 #ifdef CONFIG_SCHEDSTATS 493 #ifdef CONFIG_SCHEDSTATS
494 p->se.wait_max = 0; 494 p->se.wait_max = 0;
495 p->se.wait_sum = 0; 495 p->se.wait_sum = 0;
496 p->se.wait_count = 0; 496 p->se.wait_count = 0;
497 p->se.iowait_sum = 0; 497 p->se.iowait_sum = 0;
498 p->se.iowait_count = 0; 498 p->se.iowait_count = 0;
499 p->se.sleep_max = 0; 499 p->se.sleep_max = 0;
500 p->se.sum_sleep_runtime = 0; 500 p->se.sum_sleep_runtime = 0;
501 p->se.block_max = 0; 501 p->se.block_max = 0;
502 p->se.exec_max = 0; 502 p->se.exec_max = 0;
503 p->se.slice_max = 0; 503 p->se.slice_max = 0;
504 p->se.nr_migrations = 0; 504 p->se.nr_migrations = 0;
505 p->se.nr_migrations_cold = 0; 505 p->se.nr_migrations_cold = 0;
506 p->se.nr_failed_migrations_affine = 0; 506 p->se.nr_failed_migrations_affine = 0;
507 p->se.nr_failed_migrations_running = 0; 507 p->se.nr_failed_migrations_running = 0;
508 p->se.nr_failed_migrations_hot = 0; 508 p->se.nr_failed_migrations_hot = 0;
509 p->se.nr_forced_migrations = 0; 509 p->se.nr_forced_migrations = 0;
510 p->se.nr_wakeups = 0; 510 p->se.nr_wakeups = 0;
511 p->se.nr_wakeups_sync = 0; 511 p->se.nr_wakeups_sync = 0;
512 p->se.nr_wakeups_migrate = 0; 512 p->se.nr_wakeups_migrate = 0;
513 p->se.nr_wakeups_local = 0; 513 p->se.nr_wakeups_local = 0;
514 p->se.nr_wakeups_remote = 0; 514 p->se.nr_wakeups_remote = 0;
515 p->se.nr_wakeups_affine = 0; 515 p->se.nr_wakeups_affine = 0;
516 p->se.nr_wakeups_affine_attempts = 0; 516 p->se.nr_wakeups_affine_attempts = 0;
517 p->se.nr_wakeups_passive = 0; 517 p->se.nr_wakeups_passive = 0;
518 p->se.nr_wakeups_idle = 0; 518 p->se.nr_wakeups_idle = 0;
519 p->sched_info.bkl_count = 0; 519 p->sched_info.bkl_count = 0;
520 #endif 520 #endif
521 p->se.sum_exec_runtime = 0;
522 p->se.prev_sum_exec_runtime = 0;
523 p->nvcsw = 0;
524 p->nivcsw = 0;
525 } 521 }
526 522