Commit 1a551ae715825bb2a2107a2dd68de024a1fa4e32

Authored by Thomas Gleixner
Committed by Ingo Molnar
1 parent 23f5d14251

sched: Use rcu in sched_get_rr_param()

read_lock(&tasklist_lock) does not protect
sys_sched_get_rr_param() against a concurrent update of the
policy or scheduler parameters, because do_sched_setscheduler()
does not take the tasklist_lock.

The access to task->sched_class->get_rr_interval is protected by
task_rq_lock(task).

Use rcu_read_lock() to protect find_task_by_vpid() and prevent
the task struct from going away.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20091209100706.862897167@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Showing 1 changed file with 3 additions and 3 deletions
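
For reference, a minimal sketch of the resulting locking pattern in
sys_sched_rr_get_interval() (the syscall behind sched_rr_get_interval(2),
which lives further down in kernel/sched.c and is not visible in the context
shown below). This is based on the 2.6.32/33-era APIs and on the
rq-parameter form of ->get_rr_interval() referenced above, so details may
differ from the actual hunk: the task lookup moves under rcu_read_lock()
instead of read_lock(&tasklist_lock), while the ->get_rr_interval() call
stays under task_rq_lock().

/*
 * Sketch only -- not the literal diff hunk from this commit.
 */
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
		struct timespec __user *, interval)
{
	struct task_struct *p;
	unsigned int time_slice;
	unsigned long flags;
	struct timespec t;
	struct rq *rq;
	int retval;

	if (pid < 0)
		return -EINVAL;

	retval = -ESRCH;
	rcu_read_lock();		/* was: read_lock(&tasklist_lock) */
	p = find_task_by_vpid(pid);
	if (!p)
		goto out_unlock;

	retval = security_task_getscheduler(p);
	if (retval)
		goto out_unlock;

	/* policy/parameter updates are serialized against us by the rq lock */
	rq = task_rq_lock(p, &flags);
	time_slice = p->sched_class->get_rr_interval(rq, p);
	task_rq_unlock(rq, &flags);

	rcu_read_unlock();		/* was: read_unlock(&tasklist_lock) */
	jiffies_to_timespec(time_slice, &t);
	return copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;

out_unlock:
	rcu_read_unlock();
	return retval;
}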

1 /* 1 /*
2 * kernel/sched.c 2 * kernel/sched.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 * 7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe 9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff 10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli 11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with 13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices 14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions 15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a 19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas. 20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements 21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams 22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 * Thomas Gleixner, Mike Kravetz 26 * Thomas Gleixner, Mike Kravetz
27 */ 27 */
28 28
29 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 29 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
30 30
31 #include <linux/mm.h> 31 #include <linux/mm.h>
32 #include <linux/module.h> 32 #include <linux/module.h>
33 #include <linux/nmi.h> 33 #include <linux/nmi.h>
34 #include <linux/init.h> 34 #include <linux/init.h>
35 #include <linux/uaccess.h> 35 #include <linux/uaccess.h>
36 #include <linux/highmem.h> 36 #include <linux/highmem.h>
37 #include <linux/smp_lock.h> 37 #include <linux/smp_lock.h>
38 #include <asm/mmu_context.h> 38 #include <asm/mmu_context.h>
39 #include <linux/interrupt.h> 39 #include <linux/interrupt.h>
40 #include <linux/capability.h> 40 #include <linux/capability.h>
41 #include <linux/completion.h> 41 #include <linux/completion.h>
42 #include <linux/kernel_stat.h> 42 #include <linux/kernel_stat.h>
43 #include <linux/debug_locks.h> 43 #include <linux/debug_locks.h>
44 #include <linux/perf_event.h> 44 #include <linux/perf_event.h>
45 #include <linux/security.h> 45 #include <linux/security.h>
46 #include <linux/notifier.h> 46 #include <linux/notifier.h>
47 #include <linux/profile.h> 47 #include <linux/profile.h>
48 #include <linux/freezer.h> 48 #include <linux/freezer.h>
49 #include <linux/vmalloc.h> 49 #include <linux/vmalloc.h>
50 #include <linux/blkdev.h> 50 #include <linux/blkdev.h>
51 #include <linux/delay.h> 51 #include <linux/delay.h>
52 #include <linux/pid_namespace.h> 52 #include <linux/pid_namespace.h>
53 #include <linux/smp.h> 53 #include <linux/smp.h>
54 #include <linux/threads.h> 54 #include <linux/threads.h>
55 #include <linux/timer.h> 55 #include <linux/timer.h>
56 #include <linux/rcupdate.h> 56 #include <linux/rcupdate.h>
57 #include <linux/cpu.h> 57 #include <linux/cpu.h>
58 #include <linux/cpuset.h> 58 #include <linux/cpuset.h>
59 #include <linux/percpu.h> 59 #include <linux/percpu.h>
60 #include <linux/kthread.h> 60 #include <linux/kthread.h>
61 #include <linux/proc_fs.h> 61 #include <linux/proc_fs.h>
62 #include <linux/seq_file.h> 62 #include <linux/seq_file.h>
63 #include <linux/sysctl.h> 63 #include <linux/sysctl.h>
64 #include <linux/syscalls.h> 64 #include <linux/syscalls.h>
65 #include <linux/times.h> 65 #include <linux/times.h>
66 #include <linux/tsacct_kern.h> 66 #include <linux/tsacct_kern.h>
67 #include <linux/kprobes.h> 67 #include <linux/kprobes.h>
68 #include <linux/delayacct.h> 68 #include <linux/delayacct.h>
69 #include <linux/unistd.h> 69 #include <linux/unistd.h>
70 #include <linux/pagemap.h> 70 #include <linux/pagemap.h>
71 #include <linux/hrtimer.h> 71 #include <linux/hrtimer.h>
72 #include <linux/tick.h> 72 #include <linux/tick.h>
73 #include <linux/debugfs.h> 73 #include <linux/debugfs.h>
74 #include <linux/ctype.h> 74 #include <linux/ctype.h>
75 #include <linux/ftrace.h> 75 #include <linux/ftrace.h>
76 76
77 #include <asm/tlb.h> 77 #include <asm/tlb.h>
78 #include <asm/irq_regs.h> 78 #include <asm/irq_regs.h>
79 79
80 #include "sched_cpupri.h" 80 #include "sched_cpupri.h"
81 81
82 #define CREATE_TRACE_POINTS 82 #define CREATE_TRACE_POINTS
83 #include <trace/events/sched.h> 83 #include <trace/events/sched.h>
84 84
85 /* 85 /*
86 * Convert user-nice values [ -20 ... 0 ... 19 ] 86 * Convert user-nice values [ -20 ... 0 ... 19 ]
87 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 87 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
88 * and back. 88 * and back.
89 */ 89 */
90 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) 90 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
91 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) 91 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
92 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) 92 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
93 93
94 /* 94 /*
95 * 'User priority' is the nice value converted to something we 95 * 'User priority' is the nice value converted to something we
96 * can work with better when scaling various scheduler parameters, 96 * can work with better when scaling various scheduler parameters,
97 * it's a [ 0 ... 39 ] range. 97 * it's a [ 0 ... 39 ] range.
98 */ 98 */
99 #define USER_PRIO(p) ((p)-MAX_RT_PRIO) 99 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
100 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) 100 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
101 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 101 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
102 102
103 /* 103 /*
104 * Helpers for converting nanosecond timing to jiffy resolution 104 * Helpers for converting nanosecond timing to jiffy resolution
105 */ 105 */
106 #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 106 #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
107 107
108 #define NICE_0_LOAD SCHED_LOAD_SCALE 108 #define NICE_0_LOAD SCHED_LOAD_SCALE
109 #define NICE_0_SHIFT SCHED_LOAD_SHIFT 109 #define NICE_0_SHIFT SCHED_LOAD_SHIFT
110 110
111 /* 111 /*
112 * These are the 'tuning knobs' of the scheduler: 112 * These are the 'tuning knobs' of the scheduler:
113 * 113 *
114 * default timeslice is 100 msecs (used only for SCHED_RR tasks). 114 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
115 * Timeslices get refilled after they expire. 115 * Timeslices get refilled after they expire.
116 */ 116 */
117 #define DEF_TIMESLICE (100 * HZ / 1000) 117 #define DEF_TIMESLICE (100 * HZ / 1000)
118 118
119 /* 119 /*
120 * single value that denotes runtime == period, ie unlimited time. 120 * single value that denotes runtime == period, ie unlimited time.
121 */ 121 */
122 #define RUNTIME_INF ((u64)~0ULL) 122 #define RUNTIME_INF ((u64)~0ULL)
123 123
124 static inline int rt_policy(int policy) 124 static inline int rt_policy(int policy)
125 { 125 {
126 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 126 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
127 return 1; 127 return 1;
128 return 0; 128 return 0;
129 } 129 }
130 130
131 static inline int task_has_rt_policy(struct task_struct *p) 131 static inline int task_has_rt_policy(struct task_struct *p)
132 { 132 {
133 return rt_policy(p->policy); 133 return rt_policy(p->policy);
134 } 134 }
135 135
136 /* 136 /*
137 * This is the priority-queue data structure of the RT scheduling class: 137 * This is the priority-queue data structure of the RT scheduling class:
138 */ 138 */
139 struct rt_prio_array { 139 struct rt_prio_array {
140 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ 140 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
141 struct list_head queue[MAX_RT_PRIO]; 141 struct list_head queue[MAX_RT_PRIO];
142 }; 142 };
143 143
144 struct rt_bandwidth { 144 struct rt_bandwidth {
145 /* nests inside the rq lock: */ 145 /* nests inside the rq lock: */
146 spinlock_t rt_runtime_lock; 146 spinlock_t rt_runtime_lock;
147 ktime_t rt_period; 147 ktime_t rt_period;
148 u64 rt_runtime; 148 u64 rt_runtime;
149 struct hrtimer rt_period_timer; 149 struct hrtimer rt_period_timer;
150 }; 150 };
151 151
152 static struct rt_bandwidth def_rt_bandwidth; 152 static struct rt_bandwidth def_rt_bandwidth;
153 153
154 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); 154 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
155 155
156 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) 156 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
157 { 157 {
158 struct rt_bandwidth *rt_b = 158 struct rt_bandwidth *rt_b =
159 container_of(timer, struct rt_bandwidth, rt_period_timer); 159 container_of(timer, struct rt_bandwidth, rt_period_timer);
160 ktime_t now; 160 ktime_t now;
161 int overrun; 161 int overrun;
162 int idle = 0; 162 int idle = 0;
163 163
164 for (;;) { 164 for (;;) {
165 now = hrtimer_cb_get_time(timer); 165 now = hrtimer_cb_get_time(timer);
166 overrun = hrtimer_forward(timer, now, rt_b->rt_period); 166 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
167 167
168 if (!overrun) 168 if (!overrun)
169 break; 169 break;
170 170
171 idle = do_sched_rt_period_timer(rt_b, overrun); 171 idle = do_sched_rt_period_timer(rt_b, overrun);
172 } 172 }
173 173
174 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; 174 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
175 } 175 }
176 176
177 static 177 static
178 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) 178 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
179 { 179 {
180 rt_b->rt_period = ns_to_ktime(period); 180 rt_b->rt_period = ns_to_ktime(period);
181 rt_b->rt_runtime = runtime; 181 rt_b->rt_runtime = runtime;
182 182
183 spin_lock_init(&rt_b->rt_runtime_lock); 183 spin_lock_init(&rt_b->rt_runtime_lock);
184 184
185 hrtimer_init(&rt_b->rt_period_timer, 185 hrtimer_init(&rt_b->rt_period_timer,
186 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 186 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
187 rt_b->rt_period_timer.function = sched_rt_period_timer; 187 rt_b->rt_period_timer.function = sched_rt_period_timer;
188 } 188 }
189 189
190 static inline int rt_bandwidth_enabled(void) 190 static inline int rt_bandwidth_enabled(void)
191 { 191 {
192 return sysctl_sched_rt_runtime >= 0; 192 return sysctl_sched_rt_runtime >= 0;
193 } 193 }
194 194
195 static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 195 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
196 { 196 {
197 ktime_t now; 197 ktime_t now;
198 198
199 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 199 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
200 return; 200 return;
201 201
202 if (hrtimer_active(&rt_b->rt_period_timer)) 202 if (hrtimer_active(&rt_b->rt_period_timer))
203 return; 203 return;
204 204
205 spin_lock(&rt_b->rt_runtime_lock); 205 spin_lock(&rt_b->rt_runtime_lock);
206 for (;;) { 206 for (;;) {
207 unsigned long delta; 207 unsigned long delta;
208 ktime_t soft, hard; 208 ktime_t soft, hard;
209 209
210 if (hrtimer_active(&rt_b->rt_period_timer)) 210 if (hrtimer_active(&rt_b->rt_period_timer))
211 break; 211 break;
212 212
213 now = hrtimer_cb_get_time(&rt_b->rt_period_timer); 213 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
214 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); 214 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
215 215
216 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); 216 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
217 hard = hrtimer_get_expires(&rt_b->rt_period_timer); 217 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
218 delta = ktime_to_ns(ktime_sub(hard, soft)); 218 delta = ktime_to_ns(ktime_sub(hard, soft));
219 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, 219 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
220 HRTIMER_MODE_ABS_PINNED, 0); 220 HRTIMER_MODE_ABS_PINNED, 0);
221 } 221 }
222 spin_unlock(&rt_b->rt_runtime_lock); 222 spin_unlock(&rt_b->rt_runtime_lock);
223 } 223 }
224 224
225 #ifdef CONFIG_RT_GROUP_SCHED 225 #ifdef CONFIG_RT_GROUP_SCHED
226 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) 226 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
227 { 227 {
228 hrtimer_cancel(&rt_b->rt_period_timer); 228 hrtimer_cancel(&rt_b->rt_period_timer);
229 } 229 }
230 #endif 230 #endif
231 231
232 /* 232 /*
233 * sched_domains_mutex serializes calls to arch_init_sched_domains, 233 * sched_domains_mutex serializes calls to arch_init_sched_domains,
234 * detach_destroy_domains and partition_sched_domains. 234 * detach_destroy_domains and partition_sched_domains.
235 */ 235 */
236 static DEFINE_MUTEX(sched_domains_mutex); 236 static DEFINE_MUTEX(sched_domains_mutex);
237 237
238 #ifdef CONFIG_GROUP_SCHED 238 #ifdef CONFIG_GROUP_SCHED
239 239
240 #include <linux/cgroup.h> 240 #include <linux/cgroup.h>
241 241
242 struct cfs_rq; 242 struct cfs_rq;
243 243
244 static LIST_HEAD(task_groups); 244 static LIST_HEAD(task_groups);
245 245
246 /* task group related information */ 246 /* task group related information */
247 struct task_group { 247 struct task_group {
248 #ifdef CONFIG_CGROUP_SCHED 248 #ifdef CONFIG_CGROUP_SCHED
249 struct cgroup_subsys_state css; 249 struct cgroup_subsys_state css;
250 #endif 250 #endif
251 251
252 #ifdef CONFIG_USER_SCHED 252 #ifdef CONFIG_USER_SCHED
253 uid_t uid; 253 uid_t uid;
254 #endif 254 #endif
255 255
256 #ifdef CONFIG_FAIR_GROUP_SCHED 256 #ifdef CONFIG_FAIR_GROUP_SCHED
257 /* schedulable entities of this group on each cpu */ 257 /* schedulable entities of this group on each cpu */
258 struct sched_entity **se; 258 struct sched_entity **se;
259 /* runqueue "owned" by this group on each cpu */ 259 /* runqueue "owned" by this group on each cpu */
260 struct cfs_rq **cfs_rq; 260 struct cfs_rq **cfs_rq;
261 unsigned long shares; 261 unsigned long shares;
262 #endif 262 #endif
263 263
264 #ifdef CONFIG_RT_GROUP_SCHED 264 #ifdef CONFIG_RT_GROUP_SCHED
265 struct sched_rt_entity **rt_se; 265 struct sched_rt_entity **rt_se;
266 struct rt_rq **rt_rq; 266 struct rt_rq **rt_rq;
267 267
268 struct rt_bandwidth rt_bandwidth; 268 struct rt_bandwidth rt_bandwidth;
269 #endif 269 #endif
270 270
271 struct rcu_head rcu; 271 struct rcu_head rcu;
272 struct list_head list; 272 struct list_head list;
273 273
274 struct task_group *parent; 274 struct task_group *parent;
275 struct list_head siblings; 275 struct list_head siblings;
276 struct list_head children; 276 struct list_head children;
277 }; 277 };
278 278
279 #ifdef CONFIG_USER_SCHED 279 #ifdef CONFIG_USER_SCHED
280 280
281 /* Helper function to pass uid information to create_sched_user() */ 281 /* Helper function to pass uid information to create_sched_user() */
282 void set_tg_uid(struct user_struct *user) 282 void set_tg_uid(struct user_struct *user)
283 { 283 {
284 user->tg->uid = user->uid; 284 user->tg->uid = user->uid;
285 } 285 }
286 286
287 /* 287 /*
288 * Root task group. 288 * Root task group.
289 * Every UID task group (including init_task_group aka UID-0) will 289 * Every UID task group (including init_task_group aka UID-0) will
290 * be a child to this group. 290 * be a child to this group.
291 */ 291 */
292 struct task_group root_task_group; 292 struct task_group root_task_group;
293 293
294 #ifdef CONFIG_FAIR_GROUP_SCHED 294 #ifdef CONFIG_FAIR_GROUP_SCHED
295 /* Default task group's sched entity on each cpu */ 295 /* Default task group's sched entity on each cpu */
296 static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 296 static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
297 /* Default task group's cfs_rq on each cpu */ 297 /* Default task group's cfs_rq on each cpu */
298 static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); 298 static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
299 #endif /* CONFIG_FAIR_GROUP_SCHED */ 299 #endif /* CONFIG_FAIR_GROUP_SCHED */
300 300
301 #ifdef CONFIG_RT_GROUP_SCHED 301 #ifdef CONFIG_RT_GROUP_SCHED
302 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 302 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
303 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); 303 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
304 #endif /* CONFIG_RT_GROUP_SCHED */ 304 #endif /* CONFIG_RT_GROUP_SCHED */
305 #else /* !CONFIG_USER_SCHED */ 305 #else /* !CONFIG_USER_SCHED */
306 #define root_task_group init_task_group 306 #define root_task_group init_task_group
307 #endif /* CONFIG_USER_SCHED */ 307 #endif /* CONFIG_USER_SCHED */
308 308
309 /* task_group_lock serializes add/remove of task groups and also changes to 309 /* task_group_lock serializes add/remove of task groups and also changes to
310 * a task group's cpu shares. 310 * a task group's cpu shares.
311 */ 311 */
312 static DEFINE_SPINLOCK(task_group_lock); 312 static DEFINE_SPINLOCK(task_group_lock);
313 313
314 #ifdef CONFIG_FAIR_GROUP_SCHED 314 #ifdef CONFIG_FAIR_GROUP_SCHED
315 315
316 #ifdef CONFIG_SMP 316 #ifdef CONFIG_SMP
317 static int root_task_group_empty(void) 317 static int root_task_group_empty(void)
318 { 318 {
319 return list_empty(&root_task_group.children); 319 return list_empty(&root_task_group.children);
320 } 320 }
321 #endif 321 #endif
322 322
323 #ifdef CONFIG_USER_SCHED 323 #ifdef CONFIG_USER_SCHED
324 # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 324 # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
325 #else /* !CONFIG_USER_SCHED */ 325 #else /* !CONFIG_USER_SCHED */
326 # define INIT_TASK_GROUP_LOAD NICE_0_LOAD 326 # define INIT_TASK_GROUP_LOAD NICE_0_LOAD
327 #endif /* CONFIG_USER_SCHED */ 327 #endif /* CONFIG_USER_SCHED */
328 328
329 /* 329 /*
330 * A weight of 0 or 1 can cause arithmetics problems. 330 * A weight of 0 or 1 can cause arithmetics problems.
331 * A weight of a cfs_rq is the sum of weights of which entities 331 * A weight of a cfs_rq is the sum of weights of which entities
332 * are queued on this cfs_rq, so a weight of a entity should not be 332 * are queued on this cfs_rq, so a weight of a entity should not be
333 * too large, so as the shares value of a task group. 333 * too large, so as the shares value of a task group.
334 * (The default weight is 1024 - so there's no practical 334 * (The default weight is 1024 - so there's no practical
335 * limitation from this.) 335 * limitation from this.)
336 */ 336 */
337 #define MIN_SHARES 2 337 #define MIN_SHARES 2
338 #define MAX_SHARES (1UL << 18) 338 #define MAX_SHARES (1UL << 18)
339 339
340 static int init_task_group_load = INIT_TASK_GROUP_LOAD; 340 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
341 #endif 341 #endif
342 342
343 /* Default task group. 343 /* Default task group.
344 * Every task in system belong to this group at bootup. 344 * Every task in system belong to this group at bootup.
345 */ 345 */
346 struct task_group init_task_group; 346 struct task_group init_task_group;
347 347
348 /* return group to which a task belongs */ 348 /* return group to which a task belongs */
349 static inline struct task_group *task_group(struct task_struct *p) 349 static inline struct task_group *task_group(struct task_struct *p)
350 { 350 {
351 struct task_group *tg; 351 struct task_group *tg;
352 352
353 #ifdef CONFIG_USER_SCHED 353 #ifdef CONFIG_USER_SCHED
354 rcu_read_lock(); 354 rcu_read_lock();
355 tg = __task_cred(p)->user->tg; 355 tg = __task_cred(p)->user->tg;
356 rcu_read_unlock(); 356 rcu_read_unlock();
357 #elif defined(CONFIG_CGROUP_SCHED) 357 #elif defined(CONFIG_CGROUP_SCHED)
358 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 358 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
359 struct task_group, css); 359 struct task_group, css);
360 #else 360 #else
361 tg = &init_task_group; 361 tg = &init_task_group;
362 #endif 362 #endif
363 return tg; 363 return tg;
364 } 364 }
365 365
366 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 366 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
367 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) 367 static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
368 { 368 {
369 #ifdef CONFIG_FAIR_GROUP_SCHED 369 #ifdef CONFIG_FAIR_GROUP_SCHED
370 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 370 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
371 p->se.parent = task_group(p)->se[cpu]; 371 p->se.parent = task_group(p)->se[cpu];
372 #endif 372 #endif
373 373
374 #ifdef CONFIG_RT_GROUP_SCHED 374 #ifdef CONFIG_RT_GROUP_SCHED
375 p->rt.rt_rq = task_group(p)->rt_rq[cpu]; 375 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
376 p->rt.parent = task_group(p)->rt_se[cpu]; 376 p->rt.parent = task_group(p)->rt_se[cpu];
377 #endif 377 #endif
378 } 378 }
379 379
380 #else 380 #else
381 381
382 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 382 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
383 static inline struct task_group *task_group(struct task_struct *p) 383 static inline struct task_group *task_group(struct task_struct *p)
384 { 384 {
385 return NULL; 385 return NULL;
386 } 386 }
387 387
388 #endif /* CONFIG_GROUP_SCHED */ 388 #endif /* CONFIG_GROUP_SCHED */
389 389
390 /* CFS-related fields in a runqueue */ 390 /* CFS-related fields in a runqueue */
391 struct cfs_rq { 391 struct cfs_rq {
392 struct load_weight load; 392 struct load_weight load;
393 unsigned long nr_running; 393 unsigned long nr_running;
394 394
395 u64 exec_clock; 395 u64 exec_clock;
396 u64 min_vruntime; 396 u64 min_vruntime;
397 397
398 struct rb_root tasks_timeline; 398 struct rb_root tasks_timeline;
399 struct rb_node *rb_leftmost; 399 struct rb_node *rb_leftmost;
400 400
401 struct list_head tasks; 401 struct list_head tasks;
402 struct list_head *balance_iterator; 402 struct list_head *balance_iterator;
403 403
404 /* 404 /*
405 * 'curr' points to currently running entity on this cfs_rq. 405 * 'curr' points to currently running entity on this cfs_rq.
406 * It is set to NULL otherwise (i.e when none are currently running). 406 * It is set to NULL otherwise (i.e when none are currently running).
407 */ 407 */
408 struct sched_entity *curr, *next, *last; 408 struct sched_entity *curr, *next, *last;
409 409
410 unsigned int nr_spread_over; 410 unsigned int nr_spread_over;
411 411
412 #ifdef CONFIG_FAIR_GROUP_SCHED 412 #ifdef CONFIG_FAIR_GROUP_SCHED
413 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 413 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
414 414
415 /* 415 /*
416 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 416 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
417 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 417 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
418 * (like users, containers etc.) 418 * (like users, containers etc.)
419 * 419 *
420 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 420 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
421 * list is used during load balance. 421 * list is used during load balance.
422 */ 422 */
423 struct list_head leaf_cfs_rq_list; 423 struct list_head leaf_cfs_rq_list;
424 struct task_group *tg; /* group that "owns" this runqueue */ 424 struct task_group *tg; /* group that "owns" this runqueue */
425 425
426 #ifdef CONFIG_SMP 426 #ifdef CONFIG_SMP
427 /* 427 /*
428 * the part of load.weight contributed by tasks 428 * the part of load.weight contributed by tasks
429 */ 429 */
430 unsigned long task_weight; 430 unsigned long task_weight;
431 431
432 /* 432 /*
433 * h_load = weight * f(tg) 433 * h_load = weight * f(tg)
434 * 434 *
435 * Where f(tg) is the recursive weight fraction assigned to 435 * Where f(tg) is the recursive weight fraction assigned to
436 * this group. 436 * this group.
437 */ 437 */
438 unsigned long h_load; 438 unsigned long h_load;
439 439
440 /* 440 /*
441 * this cpu's part of tg->shares 441 * this cpu's part of tg->shares
442 */ 442 */
443 unsigned long shares; 443 unsigned long shares;
444 444
445 /* 445 /*
446 * load.weight at the time we set shares 446 * load.weight at the time we set shares
447 */ 447 */
448 unsigned long rq_weight; 448 unsigned long rq_weight;
449 #endif 449 #endif
450 #endif 450 #endif
451 }; 451 };
452 452
453 /* Real-Time classes' related field in a runqueue: */ 453 /* Real-Time classes' related field in a runqueue: */
454 struct rt_rq { 454 struct rt_rq {
455 struct rt_prio_array active; 455 struct rt_prio_array active;
456 unsigned long rt_nr_running; 456 unsigned long rt_nr_running;
457 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 457 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
458 struct { 458 struct {
459 int curr; /* highest queued rt task prio */ 459 int curr; /* highest queued rt task prio */
460 #ifdef CONFIG_SMP 460 #ifdef CONFIG_SMP
461 int next; /* next highest */ 461 int next; /* next highest */
462 #endif 462 #endif
463 } highest_prio; 463 } highest_prio;
464 #endif 464 #endif
465 #ifdef CONFIG_SMP 465 #ifdef CONFIG_SMP
466 unsigned long rt_nr_migratory; 466 unsigned long rt_nr_migratory;
467 unsigned long rt_nr_total; 467 unsigned long rt_nr_total;
468 int overloaded; 468 int overloaded;
469 struct plist_head pushable_tasks; 469 struct plist_head pushable_tasks;
470 #endif 470 #endif
471 int rt_throttled; 471 int rt_throttled;
472 u64 rt_time; 472 u64 rt_time;
473 u64 rt_runtime; 473 u64 rt_runtime;
474 /* Nests inside the rq lock: */ 474 /* Nests inside the rq lock: */
475 spinlock_t rt_runtime_lock; 475 spinlock_t rt_runtime_lock;
476 476
477 #ifdef CONFIG_RT_GROUP_SCHED 477 #ifdef CONFIG_RT_GROUP_SCHED
478 unsigned long rt_nr_boosted; 478 unsigned long rt_nr_boosted;
479 479
480 struct rq *rq; 480 struct rq *rq;
481 struct list_head leaf_rt_rq_list; 481 struct list_head leaf_rt_rq_list;
482 struct task_group *tg; 482 struct task_group *tg;
483 struct sched_rt_entity *rt_se; 483 struct sched_rt_entity *rt_se;
484 #endif 484 #endif
485 }; 485 };
486 486
487 #ifdef CONFIG_SMP 487 #ifdef CONFIG_SMP
488 488
489 /* 489 /*
490 * We add the notion of a root-domain which will be used to define per-domain 490 * We add the notion of a root-domain which will be used to define per-domain
491 * variables. Each exclusive cpuset essentially defines an island domain by 491 * variables. Each exclusive cpuset essentially defines an island domain by
492 * fully partitioning the member cpus from any other cpuset. Whenever a new 492 * fully partitioning the member cpus from any other cpuset. Whenever a new
493 * exclusive cpuset is created, we also create and attach a new root-domain 493 * exclusive cpuset is created, we also create and attach a new root-domain
494 * object. 494 * object.
495 * 495 *
496 */ 496 */
497 struct root_domain { 497 struct root_domain {
498 atomic_t refcount; 498 atomic_t refcount;
499 cpumask_var_t span; 499 cpumask_var_t span;
500 cpumask_var_t online; 500 cpumask_var_t online;
501 501
502 /* 502 /*
503 * The "RT overload" flag: it gets set if a CPU has more than 503 * The "RT overload" flag: it gets set if a CPU has more than
504 * one runnable RT task. 504 * one runnable RT task.
505 */ 505 */
506 cpumask_var_t rto_mask; 506 cpumask_var_t rto_mask;
507 atomic_t rto_count; 507 atomic_t rto_count;
508 #ifdef CONFIG_SMP 508 #ifdef CONFIG_SMP
509 struct cpupri cpupri; 509 struct cpupri cpupri;
510 #endif 510 #endif
511 }; 511 };
512 512
513 /* 513 /*
514 * By default the system creates a single root-domain with all cpus as 514 * By default the system creates a single root-domain with all cpus as
515 * members (mimicking the global state we have today). 515 * members (mimicking the global state we have today).
516 */ 516 */
517 static struct root_domain def_root_domain; 517 static struct root_domain def_root_domain;
518 518
519 #endif 519 #endif
520 520
521 /* 521 /*
522 * This is the main, per-CPU runqueue data structure. 522 * This is the main, per-CPU runqueue data structure.
523 * 523 *
524 * Locking rule: those places that want to lock multiple runqueues 524 * Locking rule: those places that want to lock multiple runqueues
525 * (such as the load balancing or the thread migration code), lock 525 * (such as the load balancing or the thread migration code), lock
526 * acquire operations must be ordered by ascending &runqueue. 526 * acquire operations must be ordered by ascending &runqueue.
527 */ 527 */
528 struct rq { 528 struct rq {
529 /* runqueue lock: */ 529 /* runqueue lock: */
530 spinlock_t lock; 530 spinlock_t lock;
531 531
532 /* 532 /*
533 * nr_running and cpu_load should be in the same cacheline because 533 * nr_running and cpu_load should be in the same cacheline because
534 * remote CPUs use both these fields when doing load calculation. 534 * remote CPUs use both these fields when doing load calculation.
535 */ 535 */
536 unsigned long nr_running; 536 unsigned long nr_running;
537 #define CPU_LOAD_IDX_MAX 5 537 #define CPU_LOAD_IDX_MAX 5
538 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 538 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
539 #ifdef CONFIG_NO_HZ 539 #ifdef CONFIG_NO_HZ
540 unsigned char in_nohz_recently; 540 unsigned char in_nohz_recently;
541 #endif 541 #endif
542 /* capture load from *all* tasks on this cpu: */ 542 /* capture load from *all* tasks on this cpu: */
543 struct load_weight load; 543 struct load_weight load;
544 unsigned long nr_load_updates; 544 unsigned long nr_load_updates;
545 u64 nr_switches; 545 u64 nr_switches;
546 546
547 struct cfs_rq cfs; 547 struct cfs_rq cfs;
548 struct rt_rq rt; 548 struct rt_rq rt;
549 549
550 #ifdef CONFIG_FAIR_GROUP_SCHED 550 #ifdef CONFIG_FAIR_GROUP_SCHED
551 /* list of leaf cfs_rq on this cpu: */ 551 /* list of leaf cfs_rq on this cpu: */
552 struct list_head leaf_cfs_rq_list; 552 struct list_head leaf_cfs_rq_list;
553 #endif 553 #endif
554 #ifdef CONFIG_RT_GROUP_SCHED 554 #ifdef CONFIG_RT_GROUP_SCHED
555 struct list_head leaf_rt_rq_list; 555 struct list_head leaf_rt_rq_list;
556 #endif 556 #endif
557 557
558 /* 558 /*
559 * This is part of a global counter where only the total sum 559 * This is part of a global counter where only the total sum
560 * over all CPUs matters. A task can increase this counter on 560 * over all CPUs matters. A task can increase this counter on
561 * one CPU and if it got migrated afterwards it may decrease 561 * one CPU and if it got migrated afterwards it may decrease
562 * it on another CPU. Always updated under the runqueue lock: 562 * it on another CPU. Always updated under the runqueue lock:
563 */ 563 */
564 unsigned long nr_uninterruptible; 564 unsigned long nr_uninterruptible;
565 565
566 struct task_struct *curr, *idle; 566 struct task_struct *curr, *idle;
567 unsigned long next_balance; 567 unsigned long next_balance;
568 struct mm_struct *prev_mm; 568 struct mm_struct *prev_mm;
569 569
570 u64 clock; 570 u64 clock;
571 571
572 atomic_t nr_iowait; 572 atomic_t nr_iowait;
573 573
574 #ifdef CONFIG_SMP 574 #ifdef CONFIG_SMP
575 struct root_domain *rd; 575 struct root_domain *rd;
576 struct sched_domain *sd; 576 struct sched_domain *sd;
577 577
578 unsigned char idle_at_tick; 578 unsigned char idle_at_tick;
579 /* For active balancing */ 579 /* For active balancing */
580 int post_schedule; 580 int post_schedule;
581 int active_balance; 581 int active_balance;
582 int push_cpu; 582 int push_cpu;
583 /* cpu of this runqueue: */ 583 /* cpu of this runqueue: */
584 int cpu; 584 int cpu;
585 int online; 585 int online;
586 586
587 unsigned long avg_load_per_task; 587 unsigned long avg_load_per_task;
588 588
589 struct task_struct *migration_thread; 589 struct task_struct *migration_thread;
590 struct list_head migration_queue; 590 struct list_head migration_queue;
591 591
592 u64 rt_avg; 592 u64 rt_avg;
593 u64 age_stamp; 593 u64 age_stamp;
594 u64 idle_stamp; 594 u64 idle_stamp;
595 u64 avg_idle; 595 u64 avg_idle;
596 #endif 596 #endif
597 597
598 /* calc_load related fields */ 598 /* calc_load related fields */
599 unsigned long calc_load_update; 599 unsigned long calc_load_update;
600 long calc_load_active; 600 long calc_load_active;
601 601
602 #ifdef CONFIG_SCHED_HRTICK 602 #ifdef CONFIG_SCHED_HRTICK
603 #ifdef CONFIG_SMP 603 #ifdef CONFIG_SMP
604 int hrtick_csd_pending; 604 int hrtick_csd_pending;
605 struct call_single_data hrtick_csd; 605 struct call_single_data hrtick_csd;
606 #endif 606 #endif
607 struct hrtimer hrtick_timer; 607 struct hrtimer hrtick_timer;
608 #endif 608 #endif
609 609
610 #ifdef CONFIG_SCHEDSTATS 610 #ifdef CONFIG_SCHEDSTATS
611 /* latency stats */ 611 /* latency stats */
612 struct sched_info rq_sched_info; 612 struct sched_info rq_sched_info;
613 unsigned long long rq_cpu_time; 613 unsigned long long rq_cpu_time;
614 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ 614 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
615 615
616 /* sys_sched_yield() stats */ 616 /* sys_sched_yield() stats */
617 unsigned int yld_count; 617 unsigned int yld_count;
618 618
619 /* schedule() stats */ 619 /* schedule() stats */
620 unsigned int sched_switch; 620 unsigned int sched_switch;
621 unsigned int sched_count; 621 unsigned int sched_count;
622 unsigned int sched_goidle; 622 unsigned int sched_goidle;
623 623
624 /* try_to_wake_up() stats */ 624 /* try_to_wake_up() stats */
625 unsigned int ttwu_count; 625 unsigned int ttwu_count;
626 unsigned int ttwu_local; 626 unsigned int ttwu_local;
627 627
628 /* BKL stats */ 628 /* BKL stats */
629 unsigned int bkl_count; 629 unsigned int bkl_count;
630 #endif 630 #endif
631 }; 631 };
632 632
633 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 633 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
634 634
635 static inline 635 static inline
636 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 636 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
637 { 637 {
638 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 638 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
639 } 639 }
640 640
641 static inline int cpu_of(struct rq *rq) 641 static inline int cpu_of(struct rq *rq)
642 { 642 {
643 #ifdef CONFIG_SMP 643 #ifdef CONFIG_SMP
644 return rq->cpu; 644 return rq->cpu;
645 #else 645 #else
646 return 0; 646 return 0;
647 #endif 647 #endif
648 } 648 }
649 649
650 /* 650 /*
651 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 651 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
652 * See detach_destroy_domains: synchronize_sched for details. 652 * See detach_destroy_domains: synchronize_sched for details.
653 * 653 *
654 * The domain tree of any CPU may only be accessed from within 654 * The domain tree of any CPU may only be accessed from within
655 * preempt-disabled sections. 655 * preempt-disabled sections.
656 */ 656 */
657 #define for_each_domain(cpu, __sd) \ 657 #define for_each_domain(cpu, __sd) \
658 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 658 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
659 659
660 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 660 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
661 #define this_rq() (&__get_cpu_var(runqueues)) 661 #define this_rq() (&__get_cpu_var(runqueues))
662 #define task_rq(p) cpu_rq(task_cpu(p)) 662 #define task_rq(p) cpu_rq(task_cpu(p))
663 #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 663 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
664 #define raw_rq() (&__raw_get_cpu_var(runqueues)) 664 #define raw_rq() (&__raw_get_cpu_var(runqueues))
665 665
666 inline void update_rq_clock(struct rq *rq) 666 inline void update_rq_clock(struct rq *rq)
667 { 667 {
668 rq->clock = sched_clock_cpu(cpu_of(rq)); 668 rq->clock = sched_clock_cpu(cpu_of(rq));
669 } 669 }
670 670
671 /* 671 /*
672 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 672 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
673 */ 673 */
674 #ifdef CONFIG_SCHED_DEBUG 674 #ifdef CONFIG_SCHED_DEBUG
675 # define const_debug __read_mostly 675 # define const_debug __read_mostly
676 #else 676 #else
677 # define const_debug static const 677 # define const_debug static const
678 #endif 678 #endif
679 679
680 /** 680 /**
681 * runqueue_is_locked 681 * runqueue_is_locked
682 * @cpu: the processor in question. 682 * @cpu: the processor in question.
683 * 683 *
684 * Returns true if the current cpu runqueue is locked. 684 * Returns true if the current cpu runqueue is locked.
685 * This interface allows printk to be called with the runqueue lock 685 * This interface allows printk to be called with the runqueue lock
686 * held and know whether or not it is OK to wake up the klogd. 686 * held and know whether or not it is OK to wake up the klogd.
687 */ 687 */
688 int runqueue_is_locked(int cpu) 688 int runqueue_is_locked(int cpu)
689 { 689 {
690 return spin_is_locked(&cpu_rq(cpu)->lock); 690 return spin_is_locked(&cpu_rq(cpu)->lock);
691 } 691 }
692 692
693 /* 693 /*
694 * Debugging: various feature bits 694 * Debugging: various feature bits
695 */ 695 */
696 696
697 #define SCHED_FEAT(name, enabled) \ 697 #define SCHED_FEAT(name, enabled) \
698 __SCHED_FEAT_##name , 698 __SCHED_FEAT_##name ,
699 699
700 enum { 700 enum {
701 #include "sched_features.h" 701 #include "sched_features.h"
702 }; 702 };
703 703
704 #undef SCHED_FEAT 704 #undef SCHED_FEAT
705 705
706 #define SCHED_FEAT(name, enabled) \ 706 #define SCHED_FEAT(name, enabled) \
707 (1UL << __SCHED_FEAT_##name) * enabled | 707 (1UL << __SCHED_FEAT_##name) * enabled |
708 708
709 const_debug unsigned int sysctl_sched_features = 709 const_debug unsigned int sysctl_sched_features =
710 #include "sched_features.h" 710 #include "sched_features.h"
711 0; 711 0;
712 712
713 #undef SCHED_FEAT 713 #undef SCHED_FEAT
714 714
715 #ifdef CONFIG_SCHED_DEBUG 715 #ifdef CONFIG_SCHED_DEBUG
716 #define SCHED_FEAT(name, enabled) \ 716 #define SCHED_FEAT(name, enabled) \
717 #name , 717 #name ,
718 718
719 static __read_mostly char *sched_feat_names[] = { 719 static __read_mostly char *sched_feat_names[] = {
720 #include "sched_features.h" 720 #include "sched_features.h"
721 NULL 721 NULL
722 }; 722 };
723 723
724 #undef SCHED_FEAT 724 #undef SCHED_FEAT
725 725
726 static int sched_feat_show(struct seq_file *m, void *v) 726 static int sched_feat_show(struct seq_file *m, void *v)
727 { 727 {
728 int i; 728 int i;
729 729
730 for (i = 0; sched_feat_names[i]; i++) { 730 for (i = 0; sched_feat_names[i]; i++) {
731 if (!(sysctl_sched_features & (1UL << i))) 731 if (!(sysctl_sched_features & (1UL << i)))
732 seq_puts(m, "NO_"); 732 seq_puts(m, "NO_");
733 seq_printf(m, "%s ", sched_feat_names[i]); 733 seq_printf(m, "%s ", sched_feat_names[i]);
734 } 734 }
735 seq_puts(m, "\n"); 735 seq_puts(m, "\n");
736 736
737 return 0; 737 return 0;
738 } 738 }
739 739
740 static ssize_t 740 static ssize_t
741 sched_feat_write(struct file *filp, const char __user *ubuf, 741 sched_feat_write(struct file *filp, const char __user *ubuf,
742 size_t cnt, loff_t *ppos) 742 size_t cnt, loff_t *ppos)
743 { 743 {
744 char buf[64]; 744 char buf[64];
745 char *cmp = buf; 745 char *cmp = buf;
746 int neg = 0; 746 int neg = 0;
747 int i; 747 int i;
748 748
749 if (cnt > 63) 749 if (cnt > 63)
750 cnt = 63; 750 cnt = 63;
751 751
752 if (copy_from_user(&buf, ubuf, cnt)) 752 if (copy_from_user(&buf, ubuf, cnt))
753 return -EFAULT; 753 return -EFAULT;
754 754
755 buf[cnt] = 0; 755 buf[cnt] = 0;
756 756
757 if (strncmp(buf, "NO_", 3) == 0) { 757 if (strncmp(buf, "NO_", 3) == 0) {
758 neg = 1; 758 neg = 1;
759 cmp += 3; 759 cmp += 3;
760 } 760 }
761 761
762 for (i = 0; sched_feat_names[i]; i++) { 762 for (i = 0; sched_feat_names[i]; i++) {
763 int len = strlen(sched_feat_names[i]); 763 int len = strlen(sched_feat_names[i]);
764 764
765 if (strncmp(cmp, sched_feat_names[i], len) == 0) { 765 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
766 if (neg) 766 if (neg)
767 sysctl_sched_features &= ~(1UL << i); 767 sysctl_sched_features &= ~(1UL << i);
768 else 768 else
769 sysctl_sched_features |= (1UL << i); 769 sysctl_sched_features |= (1UL << i);
770 break; 770 break;
771 } 771 }
772 } 772 }
773 773
774 if (!sched_feat_names[i]) 774 if (!sched_feat_names[i])
775 return -EINVAL; 775 return -EINVAL;
776 776
777 *ppos += cnt; 777 *ppos += cnt;
778 778
779 return cnt; 779 return cnt;
780 } 780 }
781 781
782 static int sched_feat_open(struct inode *inode, struct file *filp) 782 static int sched_feat_open(struct inode *inode, struct file *filp)
783 { 783 {
784 return single_open(filp, sched_feat_show, NULL); 784 return single_open(filp, sched_feat_show, NULL);
785 } 785 }
786 786
787 static const struct file_operations sched_feat_fops = { 787 static const struct file_operations sched_feat_fops = {
788 .open = sched_feat_open, 788 .open = sched_feat_open,
789 .write = sched_feat_write, 789 .write = sched_feat_write,
790 .read = seq_read, 790 .read = seq_read,
791 .llseek = seq_lseek, 791 .llseek = seq_lseek,
792 .release = single_release, 792 .release = single_release,
793 }; 793 };
794 794
795 static __init int sched_init_debug(void) 795 static __init int sched_init_debug(void)
796 { 796 {
797 debugfs_create_file("sched_features", 0644, NULL, NULL, 797 debugfs_create_file("sched_features", 0644, NULL, NULL,
798 &sched_feat_fops); 798 &sched_feat_fops);
799 799
800 return 0; 800 return 0;
801 } 801 }
802 late_initcall(sched_init_debug); 802 late_initcall(sched_init_debug);
803 803
804 #endif 804 #endif
805 805
806 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 806 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
807 807
808 /* 808 /*
809 * Number of tasks to iterate in a single balance run. 809 * Number of tasks to iterate in a single balance run.
810 * Limited because this is done with IRQs disabled. 810 * Limited because this is done with IRQs disabled.
811 */ 811 */
812 const_debug unsigned int sysctl_sched_nr_migrate = 32; 812 const_debug unsigned int sysctl_sched_nr_migrate = 32;
813 813
814 /* 814 /*
815 * ratelimit for updating the group shares. 815 * ratelimit for updating the group shares.
816 * default: 0.25ms 816 * default: 0.25ms
817 */ 817 */
818 unsigned int sysctl_sched_shares_ratelimit = 250000; 818 unsigned int sysctl_sched_shares_ratelimit = 250000;
819 unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; 819 unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
820 820
821 /* 821 /*
822 * Inject some fuzzyness into changing the per-cpu group shares 822 * Inject some fuzzyness into changing the per-cpu group shares
823 * this avoids remote rq-locks at the expense of fairness. 823 * this avoids remote rq-locks at the expense of fairness.
824 * default: 4 824 * default: 4
825 */ 825 */
826 unsigned int sysctl_sched_shares_thresh = 4; 826 unsigned int sysctl_sched_shares_thresh = 4;
827 827
828 /* 828 /*
829 * period over which we average the RT time consumption, measured 829 * period over which we average the RT time consumption, measured
830 * in ms. 830 * in ms.
831 * 831 *
832 * default: 1s 832 * default: 1s
833 */ 833 */
834 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; 834 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
835 835
836 /* 836 /*
837 * period over which we measure -rt task cpu usage in us. 837 * period over which we measure -rt task cpu usage in us.
838 * default: 1s 838 * default: 1s
839 */ 839 */
840 unsigned int sysctl_sched_rt_period = 1000000; 840 unsigned int sysctl_sched_rt_period = 1000000;
841 841
842 static __read_mostly int scheduler_running; 842 static __read_mostly int scheduler_running;
843 843
844 /* 844 /*
845 * part of the period that we allow rt tasks to run in us. 845 * part of the period that we allow rt tasks to run in us.
846 * default: 0.95s 846 * default: 0.95s
847 */ 847 */
848 int sysctl_sched_rt_runtime = 950000; 848 int sysctl_sched_rt_runtime = 950000;
849 849
850 static inline u64 global_rt_period(void) 850 static inline u64 global_rt_period(void)
851 { 851 {
852 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; 852 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
853 } 853 }
854 854
855 static inline u64 global_rt_runtime(void) 855 static inline u64 global_rt_runtime(void)
856 { 856 {
857 if (sysctl_sched_rt_runtime < 0) 857 if (sysctl_sched_rt_runtime < 0)
858 return RUNTIME_INF; 858 return RUNTIME_INF;
859 859
860 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 860 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
861 } 861 }
862 862
863 #ifndef prepare_arch_switch 863 #ifndef prepare_arch_switch
864 # define prepare_arch_switch(next) do { } while (0) 864 # define prepare_arch_switch(next) do { } while (0)
865 #endif 865 #endif
866 #ifndef finish_arch_switch 866 #ifndef finish_arch_switch
867 # define finish_arch_switch(prev) do { } while (0) 867 # define finish_arch_switch(prev) do { } while (0)
868 #endif 868 #endif
869 869
870 static inline int task_current(struct rq *rq, struct task_struct *p) 870 static inline int task_current(struct rq *rq, struct task_struct *p)
871 { 871 {
872 return rq->curr == p; 872 return rq->curr == p;
873 } 873 }
874 874
875 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 875 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
876 static inline int task_running(struct rq *rq, struct task_struct *p) 876 static inline int task_running(struct rq *rq, struct task_struct *p)
877 { 877 {
878 return task_current(rq, p); 878 return task_current(rq, p);
879 } 879 }
880 880
881 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 881 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
882 { 882 {
883 } 883 }
884 884
885 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 885 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
886 { 886 {
887 #ifdef CONFIG_DEBUG_SPINLOCK 887 #ifdef CONFIG_DEBUG_SPINLOCK
888 /* this is a valid case when another task releases the spinlock */ 888 /* this is a valid case when another task releases the spinlock */
889 rq->lock.owner = current; 889 rq->lock.owner = current;
890 #endif 890 #endif
891 /* 891 /*
892 * If we are tracking spinlock dependencies then we have to 892 * If we are tracking spinlock dependencies then we have to
893 * fix up the runqueue lock - which gets 'carried over' from 893 * fix up the runqueue lock - which gets 'carried over' from
894 * prev into current: 894 * prev into current:
895 */ 895 */
896 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 896 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
897 897
898 spin_unlock_irq(&rq->lock); 898 spin_unlock_irq(&rq->lock);
899 } 899 }
900 900
901 #else /* __ARCH_WANT_UNLOCKED_CTXSW */ 901 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
902 static inline int task_running(struct rq *rq, struct task_struct *p) 902 static inline int task_running(struct rq *rq, struct task_struct *p)
903 { 903 {
904 #ifdef CONFIG_SMP 904 #ifdef CONFIG_SMP
905 return p->oncpu; 905 return p->oncpu;
906 #else 906 #else
907 return task_current(rq, p); 907 return task_current(rq, p);
908 #endif 908 #endif
909 } 909 }
910 910
911 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 911 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
912 { 912 {
913 #ifdef CONFIG_SMP 913 #ifdef CONFIG_SMP
914 /* 914 /*
915 * We can optimise this out completely for !SMP, because the 915 * We can optimise this out completely for !SMP, because the
916 * SMP rebalancing from interrupt is the only thing that cares 916 * SMP rebalancing from interrupt is the only thing that cares
917 * here. 917 * here.
918 */ 918 */
919 next->oncpu = 1; 919 next->oncpu = 1;
920 #endif 920 #endif
921 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 921 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
922 spin_unlock_irq(&rq->lock); 922 spin_unlock_irq(&rq->lock);
923 #else 923 #else
924 spin_unlock(&rq->lock); 924 spin_unlock(&rq->lock);
925 #endif 925 #endif
926 } 926 }
927 927
928 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 928 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
929 { 929 {
930 #ifdef CONFIG_SMP 930 #ifdef CONFIG_SMP
931 /* 931 /*
932 * After ->oncpu is cleared, the task can be moved to a different CPU. 932 * After ->oncpu is cleared, the task can be moved to a different CPU.
933 * We must ensure this doesn't happen until the switch is completely 933 * We must ensure this doesn't happen until the switch is completely
934 * finished. 934 * finished.
935 */ 935 */
936 smp_wmb(); 936 smp_wmb();
937 prev->oncpu = 0; 937 prev->oncpu = 0;
938 #endif 938 #endif
939 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 939 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
940 local_irq_enable(); 940 local_irq_enable();
941 #endif 941 #endif
942 } 942 }
943 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 943 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
944 944
945 /* 945 /*
946 * __task_rq_lock - lock the runqueue a given task resides on. 946 * __task_rq_lock - lock the runqueue a given task resides on.
947 * Must be called interrupts disabled. 947 * Must be called interrupts disabled.
948 */ 948 */
949 static inline struct rq *__task_rq_lock(struct task_struct *p) 949 static inline struct rq *__task_rq_lock(struct task_struct *p)
950 __acquires(rq->lock) 950 __acquires(rq->lock)
951 { 951 {
952 for (;;) { 952 for (;;) {
953 struct rq *rq = task_rq(p); 953 struct rq *rq = task_rq(p);
954 spin_lock(&rq->lock); 954 spin_lock(&rq->lock);
955 if (likely(rq == task_rq(p))) 955 if (likely(rq == task_rq(p)))
956 return rq; 956 return rq;
957 spin_unlock(&rq->lock); 957 spin_unlock(&rq->lock);
958 } 958 }
959 } 959 }
960 960
961 /* 961 /*
962 * task_rq_lock - lock the runqueue a given task resides on and disable 962 * task_rq_lock - lock the runqueue a given task resides on and disable
963 * interrupts. Note the ordering: we can safely lookup the task_rq without 963 * interrupts. Note the ordering: we can safely lookup the task_rq without
964 * explicitly disabling preemption. 964 * explicitly disabling preemption.
965 */ 965 */
966 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 966 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
967 __acquires(rq->lock) 967 __acquires(rq->lock)
968 { 968 {
969 struct rq *rq; 969 struct rq *rq;
970 970
971 for (;;) { 971 for (;;) {
972 local_irq_save(*flags); 972 local_irq_save(*flags);
973 rq = task_rq(p); 973 rq = task_rq(p);
974 spin_lock(&rq->lock); 974 spin_lock(&rq->lock);
975 if (likely(rq == task_rq(p))) 975 if (likely(rq == task_rq(p)))
976 return rq; 976 return rq;
977 spin_unlock_irqrestore(&rq->lock, *flags); 977 spin_unlock_irqrestore(&rq->lock, *flags);
978 } 978 }
979 } 979 }
980 980
981 void task_rq_unlock_wait(struct task_struct *p) 981 void task_rq_unlock_wait(struct task_struct *p)
982 { 982 {
983 struct rq *rq = task_rq(p); 983 struct rq *rq = task_rq(p);
984 984
985 smp_mb(); /* spin-unlock-wait is not a full memory barrier */ 985 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
986 spin_unlock_wait(&rq->lock); 986 spin_unlock_wait(&rq->lock);
987 } 987 }
988 988
989 static void __task_rq_unlock(struct rq *rq) 989 static void __task_rq_unlock(struct rq *rq)
990 __releases(rq->lock) 990 __releases(rq->lock)
991 { 991 {
992 spin_unlock(&rq->lock); 992 spin_unlock(&rq->lock);
993 } 993 }
994 994
995 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 995 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
996 __releases(rq->lock) 996 __releases(rq->lock)
997 { 997 {
998 spin_unlock_irqrestore(&rq->lock, *flags); 998 spin_unlock_irqrestore(&rq->lock, *flags);
999 } 999 }
1000 1000
1001 /* 1001 /*
1002 * this_rq_lock - lock this runqueue and disable interrupts. 1002 * this_rq_lock - lock this runqueue and disable interrupts.
1003 */ 1003 */
1004 static struct rq *this_rq_lock(void) 1004 static struct rq *this_rq_lock(void)
1005 __acquires(rq->lock) 1005 __acquires(rq->lock)
1006 { 1006 {
1007 struct rq *rq; 1007 struct rq *rq;
1008 1008
1009 local_irq_disable(); 1009 local_irq_disable();
1010 rq = this_rq(); 1010 rq = this_rq();
1011 spin_lock(&rq->lock); 1011 spin_lock(&rq->lock);
1012 1012
1013 return rq; 1013 return rq;
1014 } 1014 }
1015 1015
1016 #ifdef CONFIG_SCHED_HRTICK 1016 #ifdef CONFIG_SCHED_HRTICK
1017 /* 1017 /*
1018 * Use HR-timers to deliver accurate preemption points. 1018 * Use HR-timers to deliver accurate preemption points.
1019 * 1019 *
1020 * Its all a bit involved since we cannot program an hrt while holding the 1020 * Its all a bit involved since we cannot program an hrt while holding the
1021 * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a 1021 * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
1022 * reschedule event. 1022 * reschedule event.
1023 * 1023 *
1024 * When we get rescheduled we reprogram the hrtick_timer outside of the 1024 * When we get rescheduled we reprogram the hrtick_timer outside of the
1025 * rq->lock. 1025 * rq->lock.
1026 */ 1026 */
1027 1027
1028 /* 1028 /*
1029 * Use hrtick when: 1029 * Use hrtick when:
1030 * - enabled by features 1030 * - enabled by features
1031 * - hrtimer is actually high res 1031 * - hrtimer is actually high res
1032 */ 1032 */
1033 static inline int hrtick_enabled(struct rq *rq) 1033 static inline int hrtick_enabled(struct rq *rq)
1034 { 1034 {
1035 if (!sched_feat(HRTICK)) 1035 if (!sched_feat(HRTICK))
1036 return 0; 1036 return 0;
1037 if (!cpu_active(cpu_of(rq))) 1037 if (!cpu_active(cpu_of(rq)))
1038 return 0; 1038 return 0;
1039 return hrtimer_is_hres_active(&rq->hrtick_timer); 1039 return hrtimer_is_hres_active(&rq->hrtick_timer);
1040 } 1040 }
1041 1041
1042 static void hrtick_clear(struct rq *rq) 1042 static void hrtick_clear(struct rq *rq)
1043 { 1043 {
1044 if (hrtimer_active(&rq->hrtick_timer)) 1044 if (hrtimer_active(&rq->hrtick_timer))
1045 hrtimer_cancel(&rq->hrtick_timer); 1045 hrtimer_cancel(&rq->hrtick_timer);
1046 } 1046 }
1047 1047
1048 /* 1048 /*
1049 * High-resolution timer tick. 1049 * High-resolution timer tick.
1050 * Runs from hardirq context with interrupts disabled. 1050 * Runs from hardirq context with interrupts disabled.
1051 */ 1051 */
1052 static enum hrtimer_restart hrtick(struct hrtimer *timer) 1052 static enum hrtimer_restart hrtick(struct hrtimer *timer)
1053 { 1053 {
1054 struct rq *rq = container_of(timer, struct rq, hrtick_timer); 1054 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1055 1055
1056 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1056 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1057 1057
1058 spin_lock(&rq->lock); 1058 spin_lock(&rq->lock);
1059 update_rq_clock(rq); 1059 update_rq_clock(rq);
1060 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1060 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1061 spin_unlock(&rq->lock); 1061 spin_unlock(&rq->lock);
1062 1062
1063 return HRTIMER_NORESTART; 1063 return HRTIMER_NORESTART;
1064 } 1064 }
1065 1065
1066 #ifdef CONFIG_SMP 1066 #ifdef CONFIG_SMP
1067 /* 1067 /*
1068 * called from hardirq (IPI) context 1068 * called from hardirq (IPI) context
1069 */ 1069 */
1070 static void __hrtick_start(void *arg) 1070 static void __hrtick_start(void *arg)
1071 { 1071 {
1072 struct rq *rq = arg; 1072 struct rq *rq = arg;
1073 1073
1074 spin_lock(&rq->lock); 1074 spin_lock(&rq->lock);
1075 hrtimer_restart(&rq->hrtick_timer); 1075 hrtimer_restart(&rq->hrtick_timer);
1076 rq->hrtick_csd_pending = 0; 1076 rq->hrtick_csd_pending = 0;
1077 spin_unlock(&rq->lock); 1077 spin_unlock(&rq->lock);
1078 } 1078 }
1079 1079
1080 /* 1080 /*
1081 * Called to set the hrtick timer state. 1081 * Called to set the hrtick timer state.
1082 * 1082 *
1083 * called with rq->lock held and irqs disabled 1083 * called with rq->lock held and irqs disabled
1084 */ 1084 */
1085 static void hrtick_start(struct rq *rq, u64 delay) 1085 static void hrtick_start(struct rq *rq, u64 delay)
1086 { 1086 {
1087 struct hrtimer *timer = &rq->hrtick_timer; 1087 struct hrtimer *timer = &rq->hrtick_timer;
1088 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 1088 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1089 1089
1090 hrtimer_set_expires(timer, time); 1090 hrtimer_set_expires(timer, time);
1091 1091
1092 if (rq == this_rq()) { 1092 if (rq == this_rq()) {
1093 hrtimer_restart(timer); 1093 hrtimer_restart(timer);
1094 } else if (!rq->hrtick_csd_pending) { 1094 } else if (!rq->hrtick_csd_pending) {
1095 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); 1095 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1096 rq->hrtick_csd_pending = 1; 1096 rq->hrtick_csd_pending = 1;
1097 } 1097 }
1098 } 1098 }
1099 1099
1100 static int 1100 static int
1101 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) 1101 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1102 { 1102 {
1103 int cpu = (int)(long)hcpu; 1103 int cpu = (int)(long)hcpu;
1104 1104
1105 switch (action) { 1105 switch (action) {
1106 case CPU_UP_CANCELED: 1106 case CPU_UP_CANCELED:
1107 case CPU_UP_CANCELED_FROZEN: 1107 case CPU_UP_CANCELED_FROZEN:
1108 case CPU_DOWN_PREPARE: 1108 case CPU_DOWN_PREPARE:
1109 case CPU_DOWN_PREPARE_FROZEN: 1109 case CPU_DOWN_PREPARE_FROZEN:
1110 case CPU_DEAD: 1110 case CPU_DEAD:
1111 case CPU_DEAD_FROZEN: 1111 case CPU_DEAD_FROZEN:
1112 hrtick_clear(cpu_rq(cpu)); 1112 hrtick_clear(cpu_rq(cpu));
1113 return NOTIFY_OK; 1113 return NOTIFY_OK;
1114 } 1114 }
1115 1115
1116 return NOTIFY_DONE; 1116 return NOTIFY_DONE;
1117 } 1117 }
1118 1118
1119 static __init void init_hrtick(void) 1119 static __init void init_hrtick(void)
1120 { 1120 {
1121 hotcpu_notifier(hotplug_hrtick, 0); 1121 hotcpu_notifier(hotplug_hrtick, 0);
1122 } 1122 }
1123 #else 1123 #else
1124 /* 1124 /*
1125 * Called to set the hrtick timer state. 1125 * Called to set the hrtick timer state.
1126 * 1126 *
1127 * called with rq->lock held and irqs disabled 1127 * called with rq->lock held and irqs disabled
1128 */ 1128 */
1129 static void hrtick_start(struct rq *rq, u64 delay) 1129 static void hrtick_start(struct rq *rq, u64 delay)
1130 { 1130 {
1131 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 1131 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1132 HRTIMER_MODE_REL_PINNED, 0); 1132 HRTIMER_MODE_REL_PINNED, 0);
1133 } 1133 }
1134 1134
1135 static inline void init_hrtick(void) 1135 static inline void init_hrtick(void)
1136 { 1136 {
1137 } 1137 }
1138 #endif /* CONFIG_SMP */ 1138 #endif /* CONFIG_SMP */
1139 1139
1140 static void init_rq_hrtick(struct rq *rq) 1140 static void init_rq_hrtick(struct rq *rq)
1141 { 1141 {
1142 #ifdef CONFIG_SMP 1142 #ifdef CONFIG_SMP
1143 rq->hrtick_csd_pending = 0; 1143 rq->hrtick_csd_pending = 0;
1144 1144
1145 rq->hrtick_csd.flags = 0; 1145 rq->hrtick_csd.flags = 0;
1146 rq->hrtick_csd.func = __hrtick_start; 1146 rq->hrtick_csd.func = __hrtick_start;
1147 rq->hrtick_csd.info = rq; 1147 rq->hrtick_csd.info = rq;
1148 #endif 1148 #endif
1149 1149
1150 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1150 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1151 rq->hrtick_timer.function = hrtick; 1151 rq->hrtick_timer.function = hrtick;
1152 } 1152 }
1153 #else /* CONFIG_SCHED_HRTICK */ 1153 #else /* CONFIG_SCHED_HRTICK */
1154 static inline void hrtick_clear(struct rq *rq) 1154 static inline void hrtick_clear(struct rq *rq)
1155 { 1155 {
1156 } 1156 }
1157 1157
1158 static inline void init_rq_hrtick(struct rq *rq) 1158 static inline void init_rq_hrtick(struct rq *rq)
1159 { 1159 {
1160 } 1160 }
1161 1161
1162 static inline void init_hrtick(void) 1162 static inline void init_hrtick(void)
1163 { 1163 {
1164 } 1164 }
1165 #endif /* CONFIG_SCHED_HRTICK */ 1165 #endif /* CONFIG_SCHED_HRTICK */
1166 1166
1167 /* 1167 /*
1168 * resched_task - mark a task 'to be rescheduled now'. 1168 * resched_task - mark a task 'to be rescheduled now'.
1169 * 1169 *
1170 * On UP this means the setting of the need_resched flag, on SMP it 1170 * On UP this means the setting of the need_resched flag, on SMP it
1171 * might also involve a cross-CPU call to trigger the scheduler on 1171 * might also involve a cross-CPU call to trigger the scheduler on
1172 * the target CPU. 1172 * the target CPU.
1173 */ 1173 */
1174 #ifdef CONFIG_SMP 1174 #ifdef CONFIG_SMP
1175 1175
1176 #ifndef tsk_is_polling 1176 #ifndef tsk_is_polling
1177 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1177 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1178 #endif 1178 #endif
1179 1179
1180 static void resched_task(struct task_struct *p) 1180 static void resched_task(struct task_struct *p)
1181 { 1181 {
1182 int cpu; 1182 int cpu;
1183 1183
1184 assert_spin_locked(&task_rq(p)->lock); 1184 assert_spin_locked(&task_rq(p)->lock);
1185 1185
1186 if (test_tsk_need_resched(p)) 1186 if (test_tsk_need_resched(p))
1187 return; 1187 return;
1188 1188
1189 set_tsk_need_resched(p); 1189 set_tsk_need_resched(p);
1190 1190
1191 cpu = task_cpu(p); 1191 cpu = task_cpu(p);
1192 if (cpu == smp_processor_id()) 1192 if (cpu == smp_processor_id())
1193 return; 1193 return;
1194 1194
1195 /* NEED_RESCHED must be visible before we test polling */ 1195 /* NEED_RESCHED must be visible before we test polling */
1196 smp_mb(); 1196 smp_mb();
1197 if (!tsk_is_polling(p)) 1197 if (!tsk_is_polling(p))
1198 smp_send_reschedule(cpu); 1198 smp_send_reschedule(cpu);
1199 } 1199 }
1200 1200
1201 static void resched_cpu(int cpu) 1201 static void resched_cpu(int cpu)
1202 { 1202 {
1203 struct rq *rq = cpu_rq(cpu); 1203 struct rq *rq = cpu_rq(cpu);
1204 unsigned long flags; 1204 unsigned long flags;
1205 1205
1206 if (!spin_trylock_irqsave(&rq->lock, flags)) 1206 if (!spin_trylock_irqsave(&rq->lock, flags))
1207 return; 1207 return;
1208 resched_task(cpu_curr(cpu)); 1208 resched_task(cpu_curr(cpu));
1209 spin_unlock_irqrestore(&rq->lock, flags); 1209 spin_unlock_irqrestore(&rq->lock, flags);
1210 } 1210 }
1211 1211
1212 #ifdef CONFIG_NO_HZ 1212 #ifdef CONFIG_NO_HZ
1213 /* 1213 /*
1214 * When add_timer_on() enqueues a timer into the timer wheel of an 1214 * When add_timer_on() enqueues a timer into the timer wheel of an
1215 * idle CPU then this timer might expire before the next timer event 1215 * idle CPU then this timer might expire before the next timer event
1216 * which is scheduled to wake up that CPU. In case of a completely 1216 * which is scheduled to wake up that CPU. In case of a completely
1217 * idle system the next event might even be infinite time into the 1217 * idle system the next event might even be infinite time into the
1218 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 1218 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1219 * leaves the inner idle loop so the newly added timer is taken into 1219 * leaves the inner idle loop so the newly added timer is taken into
1220 * account when the CPU goes back to idle and evaluates the timer 1220 * account when the CPU goes back to idle and evaluates the timer
1221 * wheel for the next timer event. 1221 * wheel for the next timer event.
1222 */ 1222 */
1223 void wake_up_idle_cpu(int cpu) 1223 void wake_up_idle_cpu(int cpu)
1224 { 1224 {
1225 struct rq *rq = cpu_rq(cpu); 1225 struct rq *rq = cpu_rq(cpu);
1226 1226
1227 if (cpu == smp_processor_id()) 1227 if (cpu == smp_processor_id())
1228 return; 1228 return;
1229 1229
1230 /* 1230 /*
1231 * This is safe, as this function is called with the timer 1231 * This is safe, as this function is called with the timer
1232 * wheel base lock of (cpu) held. When the CPU is on the way 1232 * wheel base lock of (cpu) held. When the CPU is on the way
1233 * to idle and has not yet set rq->curr to idle then it will 1233 * to idle and has not yet set rq->curr to idle then it will
1234 * be serialized on the timer wheel base lock and take the new 1234 * be serialized on the timer wheel base lock and take the new
1235 * timer into account automatically. 1235 * timer into account automatically.
1236 */ 1236 */
1237 if (rq->curr != rq->idle) 1237 if (rq->curr != rq->idle)
1238 return; 1238 return;
1239 1239
1240 /* 1240 /*
1241 * We can set TIF_RESCHED on the idle task of the other CPU 1241 * We can set TIF_RESCHED on the idle task of the other CPU
1242 * lockless. The worst case is that the other CPU runs the 1242 * lockless. The worst case is that the other CPU runs the
1243 * idle task through an additional NOOP schedule() 1243 * idle task through an additional NOOP schedule()
1244 */ 1244 */
1245 set_tsk_need_resched(rq->idle); 1245 set_tsk_need_resched(rq->idle);
1246 1246
1247 /* NEED_RESCHED must be visible before we test polling */ 1247 /* NEED_RESCHED must be visible before we test polling */
1248 smp_mb(); 1248 smp_mb();
1249 if (!tsk_is_polling(rq->idle)) 1249 if (!tsk_is_polling(rq->idle))
1250 smp_send_reschedule(cpu); 1250 smp_send_reschedule(cpu);
1251 } 1251 }
1252 #endif /* CONFIG_NO_HZ */ 1252 #endif /* CONFIG_NO_HZ */
1253 1253
1254 static u64 sched_avg_period(void) 1254 static u64 sched_avg_period(void)
1255 { 1255 {
1256 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 1256 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1257 } 1257 }
1258 1258
1259 static void sched_avg_update(struct rq *rq) 1259 static void sched_avg_update(struct rq *rq)
1260 { 1260 {
1261 s64 period = sched_avg_period(); 1261 s64 period = sched_avg_period();
1262 1262
1263 while ((s64)(rq->clock - rq->age_stamp) > period) { 1263 while ((s64)(rq->clock - rq->age_stamp) > period) {
1264 rq->age_stamp += period; 1264 rq->age_stamp += period;
1265 rq->rt_avg /= 2; 1265 rq->rt_avg /= 2;
1266 } 1266 }
1267 } 1267 }
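
Editor's illustration (not part of sched.c): sched_avg_update() is a cheap geometric decay of rq->rt_avg. If, say, sysctl_sched_time_avg were 1000 ms, sched_avg_period() would be 500 ms, and an rq that accumulates no further RT runtime would see its rt_avg halved every 500 ms, so after 2 seconds only 1/16 of the old value remains.
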
1268 1268
1269 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1269 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1270 { 1270 {
1271 rq->rt_avg += rt_delta; 1271 rq->rt_avg += rt_delta;
1272 sched_avg_update(rq); 1272 sched_avg_update(rq);
1273 } 1273 }
1274 1274
1275 #else /* !CONFIG_SMP */ 1275 #else /* !CONFIG_SMP */
1276 static void resched_task(struct task_struct *p) 1276 static void resched_task(struct task_struct *p)
1277 { 1277 {
1278 assert_spin_locked(&task_rq(p)->lock); 1278 assert_spin_locked(&task_rq(p)->lock);
1279 set_tsk_need_resched(p); 1279 set_tsk_need_resched(p);
1280 } 1280 }
1281 1281
1282 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1282 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1283 { 1283 {
1284 } 1284 }
1285 #endif /* CONFIG_SMP */ 1285 #endif /* CONFIG_SMP */
1286 1286
1287 #if BITS_PER_LONG == 32 1287 #if BITS_PER_LONG == 32
1288 # define WMULT_CONST (~0UL) 1288 # define WMULT_CONST (~0UL)
1289 #else 1289 #else
1290 # define WMULT_CONST (1UL << 32) 1290 # define WMULT_CONST (1UL << 32)
1291 #endif 1291 #endif
1292 1292
1293 #define WMULT_SHIFT 32 1293 #define WMULT_SHIFT 32
1294 1294
1295 /* 1295 /*
1296 * Shift right and round: 1296 * Shift right and round:
1297 */ 1297 */
1298 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1298 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
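
Editor's note: SRR() is a right shift that rounds to nearest instead of truncating. For example, SRR(1000, 4) = (1000 + 8) >> 4 = 63, whereas a plain 1000 >> 4 would truncate 62.5 down to 62.
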
1299 1299
1300 /* 1300 /*
1301 * delta *= weight / lw 1301 * delta *= weight / lw
1302 */ 1302 */
1303 static unsigned long 1303 static unsigned long
1304 calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1304 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1305 struct load_weight *lw) 1305 struct load_weight *lw)
1306 { 1306 {
1307 u64 tmp; 1307 u64 tmp;
1308 1308
1309 if (!lw->inv_weight) { 1309 if (!lw->inv_weight) {
1310 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1310 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1311 lw->inv_weight = 1; 1311 lw->inv_weight = 1;
1312 else 1312 else
1313 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1313 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1314 / (lw->weight+1); 1314 / (lw->weight+1);
1315 } 1315 }
1316 1316
1317 tmp = (u64)delta_exec * weight; 1317 tmp = (u64)delta_exec * weight;
1318 /* 1318 /*
1319 * Check whether we'd overflow the 64-bit multiplication: 1319 * Check whether we'd overflow the 64-bit multiplication:
1320 */ 1320 */
1321 if (unlikely(tmp > WMULT_CONST)) 1321 if (unlikely(tmp > WMULT_CONST))
1322 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 1322 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1323 WMULT_SHIFT/2); 1323 WMULT_SHIFT/2);
1324 else 1324 else
1325 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); 1325 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1326 1326
1327 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1327 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1328 } 1328 }
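
As a stand-alone illustration of the multiply-by-inverse trick used above (editor's sketch; the delta and queue load values are invented, and the kernel's SRR() rounding is omitted for brevity):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t delta = 3000000;		/* invented runtime delta, in ns */
	uint64_t weight = 1024;			/* weight of a nice-0 task */
	uint64_t lw_weight = 1024 + 820;	/* invented queue load: nice 0 + nice 1 */

	/* inv_weight as calc_delta_mine() computes it: roughly 2^32 / lw_weight */
	uint64_t inv = 1 + ((1ULL << 32) - lw_weight / 2) / (lw_weight + 1);

	uint64_t approx = (delta * weight * inv) >> 32;	/* multiply and shift */
	uint64_t exact  = (delta * weight) / lw_weight;	/* the division it replaces */

	/* the two agree to within a fraction of a percent */
	printf("approx=%llu exact=%llu\n",
	       (unsigned long long)approx, (unsigned long long)exact);
	return 0;
}
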
1329 1329
1330 static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1330 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1331 { 1331 {
1332 lw->weight += inc; 1332 lw->weight += inc;
1333 lw->inv_weight = 0; 1333 lw->inv_weight = 0;
1334 } 1334 }
1335 1335
1336 static inline void update_load_sub(struct load_weight *lw, unsigned long dec) 1336 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1337 { 1337 {
1338 lw->weight -= dec; 1338 lw->weight -= dec;
1339 lw->inv_weight = 0; 1339 lw->inv_weight = 0;
1340 } 1340 }
1341 1341
1342 /* 1342 /*
1343 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1343 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1344 * of tasks with abnormal "nice" values across CPUs the contribution that 1344 * of tasks with abnormal "nice" values across CPUs the contribution that
1345 * each task makes to its run queue's load is weighted according to its 1345 * each task makes to its run queue's load is weighted according to its
1346 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a 1346 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1347 * scaled version of the new time slice allocation that they receive on time 1347 * scaled version of the new time slice allocation that they receive on time
1348 * slice expiry etc. 1348 * slice expiry etc.
1349 */ 1349 */
1350 1350
1351 #define WEIGHT_IDLEPRIO 3 1351 #define WEIGHT_IDLEPRIO 3
1352 #define WMULT_IDLEPRIO 1431655765 1352 #define WMULT_IDLEPRIO 1431655765
1353 1353
1354 /* 1354 /*
1355 * Nice levels are multiplicative, with a gentle 10% change for every 1355 * Nice levels are multiplicative, with a gentle 10% change for every
1356 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to 1356 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1357 * nice 1, it will get ~10% less CPU time than another CPU-bound task 1357 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1358 * that remained on nice 0. 1358 * that remained on nice 0.
1359 * 1359 *
1360 * The "10% effect" is relative and cumulative: from _any_ nice level, 1360 * The "10% effect" is relative and cumulative: from _any_ nice level,
1361 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 1361 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1362 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. 1362 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1363 * If a task goes up by ~10% and another task goes down by ~10% then 1363 * If a task goes up by ~10% and another task goes down by ~10% then
1364 * the relative distance between them is ~25%.) 1364 * the relative distance between them is ~25%.)
1365 */ 1365 */
1366 static const int prio_to_weight[40] = { 1366 static const int prio_to_weight[40] = {
1367 /* -20 */ 88761, 71755, 56483, 46273, 36291, 1367 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1368 /* -15 */ 29154, 23254, 18705, 14949, 11916, 1368 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1369 /* -10 */ 9548, 7620, 6100, 4904, 3906, 1369 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1370 /* -5 */ 3121, 2501, 1991, 1586, 1277, 1370 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1371 /* 0 */ 1024, 820, 655, 526, 423, 1371 /* 0 */ 1024, 820, 655, 526, 423,
1372 /* 5 */ 335, 272, 215, 172, 137, 1372 /* 5 */ 335, 272, 215, 172, 137,
1373 /* 10 */ 110, 87, 70, 56, 45, 1373 /* 10 */ 110, 87, 70, 56, 45,
1374 /* 15 */ 36, 29, 23, 18, 15, 1374 /* 15 */ 36, 29, 23, 18, 15,
1375 }; 1375 };
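
A quick user-space check of the "10% effect" described above, using the nice 0 and nice 1 entries of prio_to_weight[] (editor's illustration, not part of sched.c):

#include <stdio.h>

int main(void)
{
	/* two CPU-bound tasks sharing one CPU: nice 0 vs nice 1 */
	double w0 = 1024.0;	/* prio_to_weight[20], nice 0 */
	double w1 = 820.0;	/* prio_to_weight[21], nice 1 */

	/* ~55.5% vs ~44.5% of the CPU: roughly the promised 10% shift */
	printf("nice 0 gets %.1f%%, nice 1 gets %.1f%%\n",
	       100.0 * w0 / (w0 + w1), 100.0 * w1 / (w0 + w1));

	/* and the relative distance between the two is ~25% (the 1.25 multiplier) */
	printf("weight ratio: %.2f\n", w0 / w1);
	return 0;
}
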
1376 1376
1377 /* 1377 /*
1378 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 1378 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1379 * 1379 *
1380 * In cases where the weight does not change often, we can use the 1380 * In cases where the weight does not change often, we can use the
1381 * precalculated inverse to speed up arithmetic by turning divisions 1381 * precalculated inverse to speed up arithmetic by turning divisions
1382 * into multiplications: 1382 * into multiplications:
1383 */ 1383 */
1384 static const u32 prio_to_wmult[40] = { 1384 static const u32 prio_to_wmult[40] = {
1385 /* -20 */ 48388, 59856, 76040, 92818, 118348, 1385 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1386 /* -15 */ 147320, 184698, 229616, 287308, 360437, 1386 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1387 /* -10 */ 449829, 563644, 704093, 875809, 1099582, 1387 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1388 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, 1388 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1389 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, 1389 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1390 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, 1390 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1391 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 1391 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1392 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1392 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1393 }; 1393 };
1394 1394
1395 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); 1395 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1396 1396
1397 /* 1397 /*
1398 * runqueue iterator, to support SMP load-balancing between different 1398 * runqueue iterator, to support SMP load-balancing between different
1399 * scheduling classes, without having to expose their internal data 1399 * scheduling classes, without having to expose their internal data
1400 * structures to the load-balancing proper: 1400 * structures to the load-balancing proper:
1401 */ 1401 */
1402 struct rq_iterator { 1402 struct rq_iterator {
1403 void *arg; 1403 void *arg;
1404 struct task_struct *(*start)(void *); 1404 struct task_struct *(*start)(void *);
1405 struct task_struct *(*next)(void *); 1405 struct task_struct *(*next)(void *);
1406 }; 1406 };
1407 1407
1408 #ifdef CONFIG_SMP 1408 #ifdef CONFIG_SMP
1409 static unsigned long 1409 static unsigned long
1410 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 1410 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1411 unsigned long max_load_move, struct sched_domain *sd, 1411 unsigned long max_load_move, struct sched_domain *sd,
1412 enum cpu_idle_type idle, int *all_pinned, 1412 enum cpu_idle_type idle, int *all_pinned,
1413 int *this_best_prio, struct rq_iterator *iterator); 1413 int *this_best_prio, struct rq_iterator *iterator);
1414 1414
1415 static int 1415 static int
1416 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, 1416 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1417 struct sched_domain *sd, enum cpu_idle_type idle, 1417 struct sched_domain *sd, enum cpu_idle_type idle,
1418 struct rq_iterator *iterator); 1418 struct rq_iterator *iterator);
1419 #endif 1419 #endif
1420 1420
1421 /* Time spent by the tasks of the cpu accounting group executing in ... */ 1421 /* Time spent by the tasks of the cpu accounting group executing in ... */
1422 enum cpuacct_stat_index { 1422 enum cpuacct_stat_index {
1423 CPUACCT_STAT_USER, /* ... user mode */ 1423 CPUACCT_STAT_USER, /* ... user mode */
1424 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 1424 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1425 1425
1426 CPUACCT_STAT_NSTATS, 1426 CPUACCT_STAT_NSTATS,
1427 }; 1427 };
1428 1428
1429 #ifdef CONFIG_CGROUP_CPUACCT 1429 #ifdef CONFIG_CGROUP_CPUACCT
1430 static void cpuacct_charge(struct task_struct *tsk, u64 cputime); 1430 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1431 static void cpuacct_update_stats(struct task_struct *tsk, 1431 static void cpuacct_update_stats(struct task_struct *tsk,
1432 enum cpuacct_stat_index idx, cputime_t val); 1432 enum cpuacct_stat_index idx, cputime_t val);
1433 #else 1433 #else
1434 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1434 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1435 static inline void cpuacct_update_stats(struct task_struct *tsk, 1435 static inline void cpuacct_update_stats(struct task_struct *tsk,
1436 enum cpuacct_stat_index idx, cputime_t val) {} 1436 enum cpuacct_stat_index idx, cputime_t val) {}
1437 #endif 1437 #endif
1438 1438
1439 static inline void inc_cpu_load(struct rq *rq, unsigned long load) 1439 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1440 { 1440 {
1441 update_load_add(&rq->load, load); 1441 update_load_add(&rq->load, load);
1442 } 1442 }
1443 1443
1444 static inline void dec_cpu_load(struct rq *rq, unsigned long load) 1444 static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1445 { 1445 {
1446 update_load_sub(&rq->load, load); 1446 update_load_sub(&rq->load, load);
1447 } 1447 }
1448 1448
1449 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) 1449 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1450 typedef int (*tg_visitor)(struct task_group *, void *); 1450 typedef int (*tg_visitor)(struct task_group *, void *);
1451 1451
1452 /* 1452 /*
1453 * Iterate the full tree, calling @down when first entering a node and @up when 1453 * Iterate the full tree, calling @down when first entering a node and @up when
1454 * leaving it for the final time. 1454 * leaving it for the final time.
1455 */ 1455 */
1456 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 1456 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1457 { 1457 {
1458 struct task_group *parent, *child; 1458 struct task_group *parent, *child;
1459 int ret; 1459 int ret;
1460 1460
1461 rcu_read_lock(); 1461 rcu_read_lock();
1462 parent = &root_task_group; 1462 parent = &root_task_group;
1463 down: 1463 down:
1464 ret = (*down)(parent, data); 1464 ret = (*down)(parent, data);
1465 if (ret) 1465 if (ret)
1466 goto out_unlock; 1466 goto out_unlock;
1467 list_for_each_entry_rcu(child, &parent->children, siblings) { 1467 list_for_each_entry_rcu(child, &parent->children, siblings) {
1468 parent = child; 1468 parent = child;
1469 goto down; 1469 goto down;
1470 1470
1471 up: 1471 up:
1472 continue; 1472 continue;
1473 } 1473 }
1474 ret = (*up)(parent, data); 1474 ret = (*up)(parent, data);
1475 if (ret) 1475 if (ret)
1476 goto out_unlock; 1476 goto out_unlock;
1477 1477
1478 child = parent; 1478 child = parent;
1479 parent = parent->parent; 1479 parent = parent->parent;
1480 if (parent) 1480 if (parent)
1481 goto up; 1481 goto up;
1482 out_unlock: 1482 out_unlock:
1483 rcu_read_unlock(); 1483 rcu_read_unlock();
1484 1484
1485 return ret; 1485 return ret;
1486 } 1486 }
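
For readers tripped up by the goto-based traversal, an equivalent recursive formulation follows (editor's sketch only; the in-tree version stays iterative, presumably to keep kernel stack usage bounded, and the caller still needs rcu_read_lock() around the walk):

static int walk_tg_tree_recursive(struct task_group *tg,
				  tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *child;
	int ret;

	ret = (*down)(tg, data);			/* visit on the way down */
	if (ret)
		return ret;
	list_for_each_entry_rcu(child, &tg->children, siblings) {
		ret = walk_tg_tree_recursive(child, down, up, data);
		if (ret)
			return ret;
	}
	return (*up)(tg, data);				/* visit when leaving for good */
}
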
1487 1487
1488 static int tg_nop(struct task_group *tg, void *data) 1488 static int tg_nop(struct task_group *tg, void *data)
1489 { 1489 {
1490 return 0; 1490 return 0;
1491 } 1491 }
1492 #endif 1492 #endif
1493 1493
1494 #ifdef CONFIG_SMP 1494 #ifdef CONFIG_SMP
1495 /* Used instead of source_load when we know the type == 0 */ 1495 /* Used instead of source_load when we know the type == 0 */
1496 static unsigned long weighted_cpuload(const int cpu) 1496 static unsigned long weighted_cpuload(const int cpu)
1497 { 1497 {
1498 return cpu_rq(cpu)->load.weight; 1498 return cpu_rq(cpu)->load.weight;
1499 } 1499 }
1500 1500
1501 /* 1501 /*
1502 * Return a low guess at the load of a migration-source cpu weighted 1502 * Return a low guess at the load of a migration-source cpu weighted
1503 * according to the scheduling class and "nice" value. 1503 * according to the scheduling class and "nice" value.
1504 * 1504 *
1505 * We want to under-estimate the load of migration sources, to 1505 * We want to under-estimate the load of migration sources, to
1506 * balance conservatively. 1506 * balance conservatively.
1507 */ 1507 */
1508 static unsigned long source_load(int cpu, int type) 1508 static unsigned long source_load(int cpu, int type)
1509 { 1509 {
1510 struct rq *rq = cpu_rq(cpu); 1510 struct rq *rq = cpu_rq(cpu);
1511 unsigned long total = weighted_cpuload(cpu); 1511 unsigned long total = weighted_cpuload(cpu);
1512 1512
1513 if (type == 0 || !sched_feat(LB_BIAS)) 1513 if (type == 0 || !sched_feat(LB_BIAS))
1514 return total; 1514 return total;
1515 1515
1516 return min(rq->cpu_load[type-1], total); 1516 return min(rq->cpu_load[type-1], total);
1517 } 1517 }
1518 1518
1519 /* 1519 /*
1520 * Return a high guess at the load of a migration-target cpu weighted 1520 * Return a high guess at the load of a migration-target cpu weighted
1521 * according to the scheduling class and "nice" value. 1521 * according to the scheduling class and "nice" value.
1522 */ 1522 */
1523 static unsigned long target_load(int cpu, int type) 1523 static unsigned long target_load(int cpu, int type)
1524 { 1524 {
1525 struct rq *rq = cpu_rq(cpu); 1525 struct rq *rq = cpu_rq(cpu);
1526 unsigned long total = weighted_cpuload(cpu); 1526 unsigned long total = weighted_cpuload(cpu);
1527 1527
1528 if (type == 0 || !sched_feat(LB_BIAS)) 1528 if (type == 0 || !sched_feat(LB_BIAS))
1529 return total; 1529 return total;
1530 1530
1531 return max(rq->cpu_load[type-1], total); 1531 return max(rq->cpu_load[type-1], total);
1532 } 1532 }
1533 1533
1534 static struct sched_group *group_of(int cpu) 1534 static struct sched_group *group_of(int cpu)
1535 { 1535 {
1536 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); 1536 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
1537 1537
1538 if (!sd) 1538 if (!sd)
1539 return NULL; 1539 return NULL;
1540 1540
1541 return sd->groups; 1541 return sd->groups;
1542 } 1542 }
1543 1543
1544 static unsigned long power_of(int cpu) 1544 static unsigned long power_of(int cpu)
1545 { 1545 {
1546 struct sched_group *group = group_of(cpu); 1546 struct sched_group *group = group_of(cpu);
1547 1547
1548 if (!group) 1548 if (!group)
1549 return SCHED_LOAD_SCALE; 1549 return SCHED_LOAD_SCALE;
1550 1550
1551 return group->cpu_power; 1551 return group->cpu_power;
1552 } 1552 }
1553 1553
1554 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1554 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1555 1555
1556 static unsigned long cpu_avg_load_per_task(int cpu) 1556 static unsigned long cpu_avg_load_per_task(int cpu)
1557 { 1557 {
1558 struct rq *rq = cpu_rq(cpu); 1558 struct rq *rq = cpu_rq(cpu);
1559 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 1559 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1560 1560
1561 if (nr_running) 1561 if (nr_running)
1562 rq->avg_load_per_task = rq->load.weight / nr_running; 1562 rq->avg_load_per_task = rq->load.weight / nr_running;
1563 else 1563 else
1564 rq->avg_load_per_task = 0; 1564 rq->avg_load_per_task = 0;
1565 1565
1566 return rq->avg_load_per_task; 1566 return rq->avg_load_per_task;
1567 } 1567 }
1568 1568
1569 #ifdef CONFIG_FAIR_GROUP_SCHED 1569 #ifdef CONFIG_FAIR_GROUP_SCHED
1570 1570
1571 static __read_mostly unsigned long *update_shares_data; 1571 static __read_mostly unsigned long *update_shares_data;
1572 1572
1573 static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1573 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1574 1574
1575 /* 1575 /*
1576 * Calculate and set the cpu's group shares. 1576 * Calculate and set the cpu's group shares.
1577 */ 1577 */
1578 static void update_group_shares_cpu(struct task_group *tg, int cpu, 1578 static void update_group_shares_cpu(struct task_group *tg, int cpu,
1579 unsigned long sd_shares, 1579 unsigned long sd_shares,
1580 unsigned long sd_rq_weight, 1580 unsigned long sd_rq_weight,
1581 unsigned long *usd_rq_weight) 1581 unsigned long *usd_rq_weight)
1582 { 1582 {
1583 unsigned long shares, rq_weight; 1583 unsigned long shares, rq_weight;
1584 int boost = 0; 1584 int boost = 0;
1585 1585
1586 rq_weight = usd_rq_weight[cpu]; 1586 rq_weight = usd_rq_weight[cpu];
1587 if (!rq_weight) { 1587 if (!rq_weight) {
1588 boost = 1; 1588 boost = 1;
1589 rq_weight = NICE_0_LOAD; 1589 rq_weight = NICE_0_LOAD;
1590 } 1590 }
1591 1591
1592 /* 1592 /*
1593 * \Sum_j shares_j * rq_weight_i 1593 * \Sum_j shares_j * rq_weight_i
1594 * shares_i = ----------------------------- 1594 * shares_i = -----------------------------
1595 * \Sum_j rq_weight_j 1595 * \Sum_j rq_weight_j
1596 */ 1596 */
1597 shares = (sd_shares * rq_weight) / sd_rq_weight; 1597 shares = (sd_shares * rq_weight) / sd_rq_weight;
1598 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1598 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1599 1599
1600 if (abs(shares - tg->se[cpu]->load.weight) > 1600 if (abs(shares - tg->se[cpu]->load.weight) >
1601 sysctl_sched_shares_thresh) { 1601 sysctl_sched_shares_thresh) {
1602 struct rq *rq = cpu_rq(cpu); 1602 struct rq *rq = cpu_rq(cpu);
1603 unsigned long flags; 1603 unsigned long flags;
1604 1604
1605 spin_lock_irqsave(&rq->lock, flags); 1605 spin_lock_irqsave(&rq->lock, flags);
1606 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; 1606 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1607 tg->cfs_rq[cpu]->shares = boost ? 0 : shares; 1607 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1608 __set_se_shares(tg->se[cpu], shares); 1608 __set_se_shares(tg->se[cpu], shares);
1609 spin_unlock_irqrestore(&rq->lock, flags); 1609 spin_unlock_irqrestore(&rq->lock, flags);
1610 } 1610 }
1611 } 1611 }
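
Plugging numbers into the formula above (editor's illustration, values invented): with sd_shares = 1024 and two CPUs whose rq weights are 3072 and 1024 (so sd_rq_weight = 4096), the group's cpu 0 entity gets 1024 * 3072 / 4096 = 768 shares and cpu 1 gets 1024 * 1024 / 4096 = 256, each result then clamped to [MIN_SHARES, MAX_SHARES].
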
1612 1612
1613 /* 1613 /*
1614 * Re-compute the task group's per-cpu shares over the given domain. 1614 * Re-compute the task group's per-cpu shares over the given domain.
1615 * This needs to be done in a bottom-up fashion because the rq weight of a 1615 * This needs to be done in a bottom-up fashion because the rq weight of a
1616 * parent group depends on the shares of its child groups. 1616 * parent group depends on the shares of its child groups.
1617 */ 1617 */
1618 static int tg_shares_up(struct task_group *tg, void *data) 1618 static int tg_shares_up(struct task_group *tg, void *data)
1619 { 1619 {
1620 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; 1620 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1621 unsigned long *usd_rq_weight; 1621 unsigned long *usd_rq_weight;
1622 struct sched_domain *sd = data; 1622 struct sched_domain *sd = data;
1623 unsigned long flags; 1623 unsigned long flags;
1624 int i; 1624 int i;
1625 1625
1626 if (!tg->se[0]) 1626 if (!tg->se[0])
1627 return 0; 1627 return 0;
1628 1628
1629 local_irq_save(flags); 1629 local_irq_save(flags);
1630 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); 1630 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1631 1631
1632 for_each_cpu(i, sched_domain_span(sd)) { 1632 for_each_cpu(i, sched_domain_span(sd)) {
1633 weight = tg->cfs_rq[i]->load.weight; 1633 weight = tg->cfs_rq[i]->load.weight;
1634 usd_rq_weight[i] = weight; 1634 usd_rq_weight[i] = weight;
1635 1635
1636 rq_weight += weight; 1636 rq_weight += weight;
1637 /* 1637 /*
1638 * If there are currently no tasks on the cpu pretend there 1638 * If there are currently no tasks on the cpu pretend there
1639 * is one of average load so that when a new task gets to 1639 * is one of average load so that when a new task gets to
1640 * run here it will not get delayed by group starvation. 1640 * run here it will not get delayed by group starvation.
1641 */ 1641 */
1642 if (!weight) 1642 if (!weight)
1643 weight = NICE_0_LOAD; 1643 weight = NICE_0_LOAD;
1644 1644
1645 sum_weight += weight; 1645 sum_weight += weight;
1646 shares += tg->cfs_rq[i]->shares; 1646 shares += tg->cfs_rq[i]->shares;
1647 } 1647 }
1648 1648
1649 if (!rq_weight) 1649 if (!rq_weight)
1650 rq_weight = sum_weight; 1650 rq_weight = sum_weight;
1651 1651
1652 if ((!shares && rq_weight) || shares > tg->shares) 1652 if ((!shares && rq_weight) || shares > tg->shares)
1653 shares = tg->shares; 1653 shares = tg->shares;
1654 1654
1655 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1655 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1656 shares = tg->shares; 1656 shares = tg->shares;
1657 1657
1658 for_each_cpu(i, sched_domain_span(sd)) 1658 for_each_cpu(i, sched_domain_span(sd))
1659 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); 1659 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1660 1660
1661 local_irq_restore(flags); 1661 local_irq_restore(flags);
1662 1662
1663 return 0; 1663 return 0;
1664 } 1664 }
1665 1665
1666 /* 1666 /*
1667 * Compute the cpu's hierarchical load factor for each task group. 1667 * Compute the cpu's hierarchical load factor for each task group.
1668 * This needs to be done in a top-down fashion because the load of a child 1668 * This needs to be done in a top-down fashion because the load of a child
1669 * group is a fraction of its parent's load. 1669 * group is a fraction of its parent's load.
1670 */ 1670 */
1671 static int tg_load_down(struct task_group *tg, void *data) 1671 static int tg_load_down(struct task_group *tg, void *data)
1672 { 1672 {
1673 unsigned long load; 1673 unsigned long load;
1674 long cpu = (long)data; 1674 long cpu = (long)data;
1675 1675
1676 if (!tg->parent) { 1676 if (!tg->parent) {
1677 load = cpu_rq(cpu)->load.weight; 1677 load = cpu_rq(cpu)->load.weight;
1678 } else { 1678 } else {
1679 load = tg->parent->cfs_rq[cpu]->h_load; 1679 load = tg->parent->cfs_rq[cpu]->h_load;
1680 load *= tg->cfs_rq[cpu]->shares; 1680 load *= tg->cfs_rq[cpu]->shares;
1681 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1681 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1682 } 1682 }
1683 1683
1684 tg->cfs_rq[cpu]->h_load = load; 1684 tg->cfs_rq[cpu]->h_load = load;
1685 1685
1686 return 0; 1686 return 0;
1687 } 1687 }
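
Worked example for the h_load computation (editor's illustration, values invented): if the parent's h_load on this cpu is 2048, the group's per-cpu shares there are 1024 (which is also roughly what its entity weighs on the parent's queue), and the parent's cfs_rq weight is 2048, then h_load = 2048 * 1024 / (2048 + 1) ≈ 1023 — about half of the parent's hierarchical load, matching the group's half share of the parent's queue.
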
1688 1688
1689 static void update_shares(struct sched_domain *sd) 1689 static void update_shares(struct sched_domain *sd)
1690 { 1690 {
1691 s64 elapsed; 1691 s64 elapsed;
1692 u64 now; 1692 u64 now;
1693 1693
1694 if (root_task_group_empty()) 1694 if (root_task_group_empty())
1695 return; 1695 return;
1696 1696
1697 now = cpu_clock(raw_smp_processor_id()); 1697 now = cpu_clock(raw_smp_processor_id());
1698 elapsed = now - sd->last_update; 1698 elapsed = now - sd->last_update;
1699 1699
1700 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1700 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1701 sd->last_update = now; 1701 sd->last_update = now;
1702 walk_tg_tree(tg_nop, tg_shares_up, sd); 1702 walk_tg_tree(tg_nop, tg_shares_up, sd);
1703 } 1703 }
1704 } 1704 }
1705 1705
1706 static void update_shares_locked(struct rq *rq, struct sched_domain *sd) 1706 static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1707 { 1707 {
1708 if (root_task_group_empty()) 1708 if (root_task_group_empty())
1709 return; 1709 return;
1710 1710
1711 spin_unlock(&rq->lock); 1711 spin_unlock(&rq->lock);
1712 update_shares(sd); 1712 update_shares(sd);
1713 spin_lock(&rq->lock); 1713 spin_lock(&rq->lock);
1714 } 1714 }
1715 1715
1716 static void update_h_load(long cpu) 1716 static void update_h_load(long cpu)
1717 { 1717 {
1718 if (root_task_group_empty()) 1718 if (root_task_group_empty())
1719 return; 1719 return;
1720 1720
1721 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1721 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1722 } 1722 }
1723 1723
1724 #else 1724 #else
1725 1725
1726 static inline void update_shares(struct sched_domain *sd) 1726 static inline void update_shares(struct sched_domain *sd)
1727 { 1727 {
1728 } 1728 }
1729 1729
1730 static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) 1730 static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1731 { 1731 {
1732 } 1732 }
1733 1733
1734 #endif 1734 #endif
1735 1735
1736 #ifdef CONFIG_PREEMPT 1736 #ifdef CONFIG_PREEMPT
1737 1737
1738 static void double_rq_lock(struct rq *rq1, struct rq *rq2); 1738 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1739 1739
1740 /* 1740 /*
1741 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1741 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1742 * way at the expense of forcing extra atomic operations in all 1742 * way at the expense of forcing extra atomic operations in all
1743 * invocations. This assures that the double_lock is acquired using the 1743 * invocations. This assures that the double_lock is acquired using the
1744 * same underlying policy as the spinlock_t on this architecture, which 1744 * same underlying policy as the spinlock_t on this architecture, which
1745 * reduces latency compared to the unfair variant below. However, it 1745 * reduces latency compared to the unfair variant below. However, it
1746 * also adds more overhead and therefore may reduce throughput. 1746 * also adds more overhead and therefore may reduce throughput.
1747 */ 1747 */
1748 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1748 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1749 __releases(this_rq->lock) 1749 __releases(this_rq->lock)
1750 __acquires(busiest->lock) 1750 __acquires(busiest->lock)
1751 __acquires(this_rq->lock) 1751 __acquires(this_rq->lock)
1752 { 1752 {
1753 spin_unlock(&this_rq->lock); 1753 spin_unlock(&this_rq->lock);
1754 double_rq_lock(this_rq, busiest); 1754 double_rq_lock(this_rq, busiest);
1755 1755
1756 return 1; 1756 return 1;
1757 } 1757 }
1758 1758
1759 #else 1759 #else
1760 /* 1760 /*
1761 * Unfair double_lock_balance: Optimizes throughput at the expense of 1761 * Unfair double_lock_balance: Optimizes throughput at the expense of
1762 * latency by eliminating extra atomic operations when the locks are 1762 * latency by eliminating extra atomic operations when the locks are
1763 * already in proper order on entry. This favors lower cpu-ids and will 1763 * already in proper order on entry. This favors lower cpu-ids and will
1764 * grant the double lock to lower cpus over higher ids under contention, 1764 * grant the double lock to lower cpus over higher ids under contention,
1765 * regardless of entry order into the function. 1765 * regardless of entry order into the function.
1766 */ 1766 */
1767 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1767 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1768 __releases(this_rq->lock) 1768 __releases(this_rq->lock)
1769 __acquires(busiest->lock) 1769 __acquires(busiest->lock)
1770 __acquires(this_rq->lock) 1770 __acquires(this_rq->lock)
1771 { 1771 {
1772 int ret = 0; 1772 int ret = 0;
1773 1773
1774 if (unlikely(!spin_trylock(&busiest->lock))) { 1774 if (unlikely(!spin_trylock(&busiest->lock))) {
1775 if (busiest < this_rq) { 1775 if (busiest < this_rq) {
1776 spin_unlock(&this_rq->lock); 1776 spin_unlock(&this_rq->lock);
1777 spin_lock(&busiest->lock); 1777 spin_lock(&busiest->lock);
1778 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); 1778 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
1779 ret = 1; 1779 ret = 1;
1780 } else 1780 } else
1781 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); 1781 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
1782 } 1782 }
1783 return ret; 1783 return ret;
1784 } 1784 }
1785 1785
1786 #endif /* CONFIG_PREEMPT */ 1786 #endif /* CONFIG_PREEMPT */
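
The unfair variant above avoids ABBA deadlock by always taking the lower-addressed runqueue lock first whenever the trylock fails. The same idiom in isolation (editor's sketch, not kernel code; assumes the two runqueues are distinct):

static void lock_two_rqs(struct rq *a, struct rq *b)
{
	if (a < b) {					/* stable global order: by address */
		spin_lock(&a->lock);
		spin_lock_nested(&b->lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&b->lock);
		spin_lock_nested(&a->lock, SINGLE_DEPTH_NESTING);
	}
}
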
1787 1787
1788 /* 1788 /*
1789 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1789 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1790 */ 1790 */
1791 static int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1791 static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1792 { 1792 {
1793 if (unlikely(!irqs_disabled())) { 1793 if (unlikely(!irqs_disabled())) {
1794 /* printk() doesn't work well under rq->lock */ 1794 /* printk() doesn't work well under rq->lock */
1795 spin_unlock(&this_rq->lock); 1795 spin_unlock(&this_rq->lock);
1796 BUG_ON(1); 1796 BUG_ON(1);
1797 } 1797 }
1798 1798
1799 return _double_lock_balance(this_rq, busiest); 1799 return _double_lock_balance(this_rq, busiest);
1800 } 1800 }
1801 1801
1802 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1802 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1803 __releases(busiest->lock) 1803 __releases(busiest->lock)
1804 { 1804 {
1805 spin_unlock(&busiest->lock); 1805 spin_unlock(&busiest->lock);
1806 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1806 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1807 } 1807 }
1808 #endif 1808 #endif
1809 1809
1810 #ifdef CONFIG_FAIR_GROUP_SCHED 1810 #ifdef CONFIG_FAIR_GROUP_SCHED
1811 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1811 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1812 { 1812 {
1813 #ifdef CONFIG_SMP 1813 #ifdef CONFIG_SMP
1814 cfs_rq->shares = shares; 1814 cfs_rq->shares = shares;
1815 #endif 1815 #endif
1816 } 1816 }
1817 #endif 1817 #endif
1818 1818
1819 static void calc_load_account_active(struct rq *this_rq); 1819 static void calc_load_account_active(struct rq *this_rq);
1820 static void update_sysctl(void); 1820 static void update_sysctl(void);
1821 static int get_update_sysctl_factor(void); 1821 static int get_update_sysctl_factor(void);
1822 1822
1823 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1823 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1824 { 1824 {
1825 set_task_rq(p, cpu); 1825 set_task_rq(p, cpu);
1826 #ifdef CONFIG_SMP 1826 #ifdef CONFIG_SMP
1827 /* 1827 /*
1828 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 1828 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1829 * successfully executed on another CPU. We must ensure that updates of 1829 * successfully executed on another CPU. We must ensure that updates of
1830 * per-task data have been completed by this moment. 1830 * per-task data have been completed by this moment.
1831 */ 1831 */
1832 smp_wmb(); 1832 smp_wmb();
1833 task_thread_info(p)->cpu = cpu; 1833 task_thread_info(p)->cpu = cpu;
1834 #endif 1834 #endif
1835 } 1835 }
1836 1836
1837 #include "sched_stats.h" 1837 #include "sched_stats.h"
1838 #include "sched_idletask.c" 1838 #include "sched_idletask.c"
1839 #include "sched_fair.c" 1839 #include "sched_fair.c"
1840 #include "sched_rt.c" 1840 #include "sched_rt.c"
1841 #ifdef CONFIG_SCHED_DEBUG 1841 #ifdef CONFIG_SCHED_DEBUG
1842 # include "sched_debug.c" 1842 # include "sched_debug.c"
1843 #endif 1843 #endif
1844 1844
1845 #define sched_class_highest (&rt_sched_class) 1845 #define sched_class_highest (&rt_sched_class)
1846 #define for_each_class(class) \ 1846 #define for_each_class(class) \
1847 for (class = sched_class_highest; class; class = class->next) 1847 for (class = sched_class_highest; class; class = class->next)
1848 1848
1849 static void inc_nr_running(struct rq *rq) 1849 static void inc_nr_running(struct rq *rq)
1850 { 1850 {
1851 rq->nr_running++; 1851 rq->nr_running++;
1852 } 1852 }
1853 1853
1854 static void dec_nr_running(struct rq *rq) 1854 static void dec_nr_running(struct rq *rq)
1855 { 1855 {
1856 rq->nr_running--; 1856 rq->nr_running--;
1857 } 1857 }
1858 1858
1859 static void set_load_weight(struct task_struct *p) 1859 static void set_load_weight(struct task_struct *p)
1860 { 1860 {
1861 if (task_has_rt_policy(p)) { 1861 if (task_has_rt_policy(p)) {
1862 p->se.load.weight = prio_to_weight[0] * 2; 1862 p->se.load.weight = prio_to_weight[0] * 2;
1863 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 1863 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1864 return; 1864 return;
1865 } 1865 }
1866 1866
1867 /* 1867 /*
1868 * SCHED_IDLE tasks get minimal weight: 1868 * SCHED_IDLE tasks get minimal weight:
1869 */ 1869 */
1870 if (p->policy == SCHED_IDLE) { 1870 if (p->policy == SCHED_IDLE) {
1871 p->se.load.weight = WEIGHT_IDLEPRIO; 1871 p->se.load.weight = WEIGHT_IDLEPRIO;
1872 p->se.load.inv_weight = WMULT_IDLEPRIO; 1872 p->se.load.inv_weight = WMULT_IDLEPRIO;
1873 return; 1873 return;
1874 } 1874 }
1875 1875
1876 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 1876 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1877 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1877 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1878 } 1878 }
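
To make the indexing above concrete (editor's note, assuming the conventional MAX_RT_PRIO of 100 and a static_prio of 120 for nice 0): a nice-0 task selects prio_to_weight[20] = 1024 and prio_to_wmult[20] = 4194304, while a nice-19 task lands at index 39 with weight 15.
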
1879 1879
1880 static void update_avg(u64 *avg, u64 sample) 1880 static void update_avg(u64 *avg, u64 sample)
1881 { 1881 {
1882 s64 diff = sample - *avg; 1882 s64 diff = sample - *avg;
1883 *avg += diff >> 3; 1883 *avg += diff >> 3;
1884 } 1884 }
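
Put differently (editor's gloss): update_avg() is an exponential moving average that moves the estimate 1/8 of the way toward each new sample, i.e. avg_new ≈ 7/8 * avg + 1/8 * sample.
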
1885 1885
1886 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1886 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1887 { 1887 {
1888 if (wakeup) 1888 if (wakeup)
1889 p->se.start_runtime = p->se.sum_exec_runtime; 1889 p->se.start_runtime = p->se.sum_exec_runtime;
1890 1890
1891 sched_info_queued(p); 1891 sched_info_queued(p);
1892 p->sched_class->enqueue_task(rq, p, wakeup); 1892 p->sched_class->enqueue_task(rq, p, wakeup);
1893 p->se.on_rq = 1; 1893 p->se.on_rq = 1;
1894 } 1894 }
1895 1895
1896 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1896 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1897 { 1897 {
1898 if (sleep) { 1898 if (sleep) {
1899 if (p->se.last_wakeup) { 1899 if (p->se.last_wakeup) {
1900 update_avg(&p->se.avg_overlap, 1900 update_avg(&p->se.avg_overlap,
1901 p->se.sum_exec_runtime - p->se.last_wakeup); 1901 p->se.sum_exec_runtime - p->se.last_wakeup);
1902 p->se.last_wakeup = 0; 1902 p->se.last_wakeup = 0;
1903 } else { 1903 } else {
1904 update_avg(&p->se.avg_wakeup, 1904 update_avg(&p->se.avg_wakeup,
1905 sysctl_sched_wakeup_granularity); 1905 sysctl_sched_wakeup_granularity);
1906 } 1906 }
1907 } 1907 }
1908 1908
1909 sched_info_dequeued(p); 1909 sched_info_dequeued(p);
1910 p->sched_class->dequeue_task(rq, p, sleep); 1910 p->sched_class->dequeue_task(rq, p, sleep);
1911 p->se.on_rq = 0; 1911 p->se.on_rq = 0;
1912 } 1912 }
1913 1913
1914 /* 1914 /*
1915 * __normal_prio - return the priority that is based on the static prio 1915 * __normal_prio - return the priority that is based on the static prio
1916 */ 1916 */
1917 static inline int __normal_prio(struct task_struct *p) 1917 static inline int __normal_prio(struct task_struct *p)
1918 { 1918 {
1919 return p->static_prio; 1919 return p->static_prio;
1920 } 1920 }
1921 1921
1922 /* 1922 /*
1923 * Calculate the expected normal priority: i.e. priority 1923 * Calculate the expected normal priority: i.e. priority
1924 * without taking RT-inheritance into account. Might be 1924 * without taking RT-inheritance into account. Might be
1925 * boosted by interactivity modifiers. Changes upon fork, 1925 * boosted by interactivity modifiers. Changes upon fork,
1926 * setprio syscalls, and whenever the interactivity 1926 * setprio syscalls, and whenever the interactivity
1927 * estimator recalculates. 1927 * estimator recalculates.
1928 */ 1928 */
1929 static inline int normal_prio(struct task_struct *p) 1929 static inline int normal_prio(struct task_struct *p)
1930 { 1930 {
1931 int prio; 1931 int prio;
1932 1932
1933 if (task_has_rt_policy(p)) 1933 if (task_has_rt_policy(p))
1934 prio = MAX_RT_PRIO-1 - p->rt_priority; 1934 prio = MAX_RT_PRIO-1 - p->rt_priority;
1935 else 1935 else
1936 prio = __normal_prio(p); 1936 prio = __normal_prio(p);
1937 return prio; 1937 return prio;
1938 } 1938 }
1939 1939
1940 /* 1940 /*
1941 * Calculate the current priority, i.e. the priority 1941 * Calculate the current priority, i.e. the priority
1942 * taken into account by the scheduler. This value might 1942 * taken into account by the scheduler. This value might
1943 * be boosted by RT tasks, or might be boosted by 1943 * be boosted by RT tasks, or might be boosted by
1944 * interactivity modifiers. Will be RT if the task got 1944 * interactivity modifiers. Will be RT if the task got
1945 * RT-boosted. If not then it returns p->normal_prio. 1945 * RT-boosted. If not then it returns p->normal_prio.
1946 */ 1946 */
1947 static int effective_prio(struct task_struct *p) 1947 static int effective_prio(struct task_struct *p)
1948 { 1948 {
1949 p->normal_prio = normal_prio(p); 1949 p->normal_prio = normal_prio(p);
1950 /* 1950 /*
1951 * If we are RT tasks or we were boosted to RT priority, 1951 * If we are RT tasks or we were boosted to RT priority,
1952 * keep the priority unchanged. Otherwise, update priority 1952 * keep the priority unchanged. Otherwise, update priority
1953 * to the normal priority: 1953 * to the normal priority:
1954 */ 1954 */
1955 if (!rt_prio(p->prio)) 1955 if (!rt_prio(p->prio))
1956 return p->normal_prio; 1956 return p->normal_prio;
1957 return p->prio; 1957 return p->prio;
1958 } 1958 }
1959 1959
1960 /* 1960 /*
1961 * activate_task - move a task to the runqueue. 1961 * activate_task - move a task to the runqueue.
1962 */ 1962 */
1963 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 1963 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1964 { 1964 {
1965 if (task_contributes_to_load(p)) 1965 if (task_contributes_to_load(p))
1966 rq->nr_uninterruptible--; 1966 rq->nr_uninterruptible--;
1967 1967
1968 enqueue_task(rq, p, wakeup); 1968 enqueue_task(rq, p, wakeup);
1969 inc_nr_running(rq); 1969 inc_nr_running(rq);
1970 } 1970 }
1971 1971
1972 /* 1972 /*
1973 * deactivate_task - remove a task from the runqueue. 1973 * deactivate_task - remove a task from the runqueue.
1974 */ 1974 */
1975 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 1975 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1976 { 1976 {
1977 if (task_contributes_to_load(p)) 1977 if (task_contributes_to_load(p))
1978 rq->nr_uninterruptible++; 1978 rq->nr_uninterruptible++;
1979 1979
1980 dequeue_task(rq, p, sleep); 1980 dequeue_task(rq, p, sleep);
1981 dec_nr_running(rq); 1981 dec_nr_running(rq);
1982 } 1982 }
1983 1983
1984 /** 1984 /**
1985 * task_curr - is this task currently executing on a CPU? 1985 * task_curr - is this task currently executing on a CPU?
1986 * @p: the task in question. 1986 * @p: the task in question.
1987 */ 1987 */
1988 inline int task_curr(const struct task_struct *p) 1988 inline int task_curr(const struct task_struct *p)
1989 { 1989 {
1990 return cpu_curr(task_cpu(p)) == p; 1990 return cpu_curr(task_cpu(p)) == p;
1991 } 1991 }
1992 1992
1993 static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1993 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1994 const struct sched_class *prev_class, 1994 const struct sched_class *prev_class,
1995 int oldprio, int running) 1995 int oldprio, int running)
1996 { 1996 {
1997 if (prev_class != p->sched_class) { 1997 if (prev_class != p->sched_class) {
1998 if (prev_class->switched_from) 1998 if (prev_class->switched_from)
1999 prev_class->switched_from(rq, p, running); 1999 prev_class->switched_from(rq, p, running);
2000 p->sched_class->switched_to(rq, p, running); 2000 p->sched_class->switched_to(rq, p, running);
2001 } else 2001 } else
2002 p->sched_class->prio_changed(rq, p, oldprio, running); 2002 p->sched_class->prio_changed(rq, p, oldprio, running);
2003 } 2003 }
2004 2004
2005 /** 2005 /**
2006 * kthread_bind - bind a just-created kthread to a cpu. 2006 * kthread_bind - bind a just-created kthread to a cpu.
2007 * @p: thread created by kthread_create(). 2007 * @p: thread created by kthread_create().
2008 * @cpu: cpu (might not be online, must be possible) for @k to run on. 2008 * @cpu: cpu (might not be online, must be possible) for @k to run on.
2009 * 2009 *
2010 * Description: This function is equivalent to set_cpus_allowed(), 2010 * Description: This function is equivalent to set_cpus_allowed(),
2011 * except that @cpu doesn't need to be online, and the thread must be 2011 * except that @cpu doesn't need to be online, and the thread must be
2012 * stopped (i.e., just returned from kthread_create()). 2012 * stopped (i.e., just returned from kthread_create()).
2013 * 2013 *
2014 * Function lives here instead of kthread.c because it messes with 2014 * Function lives here instead of kthread.c because it messes with
2015 * scheduler internals which require locking. 2015 * scheduler internals which require locking.
2016 */ 2016 */
2017 void kthread_bind(struct task_struct *p, unsigned int cpu) 2017 void kthread_bind(struct task_struct *p, unsigned int cpu)
2018 { 2018 {
2019 struct rq *rq = cpu_rq(cpu); 2019 struct rq *rq = cpu_rq(cpu);
2020 unsigned long flags; 2020 unsigned long flags;
2021 2021
2022 /* Must have done schedule() in kthread() before we set_task_cpu */ 2022 /* Must have done schedule() in kthread() before we set_task_cpu */
2023 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { 2023 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
2024 WARN_ON(1); 2024 WARN_ON(1);
2025 return; 2025 return;
2026 } 2026 }
2027 2027
2028 spin_lock_irqsave(&rq->lock, flags); 2028 spin_lock_irqsave(&rq->lock, flags);
2029 update_rq_clock(rq); 2029 update_rq_clock(rq);
2030 set_task_cpu(p, cpu); 2030 set_task_cpu(p, cpu);
2031 p->cpus_allowed = cpumask_of_cpu(cpu); 2031 p->cpus_allowed = cpumask_of_cpu(cpu);
2032 p->rt.nr_cpus_allowed = 1; 2032 p->rt.nr_cpus_allowed = 1;
2033 p->flags |= PF_THREAD_BOUND; 2033 p->flags |= PF_THREAD_BOUND;
2034 spin_unlock_irqrestore(&rq->lock, flags); 2034 spin_unlock_irqrestore(&rq->lock, flags);
2035 } 2035 }
2036 EXPORT_SYMBOL(kthread_bind); 2036 EXPORT_SYMBOL(kthread_bind);
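
Typical usage is to bind a freshly created, still-stopped kthread before its first wakeup. A hedged sketch (my_thread_fn and the CPU number are invented for illustration):

struct task_struct *t;

t = kthread_create(my_thread_fn, NULL, "my_worker");	/* created stopped */
if (!IS_ERR(t)) {
	kthread_bind(t, 2);		/* pin to CPU 2 before it ever runs */
	wake_up_process(t);		/* now let it run, bound to that CPU */
}
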
2037 2037
2038 #ifdef CONFIG_SMP 2038 #ifdef CONFIG_SMP
2039 /* 2039 /*
2040 * Is this task likely cache-hot: 2040 * Is this task likely cache-hot:
2041 */ 2041 */
2042 static int 2042 static int
2043 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 2043 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2044 { 2044 {
2045 s64 delta; 2045 s64 delta;
2046 2046
2047 /* 2047 /*
2048 * Buddy candidates are cache hot: 2048 * Buddy candidates are cache hot:
2049 */ 2049 */
2050 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && 2050 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2051 (&p->se == cfs_rq_of(&p->se)->next || 2051 (&p->se == cfs_rq_of(&p->se)->next ||
2052 &p->se == cfs_rq_of(&p->se)->last)) 2052 &p->se == cfs_rq_of(&p->se)->last))
2053 return 1; 2053 return 1;
2054 2054
2055 if (p->sched_class != &fair_sched_class) 2055 if (p->sched_class != &fair_sched_class)
2056 return 0; 2056 return 0;
2057 2057
2058 if (sysctl_sched_migration_cost == -1) 2058 if (sysctl_sched_migration_cost == -1)
2059 return 1; 2059 return 1;
2060 if (sysctl_sched_migration_cost == 0) 2060 if (sysctl_sched_migration_cost == 0)
2061 return 0; 2061 return 0;
2062 2062
2063 delta = now - p->se.exec_start; 2063 delta = now - p->se.exec_start;
2064 2064
2065 return delta < (s64)sysctl_sched_migration_cost; 2065 return delta < (s64)sysctl_sched_migration_cost;
2066 } 2066 }
2067 2067
2068 2068
2069 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2069 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2070 { 2070 {
2071 int old_cpu = task_cpu(p); 2071 int old_cpu = task_cpu(p);
2072 struct cfs_rq *old_cfsrq = task_cfs_rq(p), 2072 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
2073 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); 2073 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
2074 2074
2075 trace_sched_migrate_task(p, new_cpu); 2075 trace_sched_migrate_task(p, new_cpu);
2076 2076
2077 if (old_cpu != new_cpu) { 2077 if (old_cpu != new_cpu) {
2078 p->se.nr_migrations++; 2078 p->se.nr_migrations++;
2079 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 2079 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
2080 1, 1, NULL, 0); 2080 1, 1, NULL, 0);
2081 } 2081 }
2082 p->se.vruntime -= old_cfsrq->min_vruntime - 2082 p->se.vruntime -= old_cfsrq->min_vruntime -
2083 new_cfsrq->min_vruntime; 2083 new_cfsrq->min_vruntime;
2084 2084
2085 __set_task_cpu(p, new_cpu); 2085 __set_task_cpu(p, new_cpu);
2086 } 2086 }
2087 2087
2088 struct migration_req { 2088 struct migration_req {
2089 struct list_head list; 2089 struct list_head list;
2090 2090
2091 struct task_struct *task; 2091 struct task_struct *task;
2092 int dest_cpu; 2092 int dest_cpu;
2093 2093
2094 struct completion done; 2094 struct completion done;
2095 }; 2095 };
2096 2096
2097 /* 2097 /*
2098 * The task's runqueue lock must be held. 2098 * The task's runqueue lock must be held.
2099 * Returns true if you have to wait for migration thread. 2099 * Returns true if you have to wait for migration thread.
2100 */ 2100 */
2101 static int 2101 static int
2102 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) 2102 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2103 { 2103 {
2104 struct rq *rq = task_rq(p); 2104 struct rq *rq = task_rq(p);
2105 2105
2106 /* 2106 /*
2107 * If the task is not on a runqueue (and not running), then 2107 * If the task is not on a runqueue (and not running), then
2108 * it is sufficient to simply update the task's cpu field. 2108 * it is sufficient to simply update the task's cpu field.
2109 */ 2109 */
2110 if (!p->se.on_rq && !task_running(rq, p)) { 2110 if (!p->se.on_rq && !task_running(rq, p)) {
2111 update_rq_clock(rq); 2111 update_rq_clock(rq);
2112 set_task_cpu(p, dest_cpu); 2112 set_task_cpu(p, dest_cpu);
2113 return 0; 2113 return 0;
2114 } 2114 }
2115 2115
2116 init_completion(&req->done); 2116 init_completion(&req->done);
2117 req->task = p; 2117 req->task = p;
2118 req->dest_cpu = dest_cpu; 2118 req->dest_cpu = dest_cpu;
2119 list_add(&req->list, &rq->migration_queue); 2119 list_add(&req->list, &rq->migration_queue);
2120 2120
2121 return 1; 2121 return 1;
2122 } 2122 }
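A caller holding the runqueue lock uses the return value to decide whether it must hand off to the per-CPU migration thread. Roughly the pattern used by the affinity/migration paths elsewhere in this file (a sketch, error handling omitted; it assumes rq->migration_thread as in this kernel version):

        struct migration_req req;

        if (migrate_task(p, dest_cpu, &req)) {
                /* must drop the rq lock before sleeping on the completion */
                struct task_struct *mt = rq->migration_thread;

                get_task_struct(mt);
                task_rq_unlock(rq, &flags);
                wake_up_process(mt);
                put_task_struct(mt);
                wait_for_completion(&req.done);
        } else {
                task_rq_unlock(rq, &flags);
        }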
2123 2123
2124 /* 2124 /*
2125 * wait_task_context_switch - wait for a thread to complete at least one 2125 * wait_task_context_switch - wait for a thread to complete at least one
2126 * context switch. 2126 * context switch.
2127 * 2127 *
2128 * @p must not be current. 2128 * @p must not be current.
2129 */ 2129 */
2130 void wait_task_context_switch(struct task_struct *p) 2130 void wait_task_context_switch(struct task_struct *p)
2131 { 2131 {
2132 unsigned long nvcsw, nivcsw, flags; 2132 unsigned long nvcsw, nivcsw, flags;
2133 int running; 2133 int running;
2134 struct rq *rq; 2134 struct rq *rq;
2135 2135
2136 nvcsw = p->nvcsw; 2136 nvcsw = p->nvcsw;
2137 nivcsw = p->nivcsw; 2137 nivcsw = p->nivcsw;
2138 for (;;) { 2138 for (;;) {
2139 /* 2139 /*
2140 * The runqueue is assigned before the actual context 2140 * The runqueue is assigned before the actual context
2141 * switch. We need to take the runqueue lock. 2141 * switch. We need to take the runqueue lock.
2142 * 2142 *
2143 * We could check initially without the lock but it is 2143 * We could check initially without the lock but it is
2144 * very likely that we need to take the lock in every 2144 * very likely that we need to take the lock in every
2145 * iteration. 2145 * iteration.
2146 */ 2146 */
2147 rq = task_rq_lock(p, &flags); 2147 rq = task_rq_lock(p, &flags);
2148 running = task_running(rq, p); 2148 running = task_running(rq, p);
2149 task_rq_unlock(rq, &flags); 2149 task_rq_unlock(rq, &flags);
2150 2150
2151 if (likely(!running)) 2151 if (likely(!running))
2152 break; 2152 break;
2153 /* 2153 /*
2154 * The switch count is incremented before the actual 2154 * The switch count is incremented before the actual
2155 * context switch. We thus wait for two switches to be 2155 * context switch. We thus wait for two switches to be
2156 * sure at least one completed. 2156 * sure at least one completed.
2157 */ 2157 */
2158 if ((p->nvcsw - nvcsw) > 1) 2158 if ((p->nvcsw - nvcsw) > 1)
2159 break; 2159 break;
2160 if ((p->nivcsw - nivcsw) > 1) 2160 if ((p->nivcsw - nivcsw) > 1)
2161 break; 2161 break;
2162 2162
2163 cpu_relax(); 2163 cpu_relax();
2164 } 2164 }
2165 } 2165 }
2166 2166
2167 /* 2167 /*
2168 * wait_task_inactive - wait for a thread to unschedule. 2168 * wait_task_inactive - wait for a thread to unschedule.
2169 * 2169 *
2170 * If @match_state is nonzero, it's the @p->state value just checked and 2170 * If @match_state is nonzero, it's the @p->state value just checked and
2171 * not expected to change. If it changes, i.e. @p might have woken up, 2171 * not expected to change. If it changes, i.e. @p might have woken up,
2172 * then return zero. When we succeed in waiting for @p to be off its CPU, 2172 * then return zero. When we succeed in waiting for @p to be off its CPU,
2173 * we return a positive number (its total switch count). If a second call 2173 * we return a positive number (its total switch count). If a second call
2174 * a short while later returns the same number, the caller can be sure that 2174 * a short while later returns the same number, the caller can be sure that
2175 * @p has remained unscheduled the whole time. 2175 * @p has remained unscheduled the whole time.
2176 * 2176 *
2177 * The caller must ensure that the task *will* unschedule sometime soon, 2177 * The caller must ensure that the task *will* unschedule sometime soon,
2178 * else this function might spin for a *long* time. This function can't 2178 * else this function might spin for a *long* time. This function can't
2179 * be called with interrupts off, or it may introduce deadlock with 2179 * be called with interrupts off, or it may introduce deadlock with
2180 * smp_call_function() if an IPI is sent by the same process we are 2180 * smp_call_function() if an IPI is sent by the same process we are
2181 * waiting to become inactive. 2181 * waiting to become inactive.
2182 */ 2182 */
2183 unsigned long wait_task_inactive(struct task_struct *p, long match_state) 2183 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2184 { 2184 {
2185 unsigned long flags; 2185 unsigned long flags;
2186 int running, on_rq; 2186 int running, on_rq;
2187 unsigned long ncsw; 2187 unsigned long ncsw;
2188 struct rq *rq; 2188 struct rq *rq;
2189 2189
2190 for (;;) { 2190 for (;;) {
2191 /* 2191 /*
2192 * We do the initial early heuristics without holding 2192 * We do the initial early heuristics without holding
2193 * any task-queue locks at all. We'll only try to get 2193 * any task-queue locks at all. We'll only try to get
2194 * the runqueue lock when things look like they will 2194 * the runqueue lock when things look like they will
2195 * work out! 2195 * work out!
2196 */ 2196 */
2197 rq = task_rq(p); 2197 rq = task_rq(p);
2198 2198
2199 /* 2199 /*
2200 * If the task is actively running on another CPU 2200 * If the task is actively running on another CPU
2201 * still, just relax and busy-wait without holding 2201 * still, just relax and busy-wait without holding
2202 * any locks. 2202 * any locks.
2203 * 2203 *
2204 * NOTE! Since we don't hold any locks, it's not 2204 * NOTE! Since we don't hold any locks, it's not
2205 * even sure that "rq" stays as the right runqueue! 2205 * even sure that "rq" stays as the right runqueue!
2206 * But we don't care, since "task_running()" will 2206 * But we don't care, since "task_running()" will
2207 * return false if the runqueue has changed and p 2207 * return false if the runqueue has changed and p
2208 * is actually now running somewhere else! 2208 * is actually now running somewhere else!
2209 */ 2209 */
2210 while (task_running(rq, p)) { 2210 while (task_running(rq, p)) {
2211 if (match_state && unlikely(p->state != match_state)) 2211 if (match_state && unlikely(p->state != match_state))
2212 return 0; 2212 return 0;
2213 cpu_relax(); 2213 cpu_relax();
2214 } 2214 }
2215 2215
2216 /* 2216 /*
2217 * Ok, time to look more closely! We need the rq 2217 * Ok, time to look more closely! We need the rq
2218 * lock now, to be *sure*. If we're wrong, we'll 2218 * lock now, to be *sure*. If we're wrong, we'll
2219 * just go back and repeat. 2219 * just go back and repeat.
2220 */ 2220 */
2221 rq = task_rq_lock(p, &flags); 2221 rq = task_rq_lock(p, &flags);
2222 trace_sched_wait_task(rq, p); 2222 trace_sched_wait_task(rq, p);
2223 running = task_running(rq, p); 2223 running = task_running(rq, p);
2224 on_rq = p->se.on_rq; 2224 on_rq = p->se.on_rq;
2225 ncsw = 0; 2225 ncsw = 0;
2226 if (!match_state || p->state == match_state) 2226 if (!match_state || p->state == match_state)
2227 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2227 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2228 task_rq_unlock(rq, &flags); 2228 task_rq_unlock(rq, &flags);
2229 2229
2230 /* 2230 /*
2231 * If it changed from the expected state, bail out now. 2231 * If it changed from the expected state, bail out now.
2232 */ 2232 */
2233 if (unlikely(!ncsw)) 2233 if (unlikely(!ncsw))
2234 break; 2234 break;
2235 2235
2236 /* 2236 /*
2237 * Was it really running after all now that we 2237 * Was it really running after all now that we
2238 * checked with the proper locks actually held? 2238 * checked with the proper locks actually held?
2239 * 2239 *
2240 * Oops. Go back and try again. 2240 * Oops. Go back and try again.
2241 */ 2241 */
2242 if (unlikely(running)) { 2242 if (unlikely(running)) {
2243 cpu_relax(); 2243 cpu_relax();
2244 continue; 2244 continue;
2245 } 2245 }
2246 2246
2247 /* 2247 /*
2248 * It's not enough that it's not actively running, 2248 * It's not enough that it's not actively running,
2249 * it must be off the runqueue _entirely_, and not 2249 * it must be off the runqueue _entirely_, and not
2250 * preempted! 2250 * preempted!
2251 * 2251 *
2252 * So if it was still runnable (but just not actively 2252 * So if it was still runnable (but just not actively
2253 * running right now), it's preempted, and we should 2253 * running right now), it's preempted, and we should
2254 * yield - it could be a while. 2254 * yield - it could be a while.
2255 */ 2255 */
2256 if (unlikely(on_rq)) { 2256 if (unlikely(on_rq)) {
2257 schedule_timeout_uninterruptible(1); 2257 schedule_timeout_uninterruptible(1);
2258 continue; 2258 continue;
2259 } 2259 }
2260 2260
2261 /* 2261 /*
2262 * Ahh, all good. It wasn't running, and it wasn't 2262 * Ahh, all good. It wasn't running, and it wasn't
2263 * runnable, which means that it will never become 2263 * runnable, which means that it will never become
2264 * running in the future either. We're all done! 2264 * running in the future either. We're all done!
2265 */ 2265 */
2266 break; 2266 break;
2267 } 2267 }
2268 2268
2269 return ncsw; 2269 return ncsw;
2270 } 2270 }
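The "call it twice and compare switch counts" usage described in the comment above looks roughly like this (a sketch; the middle step is a hypothetical placeholder for whatever the caller does while @p is off the CPU):

        unsigned long ncsw;

        ncsw = wait_task_inactive(p, TASK_UNINTERRUPTIBLE);
        if (!ncsw)
                return -EAGAIN;                 /* state changed: it may have woken */

        inspect_task_while_off_cpu(p);          /* hypothetical work on the stopped task */

        if (wait_task_inactive(p, TASK_UNINTERRUPTIBLE) != ncsw)
                return -EAGAIN;                 /* it was scheduled in between: retry */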
2271 2271
2272 /*** 2272 /***
2273 * kick_process - kick a running thread to enter/exit the kernel 2273 * kick_process - kick a running thread to enter/exit the kernel
2274 * @p: the to-be-kicked thread 2274 * @p: the to-be-kicked thread
2275 * 2275 *
2276 * Cause a process which is running on another CPU to enter 2276 * Cause a process which is running on another CPU to enter
2277 * kernel-mode, without any delay. (to get signals handled.) 2277 * kernel-mode, without any delay. (to get signals handled.)
2278 * 2278 *
2279 * NOTE: this function doesn't have to take the runqueue lock, 2279 * NOTE: this function doesn't have to take the runqueue lock,
2280 * because all it wants to ensure is that the remote task enters 2280 * because all it wants to ensure is that the remote task enters
2281 * the kernel. If the IPI races and the task has been migrated 2281 * the kernel. If the IPI races and the task has been migrated
2282 * to another CPU then no harm is done and the purpose has been 2282 * to another CPU then no harm is done and the purpose has been
2283 * achieved as well. 2283 * achieved as well.
2284 */ 2284 */
2285 void kick_process(struct task_struct *p) 2285 void kick_process(struct task_struct *p)
2286 { 2286 {
2287 int cpu; 2287 int cpu;
2288 2288
2289 preempt_disable(); 2289 preempt_disable();
2290 cpu = task_cpu(p); 2290 cpu = task_cpu(p);
2291 if ((cpu != smp_processor_id()) && task_curr(p)) 2291 if ((cpu != smp_processor_id()) && task_curr(p))
2292 smp_send_reschedule(cpu); 2292 smp_send_reschedule(cpu);
2293 preempt_enable(); 2293 preempt_enable();
2294 } 2294 }
2295 EXPORT_SYMBOL_GPL(kick_process); 2295 EXPORT_SYMBOL_GPL(kick_process);
2296 #endif /* CONFIG_SMP */ 2296 #endif /* CONFIG_SMP */
2297 2297
2298 /** 2298 /**
2299 * task_oncpu_function_call - call a function on the cpu on which a task runs 2299 * task_oncpu_function_call - call a function on the cpu on which a task runs
2300 * @p: the task to evaluate 2300 * @p: the task to evaluate
2301 * @func: the function to be called 2301 * @func: the function to be called
2302 * @info: the function call argument 2302 * @info: the function call argument
2303 * 2303 *
2304 * Calls the function @func when the task is currently running. This might 2304 * Calls the function @func when the task is currently running. This might
2305 * be on the current CPU, which just calls the function directly. 2305 * be on the current CPU, which just calls the function directly.
2306 */ 2306 */
2307 void task_oncpu_function_call(struct task_struct *p, 2307 void task_oncpu_function_call(struct task_struct *p,
2308 void (*func) (void *info), void *info) 2308 void (*func) (void *info), void *info)
2309 { 2309 {
2310 int cpu; 2310 int cpu;
2311 2311
2312 preempt_disable(); 2312 preempt_disable();
2313 cpu = task_cpu(p); 2313 cpu = task_cpu(p);
2314 if (task_curr(p)) 2314 if (task_curr(p))
2315 smp_call_function_single(cpu, func, info, 1); 2315 smp_call_function_single(cpu, func, info, 1);
2316 preempt_enable(); 2316 preempt_enable();
2317 } 2317 }
2318 2318
2319 #ifdef CONFIG_SMP 2319 #ifdef CONFIG_SMP
2320 static inline 2320 static inline
2321 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2321 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2322 { 2322 {
2323 return p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2323 return p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2324 } 2324 }
2325 #endif 2325 #endif
2326 2326
2327 /*** 2327 /***
2328 * try_to_wake_up - wake up a thread 2328 * try_to_wake_up - wake up a thread
2329 * @p: the to-be-woken-up thread 2329 * @p: the to-be-woken-up thread
2330 * @state: the mask of task states that can be woken 2330 * @state: the mask of task states that can be woken
2331 * @sync: do a synchronous wakeup? 2331 * @sync: do a synchronous wakeup?
2332 * 2332 *
2333 * Put it on the run-queue if it's not already there. The "current" 2333 * Put it on the run-queue if it's not already there. The "current"
2334 * thread is always on the run-queue (except when the actual 2334 * thread is always on the run-queue (except when the actual
2335 * re-schedule is in progress), and as such you're allowed to do 2335 * re-schedule is in progress), and as such you're allowed to do
2336 * the simpler "current->state = TASK_RUNNING" to mark yourself 2336 * the simpler "current->state = TASK_RUNNING" to mark yourself
2337 * runnable without the overhead of this. 2337 * runnable without the overhead of this.
2338 * 2338 *
2339 * returns failure only if the task is already active. 2339 * returns failure only if the task is already active.
2340 */ 2340 */
2341 static int try_to_wake_up(struct task_struct *p, unsigned int state, 2341 static int try_to_wake_up(struct task_struct *p, unsigned int state,
2342 int wake_flags) 2342 int wake_flags)
2343 { 2343 {
2344 int cpu, orig_cpu, this_cpu, success = 0; 2344 int cpu, orig_cpu, this_cpu, success = 0;
2345 unsigned long flags; 2345 unsigned long flags;
2346 struct rq *rq, *orig_rq; 2346 struct rq *rq, *orig_rq;
2347 2347
2348 if (!sched_feat(SYNC_WAKEUPS)) 2348 if (!sched_feat(SYNC_WAKEUPS))
2349 wake_flags &= ~WF_SYNC; 2349 wake_flags &= ~WF_SYNC;
2350 2350
2351 this_cpu = get_cpu(); 2351 this_cpu = get_cpu();
2352 2352
2353 smp_wmb(); 2353 smp_wmb();
2354 rq = orig_rq = task_rq_lock(p, &flags); 2354 rq = orig_rq = task_rq_lock(p, &flags);
2355 update_rq_clock(rq); 2355 update_rq_clock(rq);
2356 if (!(p->state & state)) 2356 if (!(p->state & state))
2357 goto out; 2357 goto out;
2358 2358
2359 if (p->se.on_rq) 2359 if (p->se.on_rq)
2360 goto out_running; 2360 goto out_running;
2361 2361
2362 cpu = task_cpu(p); 2362 cpu = task_cpu(p);
2363 orig_cpu = cpu; 2363 orig_cpu = cpu;
2364 2364
2365 #ifdef CONFIG_SMP 2365 #ifdef CONFIG_SMP
2366 if (unlikely(task_running(rq, p))) 2366 if (unlikely(task_running(rq, p)))
2367 goto out_activate; 2367 goto out_activate;
2368 2368
2369 /* 2369 /*
2370 * In order to handle concurrent wakeups and release the rq->lock 2370 * In order to handle concurrent wakeups and release the rq->lock
2371 * we put the task in TASK_WAKING state. 2371 * we put the task in TASK_WAKING state.
2372 * 2372 *
2373 * First fix up the nr_uninterruptible count: 2373 * First fix up the nr_uninterruptible count:
2374 */ 2374 */
2375 if (task_contributes_to_load(p)) 2375 if (task_contributes_to_load(p))
2376 rq->nr_uninterruptible--; 2376 rq->nr_uninterruptible--;
2377 p->state = TASK_WAKING; 2377 p->state = TASK_WAKING;
2378 __task_rq_unlock(rq); 2378 __task_rq_unlock(rq);
2379 2379
2380 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2380 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2381 if (cpu != orig_cpu) 2381 if (cpu != orig_cpu)
2382 set_task_cpu(p, cpu); 2382 set_task_cpu(p, cpu);
2383 2383
2384 rq = __task_rq_lock(p); 2384 rq = __task_rq_lock(p);
2385 update_rq_clock(rq); 2385 update_rq_clock(rq);
2386 2386
2387 WARN_ON(p->state != TASK_WAKING); 2387 WARN_ON(p->state != TASK_WAKING);
2388 cpu = task_cpu(p); 2388 cpu = task_cpu(p);
2389 2389
2390 #ifdef CONFIG_SCHEDSTATS 2390 #ifdef CONFIG_SCHEDSTATS
2391 schedstat_inc(rq, ttwu_count); 2391 schedstat_inc(rq, ttwu_count);
2392 if (cpu == this_cpu) 2392 if (cpu == this_cpu)
2393 schedstat_inc(rq, ttwu_local); 2393 schedstat_inc(rq, ttwu_local);
2394 else { 2394 else {
2395 struct sched_domain *sd; 2395 struct sched_domain *sd;
2396 for_each_domain(this_cpu, sd) { 2396 for_each_domain(this_cpu, sd) {
2397 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 2397 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2398 schedstat_inc(sd, ttwu_wake_remote); 2398 schedstat_inc(sd, ttwu_wake_remote);
2399 break; 2399 break;
2400 } 2400 }
2401 } 2401 }
2402 } 2402 }
2403 #endif /* CONFIG_SCHEDSTATS */ 2403 #endif /* CONFIG_SCHEDSTATS */
2404 2404
2405 out_activate: 2405 out_activate:
2406 #endif /* CONFIG_SMP */ 2406 #endif /* CONFIG_SMP */
2407 schedstat_inc(p, se.nr_wakeups); 2407 schedstat_inc(p, se.nr_wakeups);
2408 if (wake_flags & WF_SYNC) 2408 if (wake_flags & WF_SYNC)
2409 schedstat_inc(p, se.nr_wakeups_sync); 2409 schedstat_inc(p, se.nr_wakeups_sync);
2410 if (orig_cpu != cpu) 2410 if (orig_cpu != cpu)
2411 schedstat_inc(p, se.nr_wakeups_migrate); 2411 schedstat_inc(p, se.nr_wakeups_migrate);
2412 if (cpu == this_cpu) 2412 if (cpu == this_cpu)
2413 schedstat_inc(p, se.nr_wakeups_local); 2413 schedstat_inc(p, se.nr_wakeups_local);
2414 else 2414 else
2415 schedstat_inc(p, se.nr_wakeups_remote); 2415 schedstat_inc(p, se.nr_wakeups_remote);
2416 activate_task(rq, p, 1); 2416 activate_task(rq, p, 1);
2417 success = 1; 2417 success = 1;
2418 2418
2419 /* 2419 /*
2420 * Only attribute actual wakeups done by this task. 2420 * Only attribute actual wakeups done by this task.
2421 */ 2421 */
2422 if (!in_interrupt()) { 2422 if (!in_interrupt()) {
2423 struct sched_entity *se = &current->se; 2423 struct sched_entity *se = &current->se;
2424 u64 sample = se->sum_exec_runtime; 2424 u64 sample = se->sum_exec_runtime;
2425 2425
2426 if (se->last_wakeup) 2426 if (se->last_wakeup)
2427 sample -= se->last_wakeup; 2427 sample -= se->last_wakeup;
2428 else 2428 else
2429 sample -= se->start_runtime; 2429 sample -= se->start_runtime;
2430 update_avg(&se->avg_wakeup, sample); 2430 update_avg(&se->avg_wakeup, sample);
2431 2431
2432 se->last_wakeup = se->sum_exec_runtime; 2432 se->last_wakeup = se->sum_exec_runtime;
2433 } 2433 }
2434 2434
2435 out_running: 2435 out_running:
2436 trace_sched_wakeup(rq, p, success); 2436 trace_sched_wakeup(rq, p, success);
2437 check_preempt_curr(rq, p, wake_flags); 2437 check_preempt_curr(rq, p, wake_flags);
2438 2438
2439 p->state = TASK_RUNNING; 2439 p->state = TASK_RUNNING;
2440 #ifdef CONFIG_SMP 2440 #ifdef CONFIG_SMP
2441 if (p->sched_class->task_wake_up) 2441 if (p->sched_class->task_wake_up)
2442 p->sched_class->task_wake_up(rq, p); 2442 p->sched_class->task_wake_up(rq, p);
2443 2443
2444 if (unlikely(rq->idle_stamp)) { 2444 if (unlikely(rq->idle_stamp)) {
2445 u64 delta = rq->clock - rq->idle_stamp; 2445 u64 delta = rq->clock - rq->idle_stamp;
2446 u64 max = 2*sysctl_sched_migration_cost; 2446 u64 max = 2*sysctl_sched_migration_cost;
2447 2447
2448 if (delta > max) 2448 if (delta > max)
2449 rq->avg_idle = max; 2449 rq->avg_idle = max;
2450 else 2450 else
2451 update_avg(&rq->avg_idle, delta); 2451 update_avg(&rq->avg_idle, delta);
2452 rq->idle_stamp = 0; 2452 rq->idle_stamp = 0;
2453 } 2453 }
2454 #endif 2454 #endif
2455 out: 2455 out:
2456 task_rq_unlock(rq, &flags); 2456 task_rq_unlock(rq, &flags);
2457 put_cpu(); 2457 put_cpu();
2458 2458
2459 return success; 2459 return success;
2460 } 2460 }
2461 2461
2462 /** 2462 /**
2463 * wake_up_process - Wake up a specific process 2463 * wake_up_process - Wake up a specific process
2464 * @p: The process to be woken up. 2464 * @p: The process to be woken up.
2465 * 2465 *
2466 * Attempt to wake up the nominated process and move it to the set of runnable 2466 * Attempt to wake up the nominated process and move it to the set of runnable
2467 * processes. Returns 1 if the process was woken up, 0 if it was already 2467 * processes. Returns 1 if the process was woken up, 0 if it was already
2468 * running. 2468 * running.
2469 * 2469 *
2470 * It may be assumed that this function implies a write memory barrier before 2470 * It may be assumed that this function implies a write memory barrier before
2471 * changing the task state if and only if any tasks are woken up. 2471 * changing the task state if and only if any tasks are woken up.
2472 */ 2472 */
2473 int wake_up_process(struct task_struct *p) 2473 int wake_up_process(struct task_struct *p)
2474 { 2474 {
2475 return try_to_wake_up(p, TASK_ALL, 0); 2475 return try_to_wake_up(p, TASK_ALL, 0);
2476 } 2476 }
2477 EXPORT_SYMBOL(wake_up_process); 2477 EXPORT_SYMBOL(wake_up_process);
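The usual pairing with a sleeping task, relying on the implied write barrier mentioned above, looks like this minimal sketch ('condition' and 'sleeper' are placeholders, not names from this file):

        /* sleeper side */
        set_current_state(TASK_INTERRUPTIBLE);
        while (!condition) {
                schedule();
                set_current_state(TASK_INTERRUPTIBLE);
        }
        __set_current_state(TASK_RUNNING);

        /* waker side: publish the condition first, then wake */
        condition = 1;
        wake_up_process(sleeper);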
2478 2478
2479 int wake_up_state(struct task_struct *p, unsigned int state) 2479 int wake_up_state(struct task_struct *p, unsigned int state)
2480 { 2480 {
2481 return try_to_wake_up(p, state, 0); 2481 return try_to_wake_up(p, state, 0);
2482 } 2482 }
2483 2483
2484 /* 2484 /*
2485 * Perform scheduler related setup for a newly forked process p. 2485 * Perform scheduler related setup for a newly forked process p.
2486 * p is forked by current. 2486 * p is forked by current.
2487 * 2487 *
2488 * __sched_fork() is basic setup used by init_idle() too: 2488 * __sched_fork() is basic setup used by init_idle() too:
2489 */ 2489 */
2490 static void __sched_fork(struct task_struct *p) 2490 static void __sched_fork(struct task_struct *p)
2491 { 2491 {
2492 p->se.exec_start = 0; 2492 p->se.exec_start = 0;
2493 p->se.sum_exec_runtime = 0; 2493 p->se.sum_exec_runtime = 0;
2494 p->se.prev_sum_exec_runtime = 0; 2494 p->se.prev_sum_exec_runtime = 0;
2495 p->se.nr_migrations = 0; 2495 p->se.nr_migrations = 0;
2496 p->se.last_wakeup = 0; 2496 p->se.last_wakeup = 0;
2497 p->se.avg_overlap = 0; 2497 p->se.avg_overlap = 0;
2498 p->se.start_runtime = 0; 2498 p->se.start_runtime = 0;
2499 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2499 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2500 2500
2501 #ifdef CONFIG_SCHEDSTATS 2501 #ifdef CONFIG_SCHEDSTATS
2502 p->se.wait_start = 0; 2502 p->se.wait_start = 0;
2503 p->se.wait_max = 0; 2503 p->se.wait_max = 0;
2504 p->se.wait_count = 0; 2504 p->se.wait_count = 0;
2505 p->se.wait_sum = 0; 2505 p->se.wait_sum = 0;
2506 2506
2507 p->se.sleep_start = 0; 2507 p->se.sleep_start = 0;
2508 p->se.sleep_max = 0; 2508 p->se.sleep_max = 0;
2509 p->se.sum_sleep_runtime = 0; 2509 p->se.sum_sleep_runtime = 0;
2510 2510
2511 p->se.block_start = 0; 2511 p->se.block_start = 0;
2512 p->se.block_max = 0; 2512 p->se.block_max = 0;
2513 p->se.exec_max = 0; 2513 p->se.exec_max = 0;
2514 p->se.slice_max = 0; 2514 p->se.slice_max = 0;
2515 2515
2516 p->se.nr_migrations_cold = 0; 2516 p->se.nr_migrations_cold = 0;
2517 p->se.nr_failed_migrations_affine = 0; 2517 p->se.nr_failed_migrations_affine = 0;
2518 p->se.nr_failed_migrations_running = 0; 2518 p->se.nr_failed_migrations_running = 0;
2519 p->se.nr_failed_migrations_hot = 0; 2519 p->se.nr_failed_migrations_hot = 0;
2520 p->se.nr_forced_migrations = 0; 2520 p->se.nr_forced_migrations = 0;
2521 2521
2522 p->se.nr_wakeups = 0; 2522 p->se.nr_wakeups = 0;
2523 p->se.nr_wakeups_sync = 0; 2523 p->se.nr_wakeups_sync = 0;
2524 p->se.nr_wakeups_migrate = 0; 2524 p->se.nr_wakeups_migrate = 0;
2525 p->se.nr_wakeups_local = 0; 2525 p->se.nr_wakeups_local = 0;
2526 p->se.nr_wakeups_remote = 0; 2526 p->se.nr_wakeups_remote = 0;
2527 p->se.nr_wakeups_affine = 0; 2527 p->se.nr_wakeups_affine = 0;
2528 p->se.nr_wakeups_affine_attempts = 0; 2528 p->se.nr_wakeups_affine_attempts = 0;
2529 p->se.nr_wakeups_passive = 0; 2529 p->se.nr_wakeups_passive = 0;
2530 p->se.nr_wakeups_idle = 0; 2530 p->se.nr_wakeups_idle = 0;
2531 2531
2532 #endif 2532 #endif
2533 2533
2534 INIT_LIST_HEAD(&p->rt.run_list); 2534 INIT_LIST_HEAD(&p->rt.run_list);
2535 p->se.on_rq = 0; 2535 p->se.on_rq = 0;
2536 INIT_LIST_HEAD(&p->se.group_node); 2536 INIT_LIST_HEAD(&p->se.group_node);
2537 2537
2538 #ifdef CONFIG_PREEMPT_NOTIFIERS 2538 #ifdef CONFIG_PREEMPT_NOTIFIERS
2539 INIT_HLIST_HEAD(&p->preempt_notifiers); 2539 INIT_HLIST_HEAD(&p->preempt_notifiers);
2540 #endif 2540 #endif
2541 2541
2542 /* 2542 /*
2543 * We mark the process as running here, but have not actually 2543 * We mark the process as running here, but have not actually
2544 * inserted it onto the runqueue yet. This guarantees that 2544 * inserted it onto the runqueue yet. This guarantees that
2545 * nobody will actually run it, and a signal or other external 2545 * nobody will actually run it, and a signal or other external
2546 * event cannot wake it up and insert it on the runqueue either. 2546 * event cannot wake it up and insert it on the runqueue either.
2547 */ 2547 */
2548 p->state = TASK_RUNNING; 2548 p->state = TASK_RUNNING;
2549 } 2549 }
2550 2550
2551 /* 2551 /*
2552 * fork()/clone()-time setup: 2552 * fork()/clone()-time setup:
2553 */ 2553 */
2554 void sched_fork(struct task_struct *p, int clone_flags) 2554 void sched_fork(struct task_struct *p, int clone_flags)
2555 { 2555 {
2556 int cpu = get_cpu(); 2556 int cpu = get_cpu();
2557 2557
2558 __sched_fork(p); 2558 __sched_fork(p);
2559 2559
2560 /* 2560 /*
2561 * Revert to default priority/policy on fork if requested. 2561 * Revert to default priority/policy on fork if requested.
2562 */ 2562 */
2563 if (unlikely(p->sched_reset_on_fork)) { 2563 if (unlikely(p->sched_reset_on_fork)) {
2564 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 2564 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2565 p->policy = SCHED_NORMAL; 2565 p->policy = SCHED_NORMAL;
2566 p->normal_prio = p->static_prio; 2566 p->normal_prio = p->static_prio;
2567 } 2567 }
2568 2568
2569 if (PRIO_TO_NICE(p->static_prio) < 0) { 2569 if (PRIO_TO_NICE(p->static_prio) < 0) {
2570 p->static_prio = NICE_TO_PRIO(0); 2570 p->static_prio = NICE_TO_PRIO(0);
2571 p->normal_prio = p->static_prio; 2571 p->normal_prio = p->static_prio;
2572 set_load_weight(p); 2572 set_load_weight(p);
2573 } 2573 }
2574 2574
2575 /* 2575 /*
2576 * We don't need the reset flag anymore after the fork. It has 2576 * We don't need the reset flag anymore after the fork. It has
2577 * fulfilled its duty: 2577 * fulfilled its duty:
2578 */ 2578 */
2579 p->sched_reset_on_fork = 0; 2579 p->sched_reset_on_fork = 0;
2580 } 2580 }
2581 2581
2582 /* 2582 /*
2583 * Make sure we do not leak PI boosting priority to the child. 2583 * Make sure we do not leak PI boosting priority to the child.
2584 */ 2584 */
2585 p->prio = current->normal_prio; 2585 p->prio = current->normal_prio;
2586 2586
2587 if (!rt_prio(p->prio)) 2587 if (!rt_prio(p->prio))
2588 p->sched_class = &fair_sched_class; 2588 p->sched_class = &fair_sched_class;
2589 2589
2590 if (p->sched_class->task_fork) 2590 if (p->sched_class->task_fork)
2591 p->sched_class->task_fork(p); 2591 p->sched_class->task_fork(p);
2592 2592
2593 #ifdef CONFIG_SMP 2593 #ifdef CONFIG_SMP
2594 cpu = select_task_rq(p, SD_BALANCE_FORK, 0); 2594 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2595 #endif 2595 #endif
2596 set_task_cpu(p, cpu); 2596 set_task_cpu(p, cpu);
2597 2597
2598 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2598 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2599 if (likely(sched_info_on())) 2599 if (likely(sched_info_on()))
2600 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2600 memset(&p->sched_info, 0, sizeof(p->sched_info));
2601 #endif 2601 #endif
2602 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2602 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2603 p->oncpu = 0; 2603 p->oncpu = 0;
2604 #endif 2604 #endif
2605 #ifdef CONFIG_PREEMPT 2605 #ifdef CONFIG_PREEMPT
2606 /* Want to start with kernel preemption disabled. */ 2606 /* Want to start with kernel preemption disabled. */
2607 task_thread_info(p)->preempt_count = 1; 2607 task_thread_info(p)->preempt_count = 1;
2608 #endif 2608 #endif
2609 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2609 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2610 2610
2611 put_cpu(); 2611 put_cpu();
2612 } 2612 }
2613 2613
2614 /* 2614 /*
2615 * wake_up_new_task - wake up a newly created task for the first time. 2615 * wake_up_new_task - wake up a newly created task for the first time.
2616 * 2616 *
2617 * This function will do some initial scheduler statistics housekeeping 2617 * This function will do some initial scheduler statistics housekeeping
2618 * that must be done for every newly created context, then puts the task 2618 * that must be done for every newly created context, then puts the task
2619 * on the runqueue and wakes it. 2619 * on the runqueue and wakes it.
2620 */ 2620 */
2621 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2621 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2622 { 2622 {
2623 unsigned long flags; 2623 unsigned long flags;
2624 struct rq *rq; 2624 struct rq *rq;
2625 2625
2626 rq = task_rq_lock(p, &flags); 2626 rq = task_rq_lock(p, &flags);
2627 BUG_ON(p->state != TASK_RUNNING); 2627 BUG_ON(p->state != TASK_RUNNING);
2628 update_rq_clock(rq); 2628 update_rq_clock(rq);
2629 activate_task(rq, p, 0); 2629 activate_task(rq, p, 0);
2630 trace_sched_wakeup_new(rq, p, 1); 2630 trace_sched_wakeup_new(rq, p, 1);
2631 check_preempt_curr(rq, p, WF_FORK); 2631 check_preempt_curr(rq, p, WF_FORK);
2632 #ifdef CONFIG_SMP 2632 #ifdef CONFIG_SMP
2633 if (p->sched_class->task_wake_up) 2633 if (p->sched_class->task_wake_up)
2634 p->sched_class->task_wake_up(rq, p); 2634 p->sched_class->task_wake_up(rq, p);
2635 #endif 2635 #endif
2636 task_rq_unlock(rq, &flags); 2636 task_rq_unlock(rq, &flags);
2637 } 2637 }
2638 2638
2639 #ifdef CONFIG_PREEMPT_NOTIFIERS 2639 #ifdef CONFIG_PREEMPT_NOTIFIERS
2640 2640
2641 /** 2641 /**
2642 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2642 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2643 * @notifier: notifier struct to register 2643 * @notifier: notifier struct to register
2644 */ 2644 */
2645 void preempt_notifier_register(struct preempt_notifier *notifier) 2645 void preempt_notifier_register(struct preempt_notifier *notifier)
2646 { 2646 {
2647 hlist_add_head(&notifier->link, &current->preempt_notifiers); 2647 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2648 } 2648 }
2649 EXPORT_SYMBOL_GPL(preempt_notifier_register); 2649 EXPORT_SYMBOL_GPL(preempt_notifier_register);
2650 2650
2651 /** 2651 /**
2652 * preempt_notifier_unregister - no longer interested in preemption notifications 2652 * preempt_notifier_unregister - no longer interested in preemption notifications
2653 * @notifier: notifier struct to unregister 2653 * @notifier: notifier struct to unregister
2654 * 2654 *
2655 * This is safe to call from within a preemption notifier. 2655 * This is safe to call from within a preemption notifier.
2656 */ 2656 */
2657 void preempt_notifier_unregister(struct preempt_notifier *notifier) 2657 void preempt_notifier_unregister(struct preempt_notifier *notifier)
2658 { 2658 {
2659 hlist_del(&notifier->link); 2659 hlist_del(&notifier->link);
2660 } 2660 }
2661 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2661 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
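A minimal sketch of how a client hooks these callbacks via struct preempt_ops and preempt_notifier_init() from <linux/preempt.h> (KVM is the in-tree user; the my_* names are placeholders):

        static void my_sched_in(struct preempt_notifier *pn, int cpu)
        {
                /* current is being scheduled back in on @cpu */
        }

        static void my_sched_out(struct preempt_notifier *pn, struct task_struct *next)
        {
                /* current is being preempted in favour of @next */
        }

        static struct preempt_ops my_preempt_ops = {
                .sched_in       = my_sched_in,
                .sched_out      = my_sched_out,
        };

        struct preempt_notifier pn;

        preempt_notifier_init(&pn, &my_preempt_ops);
        preempt_notifier_register(&pn);         /* called from the task that wants callbacks */
        /* ... */
        preempt_notifier_unregister(&pn);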
2662 2662
2663 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2663 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2664 { 2664 {
2665 struct preempt_notifier *notifier; 2665 struct preempt_notifier *notifier;
2666 struct hlist_node *node; 2666 struct hlist_node *node;
2667 2667
2668 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2668 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2669 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2669 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2670 } 2670 }
2671 2671
2672 static void 2672 static void
2673 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2673 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2674 struct task_struct *next) 2674 struct task_struct *next)
2675 { 2675 {
2676 struct preempt_notifier *notifier; 2676 struct preempt_notifier *notifier;
2677 struct hlist_node *node; 2677 struct hlist_node *node;
2678 2678
2679 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2679 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2680 notifier->ops->sched_out(notifier, next); 2680 notifier->ops->sched_out(notifier, next);
2681 } 2681 }
2682 2682
2683 #else /* !CONFIG_PREEMPT_NOTIFIERS */ 2683 #else /* !CONFIG_PREEMPT_NOTIFIERS */
2684 2684
2685 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2685 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2686 { 2686 {
2687 } 2687 }
2688 2688
2689 static void 2689 static void
2690 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2690 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2691 struct task_struct *next) 2691 struct task_struct *next)
2692 { 2692 {
2693 } 2693 }
2694 2694
2695 #endif /* CONFIG_PREEMPT_NOTIFIERS */ 2695 #endif /* CONFIG_PREEMPT_NOTIFIERS */
2696 2696
2697 /** 2697 /**
2698 * prepare_task_switch - prepare to switch tasks 2698 * prepare_task_switch - prepare to switch tasks
2699 * @rq: the runqueue preparing to switch 2699 * @rq: the runqueue preparing to switch
2700 * @prev: the current task that is being switched out 2700 * @prev: the current task that is being switched out
2701 * @next: the task we are going to switch to. 2701 * @next: the task we are going to switch to.
2702 * 2702 *
2703 * This is called with the rq lock held and interrupts off. It must 2703 * This is called with the rq lock held and interrupts off. It must
2704 * be paired with a subsequent finish_task_switch after the context 2704 * be paired with a subsequent finish_task_switch after the context
2705 * switch. 2705 * switch.
2706 * 2706 *
2707 * prepare_task_switch sets up locking and calls architecture specific 2707 * prepare_task_switch sets up locking and calls architecture specific
2708 * hooks. 2708 * hooks.
2709 */ 2709 */
2710 static inline void 2710 static inline void
2711 prepare_task_switch(struct rq *rq, struct task_struct *prev, 2711 prepare_task_switch(struct rq *rq, struct task_struct *prev,
2712 struct task_struct *next) 2712 struct task_struct *next)
2713 { 2713 {
2714 fire_sched_out_preempt_notifiers(prev, next); 2714 fire_sched_out_preempt_notifiers(prev, next);
2715 prepare_lock_switch(rq, next); 2715 prepare_lock_switch(rq, next);
2716 prepare_arch_switch(next); 2716 prepare_arch_switch(next);
2717 } 2717 }
2718 2718
2719 /** 2719 /**
2720 * finish_task_switch - clean up after a task-switch 2720 * finish_task_switch - clean up after a task-switch
2721 * @rq: runqueue associated with task-switch 2721 * @rq: runqueue associated with task-switch
2722 * @prev: the thread we just switched away from. 2722 * @prev: the thread we just switched away from.
2723 * 2723 *
2724 * finish_task_switch must be called after the context switch, paired 2724 * finish_task_switch must be called after the context switch, paired
2725 * with a prepare_task_switch call before the context switch. 2725 * with a prepare_task_switch call before the context switch.
2726 * finish_task_switch will reconcile locking set up by prepare_task_switch, 2726 * finish_task_switch will reconcile locking set up by prepare_task_switch,
2727 * and do any other architecture-specific cleanup actions. 2727 * and do any other architecture-specific cleanup actions.
2728 * 2728 *
2729 * Note that we may have delayed dropping an mm in context_switch(). If 2729 * Note that we may have delayed dropping an mm in context_switch(). If
2730 * so, we finish that here outside of the runqueue lock. (Doing it 2730 * so, we finish that here outside of the runqueue lock. (Doing it
2731 * with the lock held can cause deadlocks; see schedule() for 2731 * with the lock held can cause deadlocks; see schedule() for
2732 * details.) 2732 * details.)
2733 */ 2733 */
2734 static void finish_task_switch(struct rq *rq, struct task_struct *prev) 2734 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2735 __releases(rq->lock) 2735 __releases(rq->lock)
2736 { 2736 {
2737 struct mm_struct *mm = rq->prev_mm; 2737 struct mm_struct *mm = rq->prev_mm;
2738 long prev_state; 2738 long prev_state;
2739 2739
2740 rq->prev_mm = NULL; 2740 rq->prev_mm = NULL;
2741 2741
2742 /* 2742 /*
2743 * A task struct has one reference for the use as "current". 2743 * A task struct has one reference for the use as "current".
2744 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 2744 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2745 * schedule one last time. The schedule call will never return, and 2745 * schedule one last time. The schedule call will never return, and
2746 * the scheduled task must drop that reference. 2746 * the scheduled task must drop that reference.
2747 * The test for TASK_DEAD must occur while the runqueue locks are 2747 * The test for TASK_DEAD must occur while the runqueue locks are
2748 * still held, otherwise prev could be scheduled on another cpu, die 2748 * still held, otherwise prev could be scheduled on another cpu, die
2749 * there before we look at prev->state, and then the reference would 2749 * there before we look at prev->state, and then the reference would
2750 * be dropped twice. 2750 * be dropped twice.
2751 * Manfred Spraul <manfred@colorfullife.com> 2751 * Manfred Spraul <manfred@colorfullife.com>
2752 */ 2752 */
2753 prev_state = prev->state; 2753 prev_state = prev->state;
2754 finish_arch_switch(prev); 2754 finish_arch_switch(prev);
2755 perf_event_task_sched_in(current, cpu_of(rq)); 2755 perf_event_task_sched_in(current, cpu_of(rq));
2756 finish_lock_switch(rq, prev); 2756 finish_lock_switch(rq, prev);
2757 2757
2758 fire_sched_in_preempt_notifiers(current); 2758 fire_sched_in_preempt_notifiers(current);
2759 if (mm) 2759 if (mm)
2760 mmdrop(mm); 2760 mmdrop(mm);
2761 if (unlikely(prev_state == TASK_DEAD)) { 2761 if (unlikely(prev_state == TASK_DEAD)) {
2762 /* 2762 /*
2763 * Remove function-return probe instances associated with this 2763 * Remove function-return probe instances associated with this
2764 * task and put them back on the free list. 2764 * task and put them back on the free list.
2765 */ 2765 */
2766 kprobe_flush_task(prev); 2766 kprobe_flush_task(prev);
2767 put_task_struct(prev); 2767 put_task_struct(prev);
2768 } 2768 }
2769 } 2769 }
2770 2770
2771 #ifdef CONFIG_SMP 2771 #ifdef CONFIG_SMP
2772 2772
2773 /* assumes rq->lock is held */ 2773 /* assumes rq->lock is held */
2774 static inline void pre_schedule(struct rq *rq, struct task_struct *prev) 2774 static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2775 { 2775 {
2776 if (prev->sched_class->pre_schedule) 2776 if (prev->sched_class->pre_schedule)
2777 prev->sched_class->pre_schedule(rq, prev); 2777 prev->sched_class->pre_schedule(rq, prev);
2778 } 2778 }
2779 2779
2780 /* rq->lock is NOT held, but preemption is disabled */ 2780 /* rq->lock is NOT held, but preemption is disabled */
2781 static inline void post_schedule(struct rq *rq) 2781 static inline void post_schedule(struct rq *rq)
2782 { 2782 {
2783 if (rq->post_schedule) { 2783 if (rq->post_schedule) {
2784 unsigned long flags; 2784 unsigned long flags;
2785 2785
2786 spin_lock_irqsave(&rq->lock, flags); 2786 spin_lock_irqsave(&rq->lock, flags);
2787 if (rq->curr->sched_class->post_schedule) 2787 if (rq->curr->sched_class->post_schedule)
2788 rq->curr->sched_class->post_schedule(rq); 2788 rq->curr->sched_class->post_schedule(rq);
2789 spin_unlock_irqrestore(&rq->lock, flags); 2789 spin_unlock_irqrestore(&rq->lock, flags);
2790 2790
2791 rq->post_schedule = 0; 2791 rq->post_schedule = 0;
2792 } 2792 }
2793 } 2793 }
2794 2794
2795 #else 2795 #else
2796 2796
2797 static inline void pre_schedule(struct rq *rq, struct task_struct *p) 2797 static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2798 { 2798 {
2799 } 2799 }
2800 2800
2801 static inline void post_schedule(struct rq *rq) 2801 static inline void post_schedule(struct rq *rq)
2802 { 2802 {
2803 } 2803 }
2804 2804
2805 #endif 2805 #endif
2806 2806
2807 /** 2807 /**
2808 * schedule_tail - first thing a freshly forked thread must call. 2808 * schedule_tail - first thing a freshly forked thread must call.
2809 * @prev: the thread we just switched away from. 2809 * @prev: the thread we just switched away from.
2810 */ 2810 */
2811 asmlinkage void schedule_tail(struct task_struct *prev) 2811 asmlinkage void schedule_tail(struct task_struct *prev)
2812 __releases(rq->lock) 2812 __releases(rq->lock)
2813 { 2813 {
2814 struct rq *rq = this_rq(); 2814 struct rq *rq = this_rq();
2815 2815
2816 finish_task_switch(rq, prev); 2816 finish_task_switch(rq, prev);
2817 2817
2818 /* 2818 /*
2819 * FIXME: do we need to worry about rq being invalidated by the 2819 * FIXME: do we need to worry about rq being invalidated by the
2820 * task_switch? 2820 * task_switch?
2821 */ 2821 */
2822 post_schedule(rq); 2822 post_schedule(rq);
2823 2823
2824 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 2824 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
2825 /* In this case, finish_task_switch does not reenable preemption */ 2825 /* In this case, finish_task_switch does not reenable preemption */
2826 preempt_enable(); 2826 preempt_enable();
2827 #endif 2827 #endif
2828 if (current->set_child_tid) 2828 if (current->set_child_tid)
2829 put_user(task_pid_vnr(current), current->set_child_tid); 2829 put_user(task_pid_vnr(current), current->set_child_tid);
2830 } 2830 }
2831 2831
2832 /* 2832 /*
2833 * context_switch - switch to the new MM and the new 2833 * context_switch - switch to the new MM and the new
2834 * thread's register state. 2834 * thread's register state.
2835 */ 2835 */
2836 static inline void 2836 static inline void
2837 context_switch(struct rq *rq, struct task_struct *prev, 2837 context_switch(struct rq *rq, struct task_struct *prev,
2838 struct task_struct *next) 2838 struct task_struct *next)
2839 { 2839 {
2840 struct mm_struct *mm, *oldmm; 2840 struct mm_struct *mm, *oldmm;
2841 2841
2842 prepare_task_switch(rq, prev, next); 2842 prepare_task_switch(rq, prev, next);
2843 trace_sched_switch(rq, prev, next); 2843 trace_sched_switch(rq, prev, next);
2844 mm = next->mm; 2844 mm = next->mm;
2845 oldmm = prev->active_mm; 2845 oldmm = prev->active_mm;
2846 /* 2846 /*
2847 * For paravirt, this is coupled with an exit in switch_to to 2847 * For paravirt, this is coupled with an exit in switch_to to
2848 * combine the page table reload and the switch backend into 2848 * combine the page table reload and the switch backend into
2849 * one hypercall. 2849 * one hypercall.
2850 */ 2850 */
2851 arch_start_context_switch(prev); 2851 arch_start_context_switch(prev);
2852 2852
2853 if (likely(!mm)) { 2853 if (likely(!mm)) {
2854 next->active_mm = oldmm; 2854 next->active_mm = oldmm;
2855 atomic_inc(&oldmm->mm_count); 2855 atomic_inc(&oldmm->mm_count);
2856 enter_lazy_tlb(oldmm, next); 2856 enter_lazy_tlb(oldmm, next);
2857 } else 2857 } else
2858 switch_mm(oldmm, mm, next); 2858 switch_mm(oldmm, mm, next);
2859 2859
2860 if (likely(!prev->mm)) { 2860 if (likely(!prev->mm)) {
2861 prev->active_mm = NULL; 2861 prev->active_mm = NULL;
2862 rq->prev_mm = oldmm; 2862 rq->prev_mm = oldmm;
2863 } 2863 }
2864 /* 2864 /*
2865 * The runqueue lock will be released by the next 2865 * The runqueue lock will be released by the next
2866 * task (which is an invalid locking op but in the case 2866 * task (which is an invalid locking op but in the case
2867 * of the scheduler it's an obvious special-case), so we 2867 * of the scheduler it's an obvious special-case), so we
2868 * do an early lockdep release here: 2868 * do an early lockdep release here:
2869 */ 2869 */
2870 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 2870 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
2871 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2871 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2872 #endif 2872 #endif
2873 2873
2874 /* Here we just switch the register state and the stack. */ 2874 /* Here we just switch the register state and the stack. */
2875 switch_to(prev, next, prev); 2875 switch_to(prev, next, prev);
2876 2876
2877 barrier(); 2877 barrier();
2878 /* 2878 /*
2879 * this_rq must be evaluated again because prev may have moved 2879 * this_rq must be evaluated again because prev may have moved
2880 * CPUs since it called schedule(), thus the 'rq' on its stack 2880 * CPUs since it called schedule(), thus the 'rq' on its stack
2881 * frame will be invalid. 2881 * frame will be invalid.
2882 */ 2882 */
2883 finish_task_switch(this_rq(), prev); 2883 finish_task_switch(this_rq(), prev);
2884 } 2884 }
2885 2885
2886 /* 2886 /*
2887 * nr_running, nr_uninterruptible and nr_context_switches: 2887 * nr_running, nr_uninterruptible and nr_context_switches:
2888 * 2888 *
2889 * externally visible scheduler statistics: current number of runnable 2889 * externally visible scheduler statistics: current number of runnable
2890 * threads, current number of uninterruptible-sleeping threads, total 2890 * threads, current number of uninterruptible-sleeping threads, total
2891 * number of context switches performed since bootup. 2891 * number of context switches performed since bootup.
2892 */ 2892 */
2893 unsigned long nr_running(void) 2893 unsigned long nr_running(void)
2894 { 2894 {
2895 unsigned long i, sum = 0; 2895 unsigned long i, sum = 0;
2896 2896
2897 for_each_online_cpu(i) 2897 for_each_online_cpu(i)
2898 sum += cpu_rq(i)->nr_running; 2898 sum += cpu_rq(i)->nr_running;
2899 2899
2900 return sum; 2900 return sum;
2901 } 2901 }
2902 2902
2903 unsigned long nr_uninterruptible(void) 2903 unsigned long nr_uninterruptible(void)
2904 { 2904 {
2905 unsigned long i, sum = 0; 2905 unsigned long i, sum = 0;
2906 2906
2907 for_each_possible_cpu(i) 2907 for_each_possible_cpu(i)
2908 sum += cpu_rq(i)->nr_uninterruptible; 2908 sum += cpu_rq(i)->nr_uninterruptible;
2909 2909
2910 /* 2910 /*
2911 * Since we read the counters lockless, it might be slightly 2911 * Since we read the counters lockless, it might be slightly
2912 * inaccurate. Do not allow it to go below zero though: 2912 * inaccurate. Do not allow it to go below zero though:
2913 */ 2913 */
2914 if (unlikely((long)sum < 0)) 2914 if (unlikely((long)sum < 0))
2915 sum = 0; 2915 sum = 0;
2916 2916
2917 return sum; 2917 return sum;
2918 } 2918 }
2919 2919
2920 unsigned long long nr_context_switches(void) 2920 unsigned long long nr_context_switches(void)
2921 { 2921 {
2922 int i; 2922 int i;
2923 unsigned long long sum = 0; 2923 unsigned long long sum = 0;
2924 2924
2925 for_each_possible_cpu(i) 2925 for_each_possible_cpu(i)
2926 sum += cpu_rq(i)->nr_switches; 2926 sum += cpu_rq(i)->nr_switches;
2927 2927
2928 return sum; 2928 return sum;
2929 } 2929 }
2930 2930
2931 unsigned long nr_iowait(void) 2931 unsigned long nr_iowait(void)
2932 { 2932 {
2933 unsigned long i, sum = 0; 2933 unsigned long i, sum = 0;
2934 2934
2935 for_each_possible_cpu(i) 2935 for_each_possible_cpu(i)
2936 sum += atomic_read(&cpu_rq(i)->nr_iowait); 2936 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2937 2937
2938 return sum; 2938 return sum;
2939 } 2939 }
2940 2940
2941 unsigned long nr_iowait_cpu(void) 2941 unsigned long nr_iowait_cpu(void)
2942 { 2942 {
2943 struct rq *this = this_rq(); 2943 struct rq *this = this_rq();
2944 return atomic_read(&this->nr_iowait); 2944 return atomic_read(&this->nr_iowait);
2945 } 2945 }
2946 2946
2947 unsigned long this_cpu_load(void) 2947 unsigned long this_cpu_load(void)
2948 { 2948 {
2949 struct rq *this = this_rq(); 2949 struct rq *this = this_rq();
2950 return this->cpu_load[0]; 2950 return this->cpu_load[0];
2951 } 2951 }
2952 2952
2953 2953
2954 /* Variables and functions for calc_load */ 2954 /* Variables and functions for calc_load */
2955 static atomic_long_t calc_load_tasks; 2955 static atomic_long_t calc_load_tasks;
2956 static unsigned long calc_load_update; 2956 static unsigned long calc_load_update;
2957 unsigned long avenrun[3]; 2957 unsigned long avenrun[3];
2958 EXPORT_SYMBOL(avenrun); 2958 EXPORT_SYMBOL(avenrun);
2959 2959
2960 /** 2960 /**
2961 * get_avenrun - get the load average array 2961 * get_avenrun - get the load average array
2962 * @loads: pointer to dest load array 2962 * @loads: pointer to dest load array
2963 * @offset: offset to add 2963 * @offset: offset to add
2964 * @shift: shift count to shift the result left 2964 * @shift: shift count to shift the result left
2965 * 2965 *
2966 * These values are estimates at best, so no need for locking. 2966 * These values are estimates at best, so no need for locking.
2967 */ 2967 */
2968 void get_avenrun(unsigned long *loads, unsigned long offset, int shift) 2968 void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2969 { 2969 {
2970 loads[0] = (avenrun[0] + offset) << shift; 2970 loads[0] = (avenrun[0] + offset) << shift;
2971 loads[1] = (avenrun[1] + offset) << shift; 2971 loads[1] = (avenrun[1] + offset) << shift;
2972 loads[2] = (avenrun[2] + offset) << shift; 2972 loads[2] = (avenrun[2] + offset) << shift;
2973 } 2973 }
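This is roughly how fs/proc/loadavg.c consumes it: add FIXED_1/200 (about 0.005) for rounding, shift by 0, then split each value into integer and fractional parts with the LOAD_INT()/LOAD_FRAC() helpers from <linux/sched.h> (sketch; the seq_file 'm' is assumed from the caller):

        unsigned long avnrun[3];

        get_avenrun(avnrun, FIXED_1/200, 0);
        seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu\n",
                   LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
                   LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
                   LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));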
2974 2974
2975 static unsigned long 2975 static unsigned long
2976 calc_load(unsigned long load, unsigned long exp, unsigned long active) 2976 calc_load(unsigned long load, unsigned long exp, unsigned long active)
2977 { 2977 {
2978 load *= exp; 2978 load *= exp;
2979 load += active * (FIXED_1 - exp); 2979 load += active * (FIXED_1 - exp);
2980 return load >> FSHIFT; 2980 return load >> FSHIFT;
2981 } 2981 }
2982 2982
2983 /* 2983 /*
2984 * calc_global_load - update the avenrun load estimates 10 ticks after the 2984 * calc_global_load - update the avenrun load estimates 10 ticks after the
2985 * CPUs have updated calc_load_tasks. 2985 * CPUs have updated calc_load_tasks.
2986 */ 2986 */
2987 void calc_global_load(void) 2987 void calc_global_load(void)
2988 { 2988 {
2989 unsigned long upd = calc_load_update + 10; 2989 unsigned long upd = calc_load_update + 10;
2990 long active; 2990 long active;
2991 2991
2992 if (time_before(jiffies, upd)) 2992 if (time_before(jiffies, upd))
2993 return; 2993 return;
2994 2994
2995 active = atomic_long_read(&calc_load_tasks); 2995 active = atomic_long_read(&calc_load_tasks);
2996 active = active > 0 ? active * FIXED_1 : 0; 2996 active = active > 0 ? active * FIXED_1 : 0;
2997 2997
2998 avenrun[0] = calc_load(avenrun[0], EXP_1, active); 2998 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2999 avenrun[1] = calc_load(avenrun[1], EXP_5, active); 2999 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
3000 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 3000 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
3001 3001
3002 calc_load_update += LOAD_FREQ; 3002 calc_load_update += LOAD_FREQ;
3003 } 3003 }
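Numerically, each avenrun[] slot is an exponentially weighted moving average updated every LOAD_FREQ (about 5 seconds) in FIXED_1 = 2^11 fixed point:

\[
\mathrm{load}_n = \mathrm{load}_{n-1}\, e^{-5/\tau} + \mathrm{active}\,\bigl(1 - e^{-5/\tau}\bigr),
\qquad \tau \in \{60, 300, 900\}\ \mathrm{s}
\]

so for the 1-minute average, exp = EXP_1 = 1884, since e^{-5/60} ≈ 0.92 and 0.92 × 2048 ≈ 1884; EXP_5 = 2014 and EXP_15 = 2037 encode the 5- and 15-minute decay factors the same way.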
3004 3004
3005 /* 3005 /*
3006 * Either called from update_cpu_load() or from a cpu going idle 3006 * Either called from update_cpu_load() or from a cpu going idle
3007 */ 3007 */
3008 static void calc_load_account_active(struct rq *this_rq) 3008 static void calc_load_account_active(struct rq *this_rq)
3009 { 3009 {
3010 long nr_active, delta; 3010 long nr_active, delta;
3011 3011
3012 nr_active = this_rq->nr_running; 3012 nr_active = this_rq->nr_running;
3013 nr_active += (long) this_rq->nr_uninterruptible; 3013 nr_active += (long) this_rq->nr_uninterruptible;
3014 3014
3015 if (nr_active != this_rq->calc_load_active) { 3015 if (nr_active != this_rq->calc_load_active) {
3016 delta = nr_active - this_rq->calc_load_active; 3016 delta = nr_active - this_rq->calc_load_active;
3017 this_rq->calc_load_active = nr_active; 3017 this_rq->calc_load_active = nr_active;
3018 atomic_long_add(delta, &calc_load_tasks); 3018 atomic_long_add(delta, &calc_load_tasks);
3019 } 3019 }
3020 } 3020 }
3021 3021
3022 /* 3022 /*
3023 * Update rq->cpu_load[] statistics. This function is usually called every 3023 * Update rq->cpu_load[] statistics. This function is usually called every
3024 * scheduler tick (TICK_NSEC). 3024 * scheduler tick (TICK_NSEC).
3025 */ 3025 */
3026 static void update_cpu_load(struct rq *this_rq) 3026 static void update_cpu_load(struct rq *this_rq)
3027 { 3027 {
3028 unsigned long this_load = this_rq->load.weight; 3028 unsigned long this_load = this_rq->load.weight;
3029 int i, scale; 3029 int i, scale;
3030 3030
3031 this_rq->nr_load_updates++; 3031 this_rq->nr_load_updates++;
3032 3032
3033 /* Update our load: */ 3033 /* Update our load: */
3034 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3034 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3035 unsigned long old_load, new_load; 3035 unsigned long old_load, new_load;
3036 3036
3037 /* scale is effectively 1 << i now, and >> i divides by scale */ 3037 /* scale is effectively 1 << i now, and >> i divides by scale */
3038 3038
3039 old_load = this_rq->cpu_load[i]; 3039 old_load = this_rq->cpu_load[i];
3040 new_load = this_load; 3040 new_load = this_load;
3041 /* 3041 /*
3042 * Round up the averaging division if load is increasing. This 3042 * Round up the averaging division if load is increasing. This
3043 * prevents us from getting stuck on 9 if the load is 10, for 3043 * prevents us from getting stuck on 9 if the load is 10, for
3044 * example. 3044 * example.
3045 */ 3045 */
3046 if (new_load > old_load) 3046 if (new_load > old_load)
3047 new_load += scale-1; 3047 new_load += scale-1;
3048 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3048 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3049 } 3049 }
3050 3050
3051 if (time_after_eq(jiffies, this_rq->calc_load_update)) { 3051 if (time_after_eq(jiffies, this_rq->calc_load_update)) {
3052 this_rq->calc_load_update += LOAD_FREQ; 3052 this_rq->calc_load_update += LOAD_FREQ;
3053 calc_load_account_active(this_rq); 3053 calc_load_account_active(this_rq);
3054 } 3054 }
3055 } 3055 }
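Each cpu_load[] index is a progressively slower low-pass filter: cpu_load[i] = (old * (2^i - 1) + new) >> i, with the round-up tweak above so a rising load reaches the new value instead of stalling one below it. A stand-alone model of this loop (CPU_LOAD_IDX_MAX = 5 is an assumption matching the array it walks) shows how a load step of 10 propagates through the indices:

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

static void update_load_model(unsigned long cpu_load[], unsigned long this_load)
{
	unsigned long scale;
	int i;

	for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
		unsigned long old_load = cpu_load[i], new_load = this_load;

		if (new_load > old_load)		/* round up while rising */
			new_load += scale - 1;
		cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
	}
}

int main(void)
{
	unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 0 };
	int tick;

	for (tick = 1; tick <= 8; tick++) {
		update_load_model(cpu_load, 10);	/* load jumps to 10 and stays */
		printf("tick %d: %lu %lu %lu %lu %lu\n", tick,
		       cpu_load[0], cpu_load[1], cpu_load[2],
		       cpu_load[3], cpu_load[4]);
	}
	return 0;
}

Index 0 tracks the instantaneous value immediately while index 4 still lags after eight ticks; get_sd_load_idx() further down selects which of these histories the balancer consults.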
3056 3056
3057 #ifdef CONFIG_SMP 3057 #ifdef CONFIG_SMP
3058 3058
3059 /* 3059 /*
3060 * double_rq_lock - safely lock two runqueues 3060 * double_rq_lock - safely lock two runqueues
3061 * 3061 *
3062 * Note this does not disable interrupts like task_rq_lock; 3062 * Note this does not disable interrupts like task_rq_lock;
3063 * you need to do so manually before calling. 3063 * you need to do so manually before calling.
3064 */ 3064 */
3065 static void double_rq_lock(struct rq *rq1, struct rq *rq2) 3065 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3066 __acquires(rq1->lock) 3066 __acquires(rq1->lock)
3067 __acquires(rq2->lock) 3067 __acquires(rq2->lock)
3068 { 3068 {
3069 BUG_ON(!irqs_disabled()); 3069 BUG_ON(!irqs_disabled());
3070 if (rq1 == rq2) { 3070 if (rq1 == rq2) {
3071 spin_lock(&rq1->lock); 3071 spin_lock(&rq1->lock);
3072 __acquire(rq2->lock); /* Fake it out ;) */ 3072 __acquire(rq2->lock); /* Fake it out ;) */
3073 } else { 3073 } else {
3074 if (rq1 < rq2) { 3074 if (rq1 < rq2) {
3075 spin_lock(&rq1->lock); 3075 spin_lock(&rq1->lock);
3076 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); 3076 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3077 } else { 3077 } else {
3078 spin_lock(&rq2->lock); 3078 spin_lock(&rq2->lock);
3079 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 3079 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3080 } 3080 }
3081 } 3081 }
3082 update_rq_clock(rq1); 3082 update_rq_clock(rq1);
3083 update_rq_clock(rq2); 3083 update_rq_clock(rq2);
3084 } 3084 }
3085 3085
3086 /* 3086 /*
3087 * double_rq_unlock - safely unlock two runqueues 3087 * double_rq_unlock - safely unlock two runqueues
3088 * 3088 *
3089 * Note this does not restore interrupts like task_rq_unlock; 3089 * Note this does not restore interrupts like task_rq_unlock;
3090 * you need to do so manually after calling. 3090 * you need to do so manually after calling.
3091 */ 3091 */
3092 static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 3092 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3093 __releases(rq1->lock) 3093 __releases(rq1->lock)
3094 __releases(rq2->lock) 3094 __releases(rq2->lock)
3095 { 3095 {
3096 spin_unlock(&rq1->lock); 3096 spin_unlock(&rq1->lock);
3097 if (rq1 != rq2) 3097 if (rq1 != rq2)
3098 spin_unlock(&rq2->lock); 3098 spin_unlock(&rq2->lock);
3099 else 3099 else
3100 __release(rq2->lock); 3100 __release(rq2->lock);
3101 } 3101 }
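Taking the two locks in a fixed order (lower runqueue address first) is what rules out an ABBA deadlock when two CPUs try to lock the same pair of runqueues from opposite ends. A minimal user-space rendering of the same idiom with pthreads (the fake_rq type and function names are made up for illustration):

#include <pthread.h>

struct fake_rq {
	pthread_mutex_t lock;
};

/* lock both runqueues; always take the lower address first */
static void double_lock(struct fake_rq *rq1, struct fake_rq *rq2)
{
	if (rq1 == rq2) {
		pthread_mutex_lock(&rq1->lock);
	} else if (rq1 < rq2) {
		pthread_mutex_lock(&rq1->lock);
		pthread_mutex_lock(&rq2->lock);
	} else {
		pthread_mutex_lock(&rq2->lock);
		pthread_mutex_lock(&rq1->lock);
	}
}

static void double_unlock(struct fake_rq *rq1, struct fake_rq *rq2)
{
	pthread_mutex_unlock(&rq1->lock);
	if (rq1 != rq2)
		pthread_mutex_unlock(&rq2->lock);
}

int main(void)
{
	struct fake_rq a = { PTHREAD_MUTEX_INITIALIZER };
	struct fake_rq b = { PTHREAD_MUTEX_INITIALIZER };

	/* both call orders acquire the mutexes in the same sequence */
	double_lock(&a, &b);
	double_unlock(&a, &b);
	double_lock(&b, &a);
	double_unlock(&b, &a);
	return 0;
}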
3102 3102
3103 /* 3103 /*
3104 * If dest_cpu is allowed for this process, migrate the task to it. 3104 * If dest_cpu is allowed for this process, migrate the task to it.
3105 * This is accomplished by forcing the cpu_allowed mask to only 3105 * This is accomplished by forcing the cpu_allowed mask to only
3106 * allow dest_cpu, which will force the task onto dest_cpu. Then 3106 * allow dest_cpu, which will force the task onto dest_cpu. Then
3107 * the cpu_allowed mask is restored. 3107 * the cpu_allowed mask is restored.
3108 */ 3108 */
3109 static void sched_migrate_task(struct task_struct *p, int dest_cpu) 3109 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
3110 { 3110 {
3111 struct migration_req req; 3111 struct migration_req req;
3112 unsigned long flags; 3112 unsigned long flags;
3113 struct rq *rq; 3113 struct rq *rq;
3114 3114
3115 rq = task_rq_lock(p, &flags); 3115 rq = task_rq_lock(p, &flags);
3116 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3116 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
3117 || unlikely(!cpu_active(dest_cpu))) 3117 || unlikely(!cpu_active(dest_cpu)))
3118 goto out; 3118 goto out;
3119 3119
3120 /* force the process onto the specified CPU */ 3120 /* force the process onto the specified CPU */
3121 if (migrate_task(p, dest_cpu, &req)) { 3121 if (migrate_task(p, dest_cpu, &req)) {
3122 /* Need to wait for migration thread (might exit: take ref). */ 3122 /* Need to wait for migration thread (might exit: take ref). */
3123 struct task_struct *mt = rq->migration_thread; 3123 struct task_struct *mt = rq->migration_thread;
3124 3124
3125 get_task_struct(mt); 3125 get_task_struct(mt);
3126 task_rq_unlock(rq, &flags); 3126 task_rq_unlock(rq, &flags);
3127 wake_up_process(mt); 3127 wake_up_process(mt);
3128 put_task_struct(mt); 3128 put_task_struct(mt);
3129 wait_for_completion(&req.done); 3129 wait_for_completion(&req.done);
3130 3130
3131 return; 3131 return;
3132 } 3132 }
3133 out: 3133 out:
3134 task_rq_unlock(rq, &flags); 3134 task_rq_unlock(rq, &flags);
3135 } 3135 }
3136 3136
3137 /* 3137 /*
3138 * sched_exec - execve() is a valuable balancing opportunity, because at 3138 * sched_exec - execve() is a valuable balancing opportunity, because at
3139 * this point the task has the smallest effective memory and cache footprint. 3139 * this point the task has the smallest effective memory and cache footprint.
3140 */ 3140 */
3141 void sched_exec(void) 3141 void sched_exec(void)
3142 { 3142 {
3143 int new_cpu, this_cpu = get_cpu(); 3143 int new_cpu, this_cpu = get_cpu();
3144 new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0); 3144 new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0);
3145 put_cpu(); 3145 put_cpu();
3146 if (new_cpu != this_cpu) 3146 if (new_cpu != this_cpu)
3147 sched_migrate_task(current, new_cpu); 3147 sched_migrate_task(current, new_cpu);
3148 } 3148 }
3149 3149
3150 /* 3150 /*
3151 * pull_task - move a task from a remote runqueue to the local runqueue. 3151 * pull_task - move a task from a remote runqueue to the local runqueue.
3152 * Both runqueues must be locked. 3152 * Both runqueues must be locked.
3153 */ 3153 */
3154 static void pull_task(struct rq *src_rq, struct task_struct *p, 3154 static void pull_task(struct rq *src_rq, struct task_struct *p,
3155 struct rq *this_rq, int this_cpu) 3155 struct rq *this_rq, int this_cpu)
3156 { 3156 {
3157 deactivate_task(src_rq, p, 0); 3157 deactivate_task(src_rq, p, 0);
3158 set_task_cpu(p, this_cpu); 3158 set_task_cpu(p, this_cpu);
3159 activate_task(this_rq, p, 0); 3159 activate_task(this_rq, p, 0);
3160 check_preempt_curr(this_rq, p, 0); 3160 check_preempt_curr(this_rq, p, 0);
3161 } 3161 }
3162 3162
3163 /* 3163 /*
3164 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 3164 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3165 */ 3165 */
3166 static 3166 static
3167 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 3167 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3168 struct sched_domain *sd, enum cpu_idle_type idle, 3168 struct sched_domain *sd, enum cpu_idle_type idle,
3169 int *all_pinned) 3169 int *all_pinned)
3170 { 3170 {
3171 int tsk_cache_hot = 0; 3171 int tsk_cache_hot = 0;
3172 /* 3172 /*
3173 * We do not migrate tasks that: 3173 * We do not migrate tasks that:
3174 * 1) are running (obviously), or 3174 * 1) are running (obviously), or
3175 * 2) cannot be migrated to this CPU due to cpus_allowed, or 3175 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3176 * 3) are cache-hot on their current CPU. 3176 * 3) are cache-hot on their current CPU.
3177 */ 3177 */
3178 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { 3178 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3179 schedstat_inc(p, se.nr_failed_migrations_affine); 3179 schedstat_inc(p, se.nr_failed_migrations_affine);
3180 return 0; 3180 return 0;
3181 } 3181 }
3182 *all_pinned = 0; 3182 *all_pinned = 0;
3183 3183
3184 if (task_running(rq, p)) { 3184 if (task_running(rq, p)) {
3185 schedstat_inc(p, se.nr_failed_migrations_running); 3185 schedstat_inc(p, se.nr_failed_migrations_running);
3186 return 0; 3186 return 0;
3187 } 3187 }
3188 3188
3189 /* 3189 /*
3190 * Aggressive migration if: 3190 * Aggressive migration if:
3191 * 1) task is cache cold, or 3191 * 1) task is cache cold, or
3192 * 2) too many balance attempts have failed. 3192 * 2) too many balance attempts have failed.
3193 */ 3193 */
3194 3194
3195 tsk_cache_hot = task_hot(p, rq->clock, sd); 3195 tsk_cache_hot = task_hot(p, rq->clock, sd);
3196 if (!tsk_cache_hot || 3196 if (!tsk_cache_hot ||
3197 sd->nr_balance_failed > sd->cache_nice_tries) { 3197 sd->nr_balance_failed > sd->cache_nice_tries) {
3198 #ifdef CONFIG_SCHEDSTATS 3198 #ifdef CONFIG_SCHEDSTATS
3199 if (tsk_cache_hot) { 3199 if (tsk_cache_hot) {
3200 schedstat_inc(sd, lb_hot_gained[idle]); 3200 schedstat_inc(sd, lb_hot_gained[idle]);
3201 schedstat_inc(p, se.nr_forced_migrations); 3201 schedstat_inc(p, se.nr_forced_migrations);
3202 } 3202 }
3203 #endif 3203 #endif
3204 return 1; 3204 return 1;
3205 } 3205 }
3206 3206
3207 if (tsk_cache_hot) { 3207 if (tsk_cache_hot) {
3208 schedstat_inc(p, se.nr_failed_migrations_hot); 3208 schedstat_inc(p, se.nr_failed_migrations_hot);
3209 return 0; 3209 return 0;
3210 } 3210 }
3211 return 1; 3211 return 1;
3212 } 3212 }
3213 3213
3214 static unsigned long 3214 static unsigned long
3215 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3215 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3216 unsigned long max_load_move, struct sched_domain *sd, 3216 unsigned long max_load_move, struct sched_domain *sd,
3217 enum cpu_idle_type idle, int *all_pinned, 3217 enum cpu_idle_type idle, int *all_pinned,
3218 int *this_best_prio, struct rq_iterator *iterator) 3218 int *this_best_prio, struct rq_iterator *iterator)
3219 { 3219 {
3220 int loops = 0, pulled = 0, pinned = 0; 3220 int loops = 0, pulled = 0, pinned = 0;
3221 struct task_struct *p; 3221 struct task_struct *p;
3222 long rem_load_move = max_load_move; 3222 long rem_load_move = max_load_move;
3223 3223
3224 if (max_load_move == 0) 3224 if (max_load_move == 0)
3225 goto out; 3225 goto out;
3226 3226
3227 pinned = 1; 3227 pinned = 1;
3228 3228
3229 /* 3229 /*
3230 * Start the load-balancing iterator: 3230 * Start the load-balancing iterator:
3231 */ 3231 */
3232 p = iterator->start(iterator->arg); 3232 p = iterator->start(iterator->arg);
3233 next: 3233 next:
3234 if (!p || loops++ > sysctl_sched_nr_migrate) 3234 if (!p || loops++ > sysctl_sched_nr_migrate)
3235 goto out; 3235 goto out;
3236 3236
3237 if ((p->se.load.weight >> 1) > rem_load_move || 3237 if ((p->se.load.weight >> 1) > rem_load_move ||
3238 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { 3238 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3239 p = iterator->next(iterator->arg); 3239 p = iterator->next(iterator->arg);
3240 goto next; 3240 goto next;
3241 } 3241 }
3242 3242
3243 pull_task(busiest, p, this_rq, this_cpu); 3243 pull_task(busiest, p, this_rq, this_cpu);
3244 pulled++; 3244 pulled++;
3245 rem_load_move -= p->se.load.weight; 3245 rem_load_move -= p->se.load.weight;
3246 3246
3247 #ifdef CONFIG_PREEMPT 3247 #ifdef CONFIG_PREEMPT
3248 /* 3248 /*
3249 * NEWIDLE balancing is a source of latency, so preemptible kernels 3249 * NEWIDLE balancing is a source of latency, so preemptible kernels
3250 * will stop after the first task is pulled to minimize the critical 3250 * will stop after the first task is pulled to minimize the critical
3251 * section. 3251 * section.
3252 */ 3252 */
3253 if (idle == CPU_NEWLY_IDLE) 3253 if (idle == CPU_NEWLY_IDLE)
3254 goto out; 3254 goto out;
3255 #endif 3255 #endif
3256 3256
3257 /* 3257 /*
3258 * We only want to steal up to the prescribed amount of weighted load. 3258 * We only want to steal up to the prescribed amount of weighted load.
3259 */ 3259 */
3260 if (rem_load_move > 0) { 3260 if (rem_load_move > 0) {
3261 if (p->prio < *this_best_prio) 3261 if (p->prio < *this_best_prio)
3262 *this_best_prio = p->prio; 3262 *this_best_prio = p->prio;
3263 p = iterator->next(iterator->arg); 3263 p = iterator->next(iterator->arg);
3264 goto next; 3264 goto next;
3265 } 3265 }
3266 out: 3266 out:
3267 /* 3267 /*
3268 * Right now, this is one of only two places pull_task() is called, 3268 * Right now, this is one of only two places pull_task() is called,
3269 * so we can safely collect pull_task() stats here rather than 3269 * so we can safely collect pull_task() stats here rather than
3270 * inside pull_task(). 3270 * inside pull_task().
3271 */ 3271 */
3272 schedstat_add(sd, lb_gained[idle], pulled); 3272 schedstat_add(sd, lb_gained[idle], pulled);
3273 3273
3274 if (all_pinned) 3274 if (all_pinned)
3275 *all_pinned = pinned; 3275 *all_pinned = pinned;
3276 3276
3277 return max_load_move - rem_load_move; 3277 return max_load_move - rem_load_move;
3278 } 3278 }
3279 3279
3280 /* 3280 /*
3281 * move_tasks tries to move up to max_load_move weighted load from busiest to 3281 * move_tasks tries to move up to max_load_move weighted load from busiest to
3282 * this_rq, as part of a balancing operation within domain "sd". 3282 * this_rq, as part of a balancing operation within domain "sd".
3283 * Returns 1 if successful and 0 otherwise. 3283 * Returns 1 if successful and 0 otherwise.
3284 * 3284 *
3285 * Called with both runqueues locked. 3285 * Called with both runqueues locked.
3286 */ 3286 */
3287 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3287 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3288 unsigned long max_load_move, 3288 unsigned long max_load_move,
3289 struct sched_domain *sd, enum cpu_idle_type idle, 3289 struct sched_domain *sd, enum cpu_idle_type idle,
3290 int *all_pinned) 3290 int *all_pinned)
3291 { 3291 {
3292 const struct sched_class *class = sched_class_highest; 3292 const struct sched_class *class = sched_class_highest;
3293 unsigned long total_load_moved = 0; 3293 unsigned long total_load_moved = 0;
3294 int this_best_prio = this_rq->curr->prio; 3294 int this_best_prio = this_rq->curr->prio;
3295 3295
3296 do { 3296 do {
3297 total_load_moved += 3297 total_load_moved +=
3298 class->load_balance(this_rq, this_cpu, busiest, 3298 class->load_balance(this_rq, this_cpu, busiest,
3299 max_load_move - total_load_moved, 3299 max_load_move - total_load_moved,
3300 sd, idle, all_pinned, &this_best_prio); 3300 sd, idle, all_pinned, &this_best_prio);
3301 class = class->next; 3301 class = class->next;
3302 3302
3303 #ifdef CONFIG_PREEMPT 3303 #ifdef CONFIG_PREEMPT
3304 /* 3304 /*
3305 * NEWIDLE balancing is a source of latency, so preemptible 3305 * NEWIDLE balancing is a source of latency, so preemptible
3306 * kernels will stop after the first task is pulled to minimize 3306 * kernels will stop after the first task is pulled to minimize
3307 * the critical section. 3307 * the critical section.
3308 */ 3308 */
3309 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) 3309 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3310 break; 3310 break;
3311 #endif 3311 #endif
3312 } while (class && max_load_move > total_load_moved); 3312 } while (class && max_load_move > total_load_moved);
3313 3313
3314 return total_load_moved > 0; 3314 return total_load_moved > 0;
3315 } 3315 }
3316 3316
3317 static int 3317 static int
3318 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, 3318 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3319 struct sched_domain *sd, enum cpu_idle_type idle, 3319 struct sched_domain *sd, enum cpu_idle_type idle,
3320 struct rq_iterator *iterator) 3320 struct rq_iterator *iterator)
3321 { 3321 {
3322 struct task_struct *p = iterator->start(iterator->arg); 3322 struct task_struct *p = iterator->start(iterator->arg);
3323 int pinned = 0; 3323 int pinned = 0;
3324 3324
3325 while (p) { 3325 while (p) {
3326 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { 3326 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3327 pull_task(busiest, p, this_rq, this_cpu); 3327 pull_task(busiest, p, this_rq, this_cpu);
3328 /* 3328 /*
3329 * Right now, this is only the second place pull_task() 3329 * Right now, this is only the second place pull_task()
3330 * is called, so we can safely collect pull_task() 3330 * is called, so we can safely collect pull_task()
3331 * stats here rather than inside pull_task(). 3331 * stats here rather than inside pull_task().
3332 */ 3332 */
3333 schedstat_inc(sd, lb_gained[idle]); 3333 schedstat_inc(sd, lb_gained[idle]);
3334 3334
3335 return 1; 3335 return 1;
3336 } 3336 }
3337 p = iterator->next(iterator->arg); 3337 p = iterator->next(iterator->arg);
3338 } 3338 }
3339 3339
3340 return 0; 3340 return 0;
3341 } 3341 }
3342 3342
3343 /* 3343 /*
3344 * move_one_task tries to move exactly one task from busiest to this_rq, as 3344 * move_one_task tries to move exactly one task from busiest to this_rq, as
3345 * part of active balancing operations within "domain". 3345 * part of active balancing operations within "domain".
3346 * Returns 1 if successful and 0 otherwise. 3346 * Returns 1 if successful and 0 otherwise.
3347 * 3347 *
3348 * Called with both runqueues locked. 3348 * Called with both runqueues locked.
3349 */ 3349 */
3350 static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, 3350 static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3351 struct sched_domain *sd, enum cpu_idle_type idle) 3351 struct sched_domain *sd, enum cpu_idle_type idle)
3352 { 3352 {
3353 const struct sched_class *class; 3353 const struct sched_class *class;
3354 3354
3355 for_each_class(class) { 3355 for_each_class(class) {
3356 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) 3356 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3357 return 1; 3357 return 1;
3358 } 3358 }
3359 3359
3360 return 0; 3360 return 0;
3361 } 3361 }
3362 /********** Helpers for find_busiest_group ************************/ 3362 /********** Helpers for find_busiest_group ************************/
3363 /* 3363 /*
3364 * sd_lb_stats - Structure to store the statistics of a sched_domain 3364 * sd_lb_stats - Structure to store the statistics of a sched_domain
3365 * during load balancing. 3365 * during load balancing.
3366 */ 3366 */
3367 struct sd_lb_stats { 3367 struct sd_lb_stats {
3368 struct sched_group *busiest; /* Busiest group in this sd */ 3368 struct sched_group *busiest; /* Busiest group in this sd */
3369 struct sched_group *this; /* Local group in this sd */ 3369 struct sched_group *this; /* Local group in this sd */
3370 unsigned long total_load; /* Total load of all groups in sd */ 3370 unsigned long total_load; /* Total load of all groups in sd */
3371 unsigned long total_pwr; /* Total power of all groups in sd */ 3371 unsigned long total_pwr; /* Total power of all groups in sd */
3372 unsigned long avg_load; /* Average load across all groups in sd */ 3372 unsigned long avg_load; /* Average load across all groups in sd */
3373 3373
3374 /** Statistics of this group */ 3374 /** Statistics of this group */
3375 unsigned long this_load; 3375 unsigned long this_load;
3376 unsigned long this_load_per_task; 3376 unsigned long this_load_per_task;
3377 unsigned long this_nr_running; 3377 unsigned long this_nr_running;
3378 3378
3379 /* Statistics of the busiest group */ 3379 /* Statistics of the busiest group */
3380 unsigned long max_load; 3380 unsigned long max_load;
3381 unsigned long busiest_load_per_task; 3381 unsigned long busiest_load_per_task;
3382 unsigned long busiest_nr_running; 3382 unsigned long busiest_nr_running;
3383 3383
3384 int group_imb; /* Is there imbalance in this sd */ 3384 int group_imb; /* Is there imbalance in this sd */
3385 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3385 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3386 int power_savings_balance; /* Is powersave balance needed for this sd */ 3386 int power_savings_balance; /* Is powersave balance needed for this sd */
3387 struct sched_group *group_min; /* Least loaded group in sd */ 3387 struct sched_group *group_min; /* Least loaded group in sd */
3388 struct sched_group *group_leader; /* Group which relieves group_min */ 3388 struct sched_group *group_leader; /* Group which relieves group_min */
3389 unsigned long min_load_per_task; /* load_per_task in group_min */ 3389 unsigned long min_load_per_task; /* load_per_task in group_min */
3390 unsigned long leader_nr_running; /* Nr running of group_leader */ 3390 unsigned long leader_nr_running; /* Nr running of group_leader */
3391 unsigned long min_nr_running; /* Nr running of group_min */ 3391 unsigned long min_nr_running; /* Nr running of group_min */
3392 #endif 3392 #endif
3393 }; 3393 };
3394 3394
3395 /* 3395 /*
3396 * sg_lb_stats - stats of a sched_group required for load_balancing 3396 * sg_lb_stats - stats of a sched_group required for load_balancing
3397 */ 3397 */
3398 struct sg_lb_stats { 3398 struct sg_lb_stats {
3399 unsigned long avg_load; /* Avg load across the CPUs of the group */ 3399 unsigned long avg_load; /* Avg load across the CPUs of the group */
3400 unsigned long group_load; /* Total load over the CPUs of the group */ 3400 unsigned long group_load; /* Total load over the CPUs of the group */
3401 unsigned long sum_nr_running; /* Nr tasks running in the group */ 3401 unsigned long sum_nr_running; /* Nr tasks running in the group */
3402 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 3402 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3403 unsigned long group_capacity; 3403 unsigned long group_capacity;
3404 int group_imb; /* Is there an imbalance in the group ? */ 3404 int group_imb; /* Is there an imbalance in the group ? */
3405 }; 3405 };
3406 3406
3407 /** 3407 /**
3408 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. 3408 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3409 * @group: The group whose first cpu is to be returned. 3409 * @group: The group whose first cpu is to be returned.
3410 */ 3410 */
3411 static inline unsigned int group_first_cpu(struct sched_group *group) 3411 static inline unsigned int group_first_cpu(struct sched_group *group)
3412 { 3412 {
3413 return cpumask_first(sched_group_cpus(group)); 3413 return cpumask_first(sched_group_cpus(group));
3414 } 3414 }
3415 3415
3416 /** 3416 /**
3417 * get_sd_load_idx - Obtain the load index for a given sched domain. 3417 * get_sd_load_idx - Obtain the load index for a given sched domain.
3418 * @sd: The sched_domain whose load_idx is to be obtained. 3418 * @sd: The sched_domain whose load_idx is to be obtained.
3419 * @idle: The idle status of the CPU whose sd load_idx is being obtained. 3419 * @idle: The idle status of the CPU whose sd load_idx is being obtained.
3420 */ 3420 */
3421 static inline int get_sd_load_idx(struct sched_domain *sd, 3421 static inline int get_sd_load_idx(struct sched_domain *sd,
3422 enum cpu_idle_type idle) 3422 enum cpu_idle_type idle)
3423 { 3423 {
3424 int load_idx; 3424 int load_idx;
3425 3425
3426 switch (idle) { 3426 switch (idle) {
3427 case CPU_NOT_IDLE: 3427 case CPU_NOT_IDLE:
3428 load_idx = sd->busy_idx; 3428 load_idx = sd->busy_idx;
3429 break; 3429 break;
3430 3430
3431 case CPU_NEWLY_IDLE: 3431 case CPU_NEWLY_IDLE:
3432 load_idx = sd->newidle_idx; 3432 load_idx = sd->newidle_idx;
3433 break; 3433 break;
3434 default: 3434 default:
3435 load_idx = sd->idle_idx; 3435 load_idx = sd->idle_idx;
3436 break; 3436 break;
3437 } 3437 }
3438 3438
3439 return load_idx; 3439 return load_idx;
3440 } 3440 }
3441 3441
3442 3442
3443 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3443 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3444 /** 3444 /**
3445 * init_sd_power_savings_stats - Initialize power savings statistics for 3445 * init_sd_power_savings_stats - Initialize power savings statistics for
3446 * the given sched_domain, during load balancing. 3446 * the given sched_domain, during load balancing.
3447 * 3447 *
3448 * @sd: Sched domain whose power-savings statistics are to be initialized. 3448 * @sd: Sched domain whose power-savings statistics are to be initialized.
3449 * @sds: Variable containing the statistics for sd. 3449 * @sds: Variable containing the statistics for sd.
3450 * @idle: Idle status of the CPU at which we're performing load-balancing. 3450 * @idle: Idle status of the CPU at which we're performing load-balancing.
3451 */ 3451 */
3452 static inline void init_sd_power_savings_stats(struct sched_domain *sd, 3452 static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3453 struct sd_lb_stats *sds, enum cpu_idle_type idle) 3453 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3454 { 3454 {
3455 /* 3455 /*
3456 * Busy processors will not participate in power savings 3456 * Busy processors will not participate in power savings
3457 * balance. 3457 * balance.
3458 */ 3458 */
3459 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 3459 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3460 sds->power_savings_balance = 0; 3460 sds->power_savings_balance = 0;
3461 else { 3461 else {
3462 sds->power_savings_balance = 1; 3462 sds->power_savings_balance = 1;
3463 sds->min_nr_running = ULONG_MAX; 3463 sds->min_nr_running = ULONG_MAX;
3464 sds->leader_nr_running = 0; 3464 sds->leader_nr_running = 0;
3465 } 3465 }
3466 } 3466 }
3467 3467
3468 /** 3468 /**
3469 * update_sd_power_savings_stats - Update the power saving stats for a 3469 * update_sd_power_savings_stats - Update the power saving stats for a
3470 * sched_domain while performing load balancing. 3470 * sched_domain while performing load balancing.
3471 * 3471 *
3472 * @group: sched_group belonging to the sched_domain under consideration. 3472 * @group: sched_group belonging to the sched_domain under consideration.
3473 * @sds: Variable containing the statistics of the sched_domain 3473 * @sds: Variable containing the statistics of the sched_domain
3474 * @local_group: Does group contain the CPU for which we're performing 3474 * @local_group: Does group contain the CPU for which we're performing
3475 * load balancing ? 3475 * load balancing ?
3476 * @sgs: Variable containing the statistics of the group. 3476 * @sgs: Variable containing the statistics of the group.
3477 */ 3477 */
3478 static inline void update_sd_power_savings_stats(struct sched_group *group, 3478 static inline void update_sd_power_savings_stats(struct sched_group *group,
3479 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) 3479 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3480 { 3480 {
3481 3481
3482 if (!sds->power_savings_balance) 3482 if (!sds->power_savings_balance)
3483 return; 3483 return;
3484 3484
3485 /* 3485 /*
3486 * If the local group is idle or completely loaded 3486 * If the local group is idle or completely loaded
3487 * no need to do power savings balance at this domain 3487 * no need to do power savings balance at this domain
3488 */ 3488 */
3489 if (local_group && (sds->this_nr_running >= sgs->group_capacity || 3489 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3490 !sds->this_nr_running)) 3490 !sds->this_nr_running))
3491 sds->power_savings_balance = 0; 3491 sds->power_savings_balance = 0;
3492 3492
3493 /* 3493 /*
3494 * If a group is already running at full capacity or idle, 3494 * If a group is already running at full capacity or idle,
3495 * don't include that group in power savings calculations 3495 * don't include that group in power savings calculations
3496 */ 3496 */
3497 if (!sds->power_savings_balance || 3497 if (!sds->power_savings_balance ||
3498 sgs->sum_nr_running >= sgs->group_capacity || 3498 sgs->sum_nr_running >= sgs->group_capacity ||
3499 !sgs->sum_nr_running) 3499 !sgs->sum_nr_running)
3500 return; 3500 return;
3501 3501
3502 /* 3502 /*
3503 * Calculate the group which has the least non-idle load. 3503 * Calculate the group which has the least non-idle load.
3504 * This is the group from which we need to pick up the load 3504 * This is the group from which we need to pick up the load
3505 * for saving power 3505 * for saving power
3506 */ 3506 */
3507 if ((sgs->sum_nr_running < sds->min_nr_running) || 3507 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3508 (sgs->sum_nr_running == sds->min_nr_running && 3508 (sgs->sum_nr_running == sds->min_nr_running &&
3509 group_first_cpu(group) > group_first_cpu(sds->group_min))) { 3509 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3510 sds->group_min = group; 3510 sds->group_min = group;
3511 sds->min_nr_running = sgs->sum_nr_running; 3511 sds->min_nr_running = sgs->sum_nr_running;
3512 sds->min_load_per_task = sgs->sum_weighted_load / 3512 sds->min_load_per_task = sgs->sum_weighted_load /
3513 sgs->sum_nr_running; 3513 sgs->sum_nr_running;
3514 } 3514 }
3515 3515
3516 /* 3516 /*
3517 * Calculate the group which is nearly at its 3517 * Calculate the group which is nearly at its
3518 * capacity but still has some room to pick up load 3518 * capacity but still has some room to pick up load
3519 * from another group and save more power 3519 * from another group and save more power
3520 */ 3520 */
3521 if (sgs->sum_nr_running + 1 > sgs->group_capacity) 3521 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3522 return; 3522 return;
3523 3523
3524 if (sgs->sum_nr_running > sds->leader_nr_running || 3524 if (sgs->sum_nr_running > sds->leader_nr_running ||
3525 (sgs->sum_nr_running == sds->leader_nr_running && 3525 (sgs->sum_nr_running == sds->leader_nr_running &&
3526 group_first_cpu(group) < group_first_cpu(sds->group_leader))) { 3526 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3527 sds->group_leader = group; 3527 sds->group_leader = group;
3528 sds->leader_nr_running = sgs->sum_nr_running; 3528 sds->leader_nr_running = sgs->sum_nr_running;
3529 } 3529 }
3530 } 3530 }
3531 3531
3532 /** 3532 /**
3533 * check_power_save_busiest_group - see if there is potential for some power-savings balance 3533 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3534 * @sds: Variable containing the statistics of the sched_domain 3534 * @sds: Variable containing the statistics of the sched_domain
3535 * under consideration. 3535 * under consideration.
3536 * @this_cpu: Cpu at which we're currently performing load-balancing. 3536 * @this_cpu: Cpu at which we're currently performing load-balancing.
3537 * @imbalance: Variable to store the imbalance. 3537 * @imbalance: Variable to store the imbalance.
3538 * 3538 *
3539 * Description: 3539 * Description:
3540 * Check if we have potential to perform some power-savings balance. 3540 * Check if we have potential to perform some power-savings balance.
3541 * If yes, set the busiest group to be the least loaded group in the 3541 * If yes, set the busiest group to be the least loaded group in the
3542 * sched_domain, so that its CPUs can be put to idle. 3542 * sched_domain, so that its CPUs can be put to idle.
3543 * 3543 *
3544 * Returns 1 if there is potential to perform power-savings balance. 3544 * Returns 1 if there is potential to perform power-savings balance.
3545 * Else returns 0. 3545 * Else returns 0.
3546 */ 3546 */
3547 static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, 3547 static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3548 int this_cpu, unsigned long *imbalance) 3548 int this_cpu, unsigned long *imbalance)
3549 { 3549 {
3550 if (!sds->power_savings_balance) 3550 if (!sds->power_savings_balance)
3551 return 0; 3551 return 0;
3552 3552
3553 if (sds->this != sds->group_leader || 3553 if (sds->this != sds->group_leader ||
3554 sds->group_leader == sds->group_min) 3554 sds->group_leader == sds->group_min)
3555 return 0; 3555 return 0;
3556 3556
3557 *imbalance = sds->min_load_per_task; 3557 *imbalance = sds->min_load_per_task;
3558 sds->busiest = sds->group_min; 3558 sds->busiest = sds->group_min;
3559 3559
3560 return 1; 3560 return 1;
3561 3561
3562 } 3562 }
3563 #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3563 #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3564 static inline void init_sd_power_savings_stats(struct sched_domain *sd, 3564 static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3565 struct sd_lb_stats *sds, enum cpu_idle_type idle) 3565 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3566 { 3566 {
3567 return; 3567 return;
3568 } 3568 }
3569 3569
3570 static inline void update_sd_power_savings_stats(struct sched_group *group, 3570 static inline void update_sd_power_savings_stats(struct sched_group *group,
3571 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) 3571 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3572 { 3572 {
3573 return; 3573 return;
3574 } 3574 }
3575 3575
3576 static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, 3576 static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3577 int this_cpu, unsigned long *imbalance) 3577 int this_cpu, unsigned long *imbalance)
3578 { 3578 {
3579 return 0; 3579 return 0;
3580 } 3580 }
3581 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3581 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3582 3582
3583 3583
3584 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 3584 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3585 { 3585 {
3586 return SCHED_LOAD_SCALE; 3586 return SCHED_LOAD_SCALE;
3587 } 3587 }
3588 3588
3589 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) 3589 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3590 { 3590 {
3591 return default_scale_freq_power(sd, cpu); 3591 return default_scale_freq_power(sd, cpu);
3592 } 3592 }
3593 3593
3594 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 3594 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3595 { 3595 {
3596 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 3596 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3597 unsigned long smt_gain = sd->smt_gain; 3597 unsigned long smt_gain = sd->smt_gain;
3598 3598
3599 smt_gain /= weight; 3599 smt_gain /= weight;
3600 3600
3601 return smt_gain; 3601 return smt_gain;
3602 } 3602 }
3603 3603
3604 unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) 3604 unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3605 { 3605 {
3606 return default_scale_smt_power(sd, cpu); 3606 return default_scale_smt_power(sd, cpu);
3607 } 3607 }
3608 3608
3609 unsigned long scale_rt_power(int cpu) 3609 unsigned long scale_rt_power(int cpu)
3610 { 3610 {
3611 struct rq *rq = cpu_rq(cpu); 3611 struct rq *rq = cpu_rq(cpu);
3612 u64 total, available; 3612 u64 total, available;
3613 3613
3614 sched_avg_update(rq); 3614 sched_avg_update(rq);
3615 3615
3616 total = sched_avg_period() + (rq->clock - rq->age_stamp); 3616 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3617 available = total - rq->rt_avg; 3617 available = total - rq->rt_avg;
3618 3618
3619 if (unlikely((s64)total < SCHED_LOAD_SCALE)) 3619 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3620 total = SCHED_LOAD_SCALE; 3620 total = SCHED_LOAD_SCALE;
3621 3621
3622 total >>= SCHED_LOAD_SHIFT; 3622 total >>= SCHED_LOAD_SHIFT;
3623 3623
3624 return div_u64(available, total); 3624 return div_u64(available, total);
3625 } 3625 }
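scale_rt_power() returns the share of the averaging window left over after real-time execution, expressed in SCHED_LOAD_SCALE units: shifting total right by SCHED_LOAD_SHIFT before the divide is what turns available/total into that fixed-point fraction. A worked example (SCHED_LOAD_SHIFT = 10, i.e. SCHED_LOAD_SCALE = 1024, is an assumption, as are the nanosecond figures):

#include <stdio.h>
#include <stdint.h>

#define SCHED_LOAD_SHIFT 10			/* assumed: SCHED_LOAD_SCALE == 1024 */

int main(void)
{
	uint64_t total     = 1000000;		/* sched_avg_period + clock delta, in ns */
	uint64_t rt_avg    =  250000;		/* time consumed by RT tasks */
	uint64_t available = total - rt_avg;

	total >>= SCHED_LOAD_SHIFT;		/* pre-scale the divisor */
	printf("%llu\n", (unsigned long long)(available / total));
	/* prints 768: roughly three quarters of SCHED_LOAD_SCALE remains for CFS */
	return 0;
}

update_cpu_power() below multiplies this fraction into the group's cpu_power, so a CPU busy with RT work looks proportionally weaker to the load balancer.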
3626 3626
3627 static void update_cpu_power(struct sched_domain *sd, int cpu) 3627 static void update_cpu_power(struct sched_domain *sd, int cpu)
3628 { 3628 {
3629 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 3629 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3630 unsigned long power = SCHED_LOAD_SCALE; 3630 unsigned long power = SCHED_LOAD_SCALE;
3631 struct sched_group *sdg = sd->groups; 3631 struct sched_group *sdg = sd->groups;
3632 3632
3633 if (sched_feat(ARCH_POWER)) 3633 if (sched_feat(ARCH_POWER))
3634 power *= arch_scale_freq_power(sd, cpu); 3634 power *= arch_scale_freq_power(sd, cpu);
3635 else 3635 else
3636 power *= default_scale_freq_power(sd, cpu); 3636 power *= default_scale_freq_power(sd, cpu);
3637 3637
3638 power >>= SCHED_LOAD_SHIFT; 3638 power >>= SCHED_LOAD_SHIFT;
3639 3639
3640 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 3640 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3641 if (sched_feat(ARCH_POWER)) 3641 if (sched_feat(ARCH_POWER))
3642 power *= arch_scale_smt_power(sd, cpu); 3642 power *= arch_scale_smt_power(sd, cpu);
3643 else 3643 else
3644 power *= default_scale_smt_power(sd, cpu); 3644 power *= default_scale_smt_power(sd, cpu);
3645 3645
3646 power >>= SCHED_LOAD_SHIFT; 3646 power >>= SCHED_LOAD_SHIFT;
3647 } 3647 }
3648 3648
3649 power *= scale_rt_power(cpu); 3649 power *= scale_rt_power(cpu);
3650 power >>= SCHED_LOAD_SHIFT; 3650 power >>= SCHED_LOAD_SHIFT;
3651 3651
3652 if (!power) 3652 if (!power)
3653 power = 1; 3653 power = 1;
3654 3654
3655 sdg->cpu_power = power; 3655 sdg->cpu_power = power;
3656 } 3656 }
3657 3657
3658 static void update_group_power(struct sched_domain *sd, int cpu) 3658 static void update_group_power(struct sched_domain *sd, int cpu)
3659 { 3659 {
3660 struct sched_domain *child = sd->child; 3660 struct sched_domain *child = sd->child;
3661 struct sched_group *group, *sdg = sd->groups; 3661 struct sched_group *group, *sdg = sd->groups;
3662 unsigned long power; 3662 unsigned long power;
3663 3663
3664 if (!child) { 3664 if (!child) {
3665 update_cpu_power(sd, cpu); 3665 update_cpu_power(sd, cpu);
3666 return; 3666 return;
3667 } 3667 }
3668 3668
3669 power = 0; 3669 power = 0;
3670 3670
3671 group = child->groups; 3671 group = child->groups;
3672 do { 3672 do {
3673 power += group->cpu_power; 3673 power += group->cpu_power;
3674 group = group->next; 3674 group = group->next;
3675 } while (group != child->groups); 3675 } while (group != child->groups);
3676 3676
3677 sdg->cpu_power = power; 3677 sdg->cpu_power = power;
3678 } 3678 }
3679 3679
3680 /** 3680 /**
3681 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3681 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3682 * @sd: The sched_domain whose statistics are to be updated. 3682 * @sd: The sched_domain whose statistics are to be updated.
3683 * @group: sched_group whose statistics are to be updated. 3683 * @group: sched_group whose statistics are to be updated.
3684 * @this_cpu: Cpu for which load balance is currently performed. 3684 * @this_cpu: Cpu for which load balance is currently performed.
3685 * @idle: Idle status of this_cpu 3685 * @idle: Idle status of this_cpu
3686 * @load_idx: Load index of sched_domain of this_cpu for load calc. 3686 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3687 * @sd_idle: Idle status of the sched_domain containing group. 3687 * @sd_idle: Idle status of the sched_domain containing group.
3688 * @local_group: Does group contain this_cpu. 3688 * @local_group: Does group contain this_cpu.
3689 * @cpus: Set of cpus considered for load balancing. 3689 * @cpus: Set of cpus considered for load balancing.
3690 * @balance: Should we balance. 3690 * @balance: Should we balance.
3691 * @sgs: variable to hold the statistics for this group. 3691 * @sgs: variable to hold the statistics for this group.
3692 */ 3692 */
3693 static inline void update_sg_lb_stats(struct sched_domain *sd, 3693 static inline void update_sg_lb_stats(struct sched_domain *sd,
3694 struct sched_group *group, int this_cpu, 3694 struct sched_group *group, int this_cpu,
3695 enum cpu_idle_type idle, int load_idx, int *sd_idle, 3695 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3696 int local_group, const struct cpumask *cpus, 3696 int local_group, const struct cpumask *cpus,
3697 int *balance, struct sg_lb_stats *sgs) 3697 int *balance, struct sg_lb_stats *sgs)
3698 { 3698 {
3699 unsigned long load, max_cpu_load, min_cpu_load; 3699 unsigned long load, max_cpu_load, min_cpu_load;
3700 int i; 3700 int i;
3701 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3701 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3702 unsigned long sum_avg_load_per_task; 3702 unsigned long sum_avg_load_per_task;
3703 unsigned long avg_load_per_task; 3703 unsigned long avg_load_per_task;
3704 3704
3705 if (local_group) { 3705 if (local_group) {
3706 balance_cpu = group_first_cpu(group); 3706 balance_cpu = group_first_cpu(group);
3707 if (balance_cpu == this_cpu) 3707 if (balance_cpu == this_cpu)
3708 update_group_power(sd, this_cpu); 3708 update_group_power(sd, this_cpu);
3709 } 3709 }
3710 3710
3711 /* Tally up the load of all CPUs in the group */ 3711 /* Tally up the load of all CPUs in the group */
3712 sum_avg_load_per_task = avg_load_per_task = 0; 3712 sum_avg_load_per_task = avg_load_per_task = 0;
3713 max_cpu_load = 0; 3713 max_cpu_load = 0;
3714 min_cpu_load = ~0UL; 3714 min_cpu_load = ~0UL;
3715 3715
3716 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3716 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3717 struct rq *rq = cpu_rq(i); 3717 struct rq *rq = cpu_rq(i);
3718 3718
3719 if (*sd_idle && rq->nr_running) 3719 if (*sd_idle && rq->nr_running)
3720 *sd_idle = 0; 3720 *sd_idle = 0;
3721 3721
3722 /* Bias balancing toward cpus of our domain */ 3722 /* Bias balancing toward cpus of our domain */
3723 if (local_group) { 3723 if (local_group) {
3724 if (idle_cpu(i) && !first_idle_cpu) { 3724 if (idle_cpu(i) && !first_idle_cpu) {
3725 first_idle_cpu = 1; 3725 first_idle_cpu = 1;
3726 balance_cpu = i; 3726 balance_cpu = i;
3727 } 3727 }
3728 3728
3729 load = target_load(i, load_idx); 3729 load = target_load(i, load_idx);
3730 } else { 3730 } else {
3731 load = source_load(i, load_idx); 3731 load = source_load(i, load_idx);
3732 if (load > max_cpu_load) 3732 if (load > max_cpu_load)
3733 max_cpu_load = load; 3733 max_cpu_load = load;
3734 if (min_cpu_load > load) 3734 if (min_cpu_load > load)
3735 min_cpu_load = load; 3735 min_cpu_load = load;
3736 } 3736 }
3737 3737
3738 sgs->group_load += load; 3738 sgs->group_load += load;
3739 sgs->sum_nr_running += rq->nr_running; 3739 sgs->sum_nr_running += rq->nr_running;
3740 sgs->sum_weighted_load += weighted_cpuload(i); 3740 sgs->sum_weighted_load += weighted_cpuload(i);
3741 3741
3742 sum_avg_load_per_task += cpu_avg_load_per_task(i); 3742 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3743 } 3743 }
3744 3744
3745 /* 3745 /*
3746 * First idle cpu or the first cpu (busiest) in this sched group 3746 * First idle cpu or the first cpu (busiest) in this sched group
3747 * is eligible for doing load balancing at this and above 3747 * is eligible for doing load balancing at this and above
3748 * domains. In the newly idle case, we will allow all the CPUs 3748 * domains. In the newly idle case, we will allow all the CPUs
3749 * to do the newly idle load balance. 3749 * to do the newly idle load balance.
3750 */ 3750 */
3751 if (idle != CPU_NEWLY_IDLE && local_group && 3751 if (idle != CPU_NEWLY_IDLE && local_group &&
3752 balance_cpu != this_cpu && balance) { 3752 balance_cpu != this_cpu && balance) {
3753 *balance = 0; 3753 *balance = 0;
3754 return; 3754 return;
3755 } 3755 }
3756 3756
3757 /* Adjust by relative CPU power of the group */ 3757 /* Adjust by relative CPU power of the group */
3758 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; 3758 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3759 3759
3760 3760
3761 /* 3761 /*
3762 * Consider the group unbalanced when the imbalance is larger 3762 * Consider the group unbalanced when the imbalance is larger
3763 * than the average weight of two tasks. 3763 * than the average weight of two tasks.
3764 * 3764 *
3765 * APZ: with cgroup the avg task weight can vary wildly and 3765 * APZ: with cgroup the avg task weight can vary wildly and
3766 * might not be a suitable number - should we keep a 3766 * might not be a suitable number - should we keep a
3767 * normalized nr_running number somewhere that negates 3767 * normalized nr_running number somewhere that negates
3768 * the hierarchy? 3768 * the hierarchy?
3769 */ 3769 */
3770 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / 3770 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3771 group->cpu_power; 3771 group->cpu_power;
3772 3772
3773 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3773 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3774 sgs->group_imb = 1; 3774 sgs->group_imb = 1;
3775 3775
3776 sgs->group_capacity = 3776 sgs->group_capacity =
3777 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 3777 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3778 } 3778 }
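Two derived figures matter here: avg_load scales the raw group load by the group's cpu_power so that groups of different strength compare on a common SCHED_LOAD_SCALE basis, and group_imb is set when the spread between the group's most- and least-loaded CPU exceeds twice the average task weight. A worked example with invented numbers (SCHED_LOAD_SCALE = 1024 assumed):

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

int main(void)
{
	/* a two-CPU group with cpu_power 2048: one CPU loaded, one idle */
	unsigned long group_load = 3072, cpu_power = 2048;
	unsigned long max_cpu_load = 3072, min_cpu_load = 0;
	unsigned long sum_avg_load_per_task = 1024;	/* summed per-cpu averages */
	unsigned long avg_load, avg_load_per_task;

	avg_load = group_load * SCHED_LOAD_SCALE / cpu_power;
	avg_load_per_task = sum_avg_load_per_task * SCHED_LOAD_SCALE / cpu_power;

	printf("avg_load=%lu group_imb=%d\n", avg_load,
	       (max_cpu_load - min_cpu_load) > 2 * avg_load_per_task);
	/* avg_load=1536; spread 3072 > 2*512, so the group is flagged imbalanced */
	return 0;
}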
3779 3779
3780 /** 3780 /**
3781 * update_sd_lb_stats - Update sched_group's statistics for load balancing. 3781 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3782 * @sd: sched_domain whose statistics are to be updated. 3782 * @sd: sched_domain whose statistics are to be updated.
3783 * @this_cpu: Cpu for which load balance is currently performed. 3783 * @this_cpu: Cpu for which load balance is currently performed.
3784 * @idle: Idle status of this_cpu 3784 * @idle: Idle status of this_cpu
3785 * @sd_idle: Idle status of the sched_domain containing group. 3785 * @sd_idle: Idle status of the sched_domain containing group.
3786 * @cpus: Set of cpus considered for load balancing. 3786 * @cpus: Set of cpus considered for load balancing.
3787 * @balance: Should we balance. 3787 * @balance: Should we balance.
3788 * @sds: variable to hold the statistics for this sched_domain. 3788 * @sds: variable to hold the statistics for this sched_domain.
3789 */ 3789 */
3790 static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 3790 static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3791 enum cpu_idle_type idle, int *sd_idle, 3791 enum cpu_idle_type idle, int *sd_idle,
3792 const struct cpumask *cpus, int *balance, 3792 const struct cpumask *cpus, int *balance,
3793 struct sd_lb_stats *sds) 3793 struct sd_lb_stats *sds)
3794 { 3794 {
3795 struct sched_domain *child = sd->child; 3795 struct sched_domain *child = sd->child;
3796 struct sched_group *group = sd->groups; 3796 struct sched_group *group = sd->groups;
3797 struct sg_lb_stats sgs; 3797 struct sg_lb_stats sgs;
3798 int load_idx, prefer_sibling = 0; 3798 int load_idx, prefer_sibling = 0;
3799 3799
3800 if (child && child->flags & SD_PREFER_SIBLING) 3800 if (child && child->flags & SD_PREFER_SIBLING)
3801 prefer_sibling = 1; 3801 prefer_sibling = 1;
3802 3802
3803 init_sd_power_savings_stats(sd, sds, idle); 3803 init_sd_power_savings_stats(sd, sds, idle);
3804 load_idx = get_sd_load_idx(sd, idle); 3804 load_idx = get_sd_load_idx(sd, idle);
3805 3805
3806 do { 3806 do {
3807 int local_group; 3807 int local_group;
3808 3808
3809 local_group = cpumask_test_cpu(this_cpu, 3809 local_group = cpumask_test_cpu(this_cpu,
3810 sched_group_cpus(group)); 3810 sched_group_cpus(group));
3811 memset(&sgs, 0, sizeof(sgs)); 3811 memset(&sgs, 0, sizeof(sgs));
3812 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, 3812 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3813 local_group, cpus, balance, &sgs); 3813 local_group, cpus, balance, &sgs);
3814 3814
3815 if (local_group && balance && !(*balance)) 3815 if (local_group && balance && !(*balance))
3816 return; 3816 return;
3817 3817
3818 sds->total_load += sgs.group_load; 3818 sds->total_load += sgs.group_load;
3819 sds->total_pwr += group->cpu_power; 3819 sds->total_pwr += group->cpu_power;
3820 3820
3821 /* 3821 /*
3822 * In case the child domain prefers tasks go to siblings 3822 * In case the child domain prefers tasks go to siblings
3823 * first, lower the group capacity to one so that we'll try 3823 * first, lower the group capacity to one so that we'll try
3824 * and move all the excess tasks away. 3824 * and move all the excess tasks away.
3825 */ 3825 */
3826 if (prefer_sibling) 3826 if (prefer_sibling)
3827 sgs.group_capacity = min(sgs.group_capacity, 1UL); 3827 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3828 3828
3829 if (local_group) { 3829 if (local_group) {
3830 sds->this_load = sgs.avg_load; 3830 sds->this_load = sgs.avg_load;
3831 sds->this = group; 3831 sds->this = group;
3832 sds->this_nr_running = sgs.sum_nr_running; 3832 sds->this_nr_running = sgs.sum_nr_running;
3833 sds->this_load_per_task = sgs.sum_weighted_load; 3833 sds->this_load_per_task = sgs.sum_weighted_load;
3834 } else if (sgs.avg_load > sds->max_load && 3834 } else if (sgs.avg_load > sds->max_load &&
3835 (sgs.sum_nr_running > sgs.group_capacity || 3835 (sgs.sum_nr_running > sgs.group_capacity ||
3836 sgs.group_imb)) { 3836 sgs.group_imb)) {
3837 sds->max_load = sgs.avg_load; 3837 sds->max_load = sgs.avg_load;
3838 sds->busiest = group; 3838 sds->busiest = group;
3839 sds->busiest_nr_running = sgs.sum_nr_running; 3839 sds->busiest_nr_running = sgs.sum_nr_running;
3840 sds->busiest_load_per_task = sgs.sum_weighted_load; 3840 sds->busiest_load_per_task = sgs.sum_weighted_load;
3841 sds->group_imb = sgs.group_imb; 3841 sds->group_imb = sgs.group_imb;
3842 } 3842 }
3843 3843
3844 update_sd_power_savings_stats(group, sds, local_group, &sgs); 3844 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3845 group = group->next; 3845 group = group->next;
3846 } while (group != sd->groups); 3846 } while (group != sd->groups);
3847 } 3847 }
3848 3848
3849 /** 3849 /**
3850 * fix_small_imbalance - Calculate the minor imbalance that exists 3850 * fix_small_imbalance - Calculate the minor imbalance that exists
3851 * amongst the groups of a sched_domain, during 3851 * amongst the groups of a sched_domain, during
3852 * load balancing. 3852 * load balancing.
3853 * @sds: Statistics of the sched_domain whose imbalance is to be calculated. 3853 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3854 * @this_cpu: The cpu at whose sched_domain we're performing load-balance. 3854 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3855 * @imbalance: Variable to store the imbalance. 3855 * @imbalance: Variable to store the imbalance.
3856 */ 3856 */
3857 static inline void fix_small_imbalance(struct sd_lb_stats *sds, 3857 static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3858 int this_cpu, unsigned long *imbalance) 3858 int this_cpu, unsigned long *imbalance)
3859 { 3859 {
3860 unsigned long tmp, pwr_now = 0, pwr_move = 0; 3860 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3861 unsigned int imbn = 2; 3861 unsigned int imbn = 2;
3862 3862
3863 if (sds->this_nr_running) { 3863 if (sds->this_nr_running) {
3864 sds->this_load_per_task /= sds->this_nr_running; 3864 sds->this_load_per_task /= sds->this_nr_running;
3865 if (sds->busiest_load_per_task > 3865 if (sds->busiest_load_per_task >
3866 sds->this_load_per_task) 3866 sds->this_load_per_task)
3867 imbn = 1; 3867 imbn = 1;
3868 } else 3868 } else
3869 sds->this_load_per_task = 3869 sds->this_load_per_task =
3870 cpu_avg_load_per_task(this_cpu); 3870 cpu_avg_load_per_task(this_cpu);
3871 3871
3872 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >= 3872 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3873 sds->busiest_load_per_task * imbn) { 3873 sds->busiest_load_per_task * imbn) {
3874 *imbalance = sds->busiest_load_per_task; 3874 *imbalance = sds->busiest_load_per_task;
3875 return; 3875 return;
3876 } 3876 }
3877 3877
3878 /* 3878 /*
3879 * OK, we don't have enough imbalance to justify moving tasks, 3879 * OK, we don't have enough imbalance to justify moving tasks,
3880 * however we may be able to increase total CPU power used by 3880 * however we may be able to increase total CPU power used by
3881 * moving them. 3881 * moving them.
3882 */ 3882 */
3883 3883
3884 pwr_now += sds->busiest->cpu_power * 3884 pwr_now += sds->busiest->cpu_power *
3885 min(sds->busiest_load_per_task, sds->max_load); 3885 min(sds->busiest_load_per_task, sds->max_load);
3886 pwr_now += sds->this->cpu_power * 3886 pwr_now += sds->this->cpu_power *
3887 min(sds->this_load_per_task, sds->this_load); 3887 min(sds->this_load_per_task, sds->this_load);
3888 pwr_now /= SCHED_LOAD_SCALE; 3888 pwr_now /= SCHED_LOAD_SCALE;
3889 3889
3890 /* Amount of load we'd subtract */ 3890 /* Amount of load we'd subtract */
3891 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / 3891 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3892 sds->busiest->cpu_power; 3892 sds->busiest->cpu_power;
3893 if (sds->max_load > tmp) 3893 if (sds->max_load > tmp)
3894 pwr_move += sds->busiest->cpu_power * 3894 pwr_move += sds->busiest->cpu_power *
3895 min(sds->busiest_load_per_task, sds->max_load - tmp); 3895 min(sds->busiest_load_per_task, sds->max_load - tmp);
3896 3896
3897 /* Amount of load we'd add */ 3897 /* Amount of load we'd add */
3898 if (sds->max_load * sds->busiest->cpu_power < 3898 if (sds->max_load * sds->busiest->cpu_power <
3899 sds->busiest_load_per_task * SCHED_LOAD_SCALE) 3899 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3900 tmp = (sds->max_load * sds->busiest->cpu_power) / 3900 tmp = (sds->max_load * sds->busiest->cpu_power) /
3901 sds->this->cpu_power; 3901 sds->this->cpu_power;
3902 else 3902 else
3903 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / 3903 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3904 sds->this->cpu_power; 3904 sds->this->cpu_power;
3905 pwr_move += sds->this->cpu_power * 3905 pwr_move += sds->this->cpu_power *
3906 min(sds->this_load_per_task, sds->this_load + tmp); 3906 min(sds->this_load_per_task, sds->this_load + tmp);
3907 pwr_move /= SCHED_LOAD_SCALE; 3907 pwr_move /= SCHED_LOAD_SCALE;
3908 3908
3909 /* Move if we gain throughput */ 3909 /* Move if we gain throughput */
3910 if (pwr_move > pwr_now) 3910 if (pwr_move > pwr_now)
3911 *imbalance = sds->busiest_load_per_task; 3911 *imbalance = sds->busiest_load_per_task;
3912 } 3912 }
3913 3913
3914 /** 3914 /**
3915 * calculate_imbalance - Calculate the amount of imbalance present within the 3915 * calculate_imbalance - Calculate the amount of imbalance present within the
3916 * groups of a given sched_domain during load balance. 3916 * groups of a given sched_domain during load balance.
3917 * @sds: statistics of the sched_domain whose imbalance is to be calculated. 3917 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3918 * @this_cpu: Cpu for which currently load balance is being performed. 3918 * @this_cpu: Cpu for which currently load balance is being performed.
3919 * @imbalance: The variable to store the imbalance. 3919 * @imbalance: The variable to store the imbalance.
3920 */ 3920 */
3921 static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, 3921 static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3922 unsigned long *imbalance) 3922 unsigned long *imbalance)
3923 { 3923 {
3924 unsigned long max_pull; 3924 unsigned long max_pull;
3925 /* 3925 /*
3926 * In the presence of smp nice balancing, certain scenarios can have 3926 * In the presence of smp nice balancing, certain scenarios can have
3927 * max load less than avg load (as we skip the groups at or below 3927 * max load less than avg load (as we skip the groups at or below
3928 * their cpu_power, while calculating max_load..) 3928 * their cpu_power, while calculating max_load..)
3929 */ 3929 */
3930 if (sds->max_load < sds->avg_load) { 3930 if (sds->max_load < sds->avg_load) {
3931 *imbalance = 0; 3931 *imbalance = 0;
3932 return fix_small_imbalance(sds, this_cpu, imbalance); 3932 return fix_small_imbalance(sds, this_cpu, imbalance);
3933 } 3933 }
3934 3934
3935 /* Don't want to pull so many tasks that a group would go idle */ 3935 /* Don't want to pull so many tasks that a group would go idle */
3936 max_pull = min(sds->max_load - sds->avg_load, 3936 max_pull = min(sds->max_load - sds->avg_load,
3937 sds->max_load - sds->busiest_load_per_task); 3937 sds->max_load - sds->busiest_load_per_task);
3938 3938
3939 /* How much load to actually move to equalise the imbalance */ 3939 /* How much load to actually move to equalise the imbalance */
3940 *imbalance = min(max_pull * sds->busiest->cpu_power, 3940 *imbalance = min(max_pull * sds->busiest->cpu_power,
3941 (sds->avg_load - sds->this_load) * sds->this->cpu_power) 3941 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3942 / SCHED_LOAD_SCALE; 3942 / SCHED_LOAD_SCALE;
3943 3943
3944 /* 3944 /*
3945 * if *imbalance is less than the average load per runnable task 3945 * if *imbalance is less than the average load per runnable task
3946 * there is no guarantee that any tasks will be moved so we'll have 3946 * there is no guarantee that any tasks will be moved so we'll have
3947 * a think about bumping its value to force at least one task to be 3947 * a think about bumping its value to force at least one task to be
3948 * moved 3948 * moved
3949 */ 3949 */
3950 if (*imbalance < sds->busiest_load_per_task) 3950 if (*imbalance < sds->busiest_load_per_task)
3951 return fix_small_imbalance(sds, this_cpu, imbalance); 3951 return fix_small_imbalance(sds, this_cpu, imbalance);
3952 3952
3953 } 3953 }
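To make the arithmetic above concrete, here is a minimal standalone sketch of the max_pull/imbalance computation. It is plain userspace C, not kernel code; the group loads and cpu_power values are invented sample numbers, and SCHED_LOAD_SCALE is taken here as 1024.

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* hypothetical sched_domain statistics (sample numbers only) */
        unsigned long max_load  = 3 * SCHED_LOAD_SCALE;  /* busiest group */
        unsigned long this_load = 1 * SCHED_LOAD_SCALE;  /* local group */
        unsigned long avg_load  = 2 * SCHED_LOAD_SCALE;
        unsigned long busiest_load_per_task = SCHED_LOAD_SCALE;
        unsigned long busiest_power = 1024, this_power = 1024;

        /* don't pull so much that the busiest group would go idle */
        unsigned long max_pull = min_ul(max_load - avg_load,
                                        max_load - busiest_load_per_task);

        /* weighted load to move towards the average */
        unsigned long imbalance = min_ul(max_pull * busiest_power,
                                         (avg_load - this_load) * this_power)
                                        / SCHED_LOAD_SCALE;

        printf("max_pull=%lu imbalance=%lu\n", max_pull, imbalance);
        return 0;
}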
3954 /******* find_busiest_group() helpers end here *********************/ 3954 /******* find_busiest_group() helpers end here *********************/
3955 3955
3956 /** 3956 /**
3957 * find_busiest_group - Returns the busiest group within the sched_domain 3957 * find_busiest_group - Returns the busiest group within the sched_domain
3958 * if there is an imbalance. If there isn't an imbalance, and 3958 * if there is an imbalance. If there isn't an imbalance, and
3959 * the user has opted for power-savings, it returns a group whose 3959 * the user has opted for power-savings, it returns a group whose
3960 * CPUs can be put to idle by rebalancing those tasks elsewhere, if 3960 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3961 * such a group exists. 3961 * such a group exists.
3962 * 3962 *
3963 * Also calculates the amount of weighted load which should be moved 3963 * Also calculates the amount of weighted load which should be moved
3964 * to restore balance. 3964 * to restore balance.
3965 * 3965 *
3966 * @sd: The sched_domain whose busiest group is to be returned. 3966 * @sd: The sched_domain whose busiest group is to be returned.
3967 * @this_cpu: The cpu for which load balancing is currently being performed. 3967 * @this_cpu: The cpu for which load balancing is currently being performed.
3968 * @imbalance: Variable which stores amount of weighted load which should 3968 * @imbalance: Variable which stores amount of weighted load which should
3969 * be moved to restore balance/put a group to idle. 3969 * be moved to restore balance/put a group to idle.
3970 * @idle: The idle status of this_cpu. 3970 * @idle: The idle status of this_cpu.
3971 * @sd_idle: The idleness of sd 3971 * @sd_idle: The idleness of sd
3972 * @cpus: The set of CPUs under consideration for load-balancing. 3972 * @cpus: The set of CPUs under consideration for load-balancing.
3973 * @balance: Pointer to a variable indicating if this_cpu 3973 * @balance: Pointer to a variable indicating if this_cpu
3974 * is the appropriate cpu to perform load balancing at this level. 3974 * is the appropriate cpu to perform load balancing at this level.
3975 * 3975 *
3976 * Returns: - the busiest group if imbalance exists. 3976 * Returns: - the busiest group if imbalance exists.
3977 * - If no imbalance and user has opted for power-savings balance, 3977 * - If no imbalance and user has opted for power-savings balance,
3978 * return the least loaded group whose CPUs can be 3978 * return the least loaded group whose CPUs can be
3979 * put to idle by rebalancing its tasks onto our group. 3979 * put to idle by rebalancing its tasks onto our group.
3980 */ 3980 */
3981 static struct sched_group * 3981 static struct sched_group *
3982 find_busiest_group(struct sched_domain *sd, int this_cpu, 3982 find_busiest_group(struct sched_domain *sd, int this_cpu,
3983 unsigned long *imbalance, enum cpu_idle_type idle, 3983 unsigned long *imbalance, enum cpu_idle_type idle,
3984 int *sd_idle, const struct cpumask *cpus, int *balance) 3984 int *sd_idle, const struct cpumask *cpus, int *balance)
3985 { 3985 {
3986 struct sd_lb_stats sds; 3986 struct sd_lb_stats sds;
3987 3987
3988 memset(&sds, 0, sizeof(sds)); 3988 memset(&sds, 0, sizeof(sds));
3989 3989
3990 /* 3990 /*
3991 * Compute the various statistics relevant for load balancing at 3991 * Compute the various statistics relevant for load balancing at
3992 * this level. 3992 * this level.
3993 */ 3993 */
3994 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, 3994 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
3995 balance, &sds); 3995 balance, &sds);
3996 3996
3997 /* Cases where imbalance does not exist from POV of this_cpu */ 3997 /* Cases where imbalance does not exist from POV of this_cpu */
3998 /* 1) this_cpu is not the appropriate cpu to perform load balancing 3998 /* 1) this_cpu is not the appropriate cpu to perform load balancing
3999 * at this level. 3999 * at this level.
4000 * 2) There is no busy sibling group to pull from. 4000 * 2) There is no busy sibling group to pull from.
4001 * 3) This group is the busiest group. 4001 * 3) This group is the busiest group.
4002 * 4) This group is busier than the avg busyness at this 4002 * 4) This group is busier than the avg busyness at this
4003 * sched_domain. 4003 * sched_domain.
4004 * 5) The imbalance is within the specified limit. 4004 * 5) The imbalance is within the specified limit.
4005 * 6) Any rebalance would lead to ping-pong 4005 * 6) Any rebalance would lead to ping-pong
4006 */ 4006 */
4007 if (balance && !(*balance)) 4007 if (balance && !(*balance))
4008 goto ret; 4008 goto ret;
4009 4009
4010 if (!sds.busiest || sds.busiest_nr_running == 0) 4010 if (!sds.busiest || sds.busiest_nr_running == 0)
4011 goto out_balanced; 4011 goto out_balanced;
4012 4012
4013 if (sds.this_load >= sds.max_load) 4013 if (sds.this_load >= sds.max_load)
4014 goto out_balanced; 4014 goto out_balanced;
4015 4015
4016 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; 4016 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4017 4017
4018 if (sds.this_load >= sds.avg_load) 4018 if (sds.this_load >= sds.avg_load)
4019 goto out_balanced; 4019 goto out_balanced;
4020 4020
4021 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 4021 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4022 goto out_balanced; 4022 goto out_balanced;
4023 4023
4024 sds.busiest_load_per_task /= sds.busiest_nr_running; 4024 sds.busiest_load_per_task /= sds.busiest_nr_running;
4025 if (sds.group_imb) 4025 if (sds.group_imb)
4026 sds.busiest_load_per_task = 4026 sds.busiest_load_per_task =
4027 min(sds.busiest_load_per_task, sds.avg_load); 4027 min(sds.busiest_load_per_task, sds.avg_load);
4028 4028
4029 /* 4029 /*
4030 * We're trying to get all the cpus to the average_load, so we don't 4030 * We're trying to get all the cpus to the average_load, so we don't
4031 * want to push ourselves above the average load, nor do we wish to 4031 * want to push ourselves above the average load, nor do we wish to
4032 * reduce the max loaded cpu below the average load, as either of these 4032 * reduce the max loaded cpu below the average load, as either of these
4033 * actions would just result in more rebalancing later, and ping-pong 4033 * actions would just result in more rebalancing later, and ping-pong
4034 * tasks around. Thus we look for the minimum possible imbalance. 4034 * tasks around. Thus we look for the minimum possible imbalance.
4035 * Negative imbalances (*we* are more loaded than anyone else) will 4035 * Negative imbalances (*we* are more loaded than anyone else) will
4036 * be counted as no imbalance for these purposes -- we can't fix that 4036 * be counted as no imbalance for these purposes -- we can't fix that
4037 * by pulling tasks to us. Be careful of negative numbers as they'll 4037 * by pulling tasks to us. Be careful of negative numbers as they'll
4038 * appear as very large values with unsigned longs. 4038 * appear as very large values with unsigned longs.
4039 */ 4039 */
4040 if (sds.max_load <= sds.busiest_load_per_task) 4040 if (sds.max_load <= sds.busiest_load_per_task)
4041 goto out_balanced; 4041 goto out_balanced;
4042 4042
4043 /* Looks like there is an imbalance. Compute it */ 4043 /* Looks like there is an imbalance. Compute it */
4044 calculate_imbalance(&sds, this_cpu, imbalance); 4044 calculate_imbalance(&sds, this_cpu, imbalance);
4045 return sds.busiest; 4045 return sds.busiest;
4046 4046
4047 out_balanced: 4047 out_balanced:
4048 /* 4048 /*
4049 * There is no obvious imbalance. But check if we can do some balancing 4049 * There is no obvious imbalance. But check if we can do some balancing
4050 * to save power. 4050 * to save power.
4051 */ 4051 */
4052 if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) 4052 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4053 return sds.busiest; 4053 return sds.busiest;
4054 ret: 4054 ret:
4055 *imbalance = 0; 4055 *imbalance = 0;
4056 return NULL; 4056 return NULL;
4057 } 4057 }
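The "100 * max_load <= imbalance_pct * this_load" cut-off above adds hysteresis by comparing the two group loads as percentages. A tiny illustration in userspace C with invented numbers; a sample imbalance_pct of 125 roughly means "only act if the busiest group is more than 25% busier than us":

#include <stdio.h>

int main(void)
{
        /* sample values: imbalance_pct = 125 means "needs to be >25% busier" */
        unsigned int imbalance_pct = 125;
        unsigned long this_load = 1000, max_load = 1200;

        if (100 * max_load <= (unsigned long)imbalance_pct * this_load)
                printf("within %u%% of us: treated as balanced\n",
                       imbalance_pct - 100);
        else
                printf("busiest group exceeds the threshold: rebalance\n");
        return 0;
}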
4058 4058
4059 /* 4059 /*
4060 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4060 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4061 */ 4061 */
4062 static struct rq * 4062 static struct rq *
4063 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 4063 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4064 unsigned long imbalance, const struct cpumask *cpus) 4064 unsigned long imbalance, const struct cpumask *cpus)
4065 { 4065 {
4066 struct rq *busiest = NULL, *rq; 4066 struct rq *busiest = NULL, *rq;
4067 unsigned long max_load = 0; 4067 unsigned long max_load = 0;
4068 int i; 4068 int i;
4069 4069
4070 for_each_cpu(i, sched_group_cpus(group)) { 4070 for_each_cpu(i, sched_group_cpus(group)) {
4071 unsigned long power = power_of(i); 4071 unsigned long power = power_of(i);
4072 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 4072 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4073 unsigned long wl; 4073 unsigned long wl;
4074 4074
4075 if (!cpumask_test_cpu(i, cpus)) 4075 if (!cpumask_test_cpu(i, cpus))
4076 continue; 4076 continue;
4077 4077
4078 rq = cpu_rq(i); 4078 rq = cpu_rq(i);
4079 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; 4079 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4080 wl /= power; 4080 wl /= power;
4081 4081
4082 if (capacity && rq->nr_running == 1 && wl > imbalance) 4082 if (capacity && rq->nr_running == 1 && wl > imbalance)
4083 continue; 4083 continue;
4084 4084
4085 if (wl > max_load) { 4085 if (wl > max_load) {
4086 max_load = wl; 4086 max_load = wl;
4087 busiest = rq; 4087 busiest = rq;
4088 } 4088 }
4089 } 4089 }
4090 4090
4091 return busiest; 4091 return busiest;
4092 } 4092 }
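A small userspace model of the per-cpu selection in find_busiest_queue(), to show why the raw load is normalised by cpu_power before comparison. All per-cpu numbers below are made up:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

int main(void)
{
        /* invented per-cpu raw load and cpu_power */
        unsigned long load[3]  = { 2048, 1536, 1024 };
        unsigned long power[3] = { 1024,  512, 1024 };
        unsigned long max_wl = 0;
        int busiest = -1, i;

        for (i = 0; i < 3; i++) {
                /* normalise load by cpu_power, as find_busiest_queue() does */
                unsigned long wl = load[i] * SCHED_LOAD_SCALE / power[i];

                if (wl > max_wl) {
                        max_wl = wl;
                        busiest = i;
                }
        }

        /* cpu 1 wins: 1536 of load on half the power beats 2048 at full power */
        printf("busiest cpu %d, normalised load %lu\n", busiest, max_wl);
        return 0;
}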
4093 4093
4094 /* 4094 /*
4095 * Max backoff if we encounter pinned tasks. Pretty arbitrary value; the 4095 * Max backoff if we encounter pinned tasks. Pretty arbitrary value; the
4096 * exact number does not matter so long as it is large enough. 4096 * exact number does not matter so long as it is large enough.
4097 */ 4097 */
4098 #define MAX_PINNED_INTERVAL 512 4098 #define MAX_PINNED_INTERVAL 512
4099 4099
4100 /* Working cpumask for load_balance and load_balance_newidle. */ 4100 /* Working cpumask for load_balance and load_balance_newidle. */
4101 static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4101 static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4102 4102
4103 /* 4103 /*
4104 * Check this_cpu to ensure it is balanced within domain. Attempt to move 4104 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4105 * tasks if there is an imbalance. 4105 * tasks if there is an imbalance.
4106 */ 4106 */
4107 static int load_balance(int this_cpu, struct rq *this_rq, 4107 static int load_balance(int this_cpu, struct rq *this_rq,
4108 struct sched_domain *sd, enum cpu_idle_type idle, 4108 struct sched_domain *sd, enum cpu_idle_type idle,
4109 int *balance) 4109 int *balance)
4110 { 4110 {
4111 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 4111 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4112 struct sched_group *group; 4112 struct sched_group *group;
4113 unsigned long imbalance; 4113 unsigned long imbalance;
4114 struct rq *busiest; 4114 struct rq *busiest;
4115 unsigned long flags; 4115 unsigned long flags;
4116 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4116 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4117 4117
4118 cpumask_copy(cpus, cpu_active_mask); 4118 cpumask_copy(cpus, cpu_active_mask);
4119 4119
4120 /* 4120 /*
4121 * When power savings policy is enabled for the parent domain, idle 4121 * When power savings policy is enabled for the parent domain, idle
4122 * sibling can pick up load irrespective of busy siblings. In this case, 4122 * sibling can pick up load irrespective of busy siblings. In this case,
4123 * let the state of idle sibling percolate up as CPU_IDLE, instead of 4123 * let the state of idle sibling percolate up as CPU_IDLE, instead of
4124 * portraying it as CPU_NOT_IDLE. 4124 * portraying it as CPU_NOT_IDLE.
4125 */ 4125 */
4126 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 4126 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4127 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 4127 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4128 sd_idle = 1; 4128 sd_idle = 1;
4129 4129
4130 schedstat_inc(sd, lb_count[idle]); 4130 schedstat_inc(sd, lb_count[idle]);
4131 4131
4132 redo: 4132 redo:
4133 update_shares(sd); 4133 update_shares(sd);
4134 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 4134 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4135 cpus, balance); 4135 cpus, balance);
4136 4136
4137 if (*balance == 0) 4137 if (*balance == 0)
4138 goto out_balanced; 4138 goto out_balanced;
4139 4139
4140 if (!group) { 4140 if (!group) {
4141 schedstat_inc(sd, lb_nobusyg[idle]); 4141 schedstat_inc(sd, lb_nobusyg[idle]);
4142 goto out_balanced; 4142 goto out_balanced;
4143 } 4143 }
4144 4144
4145 busiest = find_busiest_queue(group, idle, imbalance, cpus); 4145 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4146 if (!busiest) { 4146 if (!busiest) {
4147 schedstat_inc(sd, lb_nobusyq[idle]); 4147 schedstat_inc(sd, lb_nobusyq[idle]);
4148 goto out_balanced; 4148 goto out_balanced;
4149 } 4149 }
4150 4150
4151 BUG_ON(busiest == this_rq); 4151 BUG_ON(busiest == this_rq);
4152 4152
4153 schedstat_add(sd, lb_imbalance[idle], imbalance); 4153 schedstat_add(sd, lb_imbalance[idle], imbalance);
4154 4154
4155 ld_moved = 0; 4155 ld_moved = 0;
4156 if (busiest->nr_running > 1) { 4156 if (busiest->nr_running > 1) {
4157 /* 4157 /*
4158 * Attempt to move tasks. If find_busiest_group has found 4158 * Attempt to move tasks. If find_busiest_group has found
4159 * an imbalance but busiest->nr_running <= 1, the group is 4159 * an imbalance but busiest->nr_running <= 1, the group is
4160 * still unbalanced. ld_moved simply stays zero, so it is 4160 * still unbalanced. ld_moved simply stays zero, so it is
4161 * correctly treated as an imbalance. 4161 * correctly treated as an imbalance.
4162 */ 4162 */
4163 local_irq_save(flags); 4163 local_irq_save(flags);
4164 double_rq_lock(this_rq, busiest); 4164 double_rq_lock(this_rq, busiest);
4165 ld_moved = move_tasks(this_rq, this_cpu, busiest, 4165 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4166 imbalance, sd, idle, &all_pinned); 4166 imbalance, sd, idle, &all_pinned);
4167 double_rq_unlock(this_rq, busiest); 4167 double_rq_unlock(this_rq, busiest);
4168 local_irq_restore(flags); 4168 local_irq_restore(flags);
4169 4169
4170 /* 4170 /*
4171 * some other cpu did the load balance for us. 4171 * some other cpu did the load balance for us.
4172 */ 4172 */
4173 if (ld_moved && this_cpu != smp_processor_id()) 4173 if (ld_moved && this_cpu != smp_processor_id())
4174 resched_cpu(this_cpu); 4174 resched_cpu(this_cpu);
4175 4175
4176 /* All tasks on this runqueue were pinned by CPU affinity */ 4176 /* All tasks on this runqueue were pinned by CPU affinity */
4177 if (unlikely(all_pinned)) { 4177 if (unlikely(all_pinned)) {
4178 cpumask_clear_cpu(cpu_of(busiest), cpus); 4178 cpumask_clear_cpu(cpu_of(busiest), cpus);
4179 if (!cpumask_empty(cpus)) 4179 if (!cpumask_empty(cpus))
4180 goto redo; 4180 goto redo;
4181 goto out_balanced; 4181 goto out_balanced;
4182 } 4182 }
4183 } 4183 }
4184 4184
4185 if (!ld_moved) { 4185 if (!ld_moved) {
4186 schedstat_inc(sd, lb_failed[idle]); 4186 schedstat_inc(sd, lb_failed[idle]);
4187 sd->nr_balance_failed++; 4187 sd->nr_balance_failed++;
4188 4188
4189 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 4189 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4190 4190
4191 spin_lock_irqsave(&busiest->lock, flags); 4191 spin_lock_irqsave(&busiest->lock, flags);
4192 4192
4193 /* don't kick the migration_thread, if the curr 4193 /* don't kick the migration_thread, if the curr
4194 * task on busiest cpu can't be moved to this_cpu 4194 * task on busiest cpu can't be moved to this_cpu
4195 */ 4195 */
4196 if (!cpumask_test_cpu(this_cpu, 4196 if (!cpumask_test_cpu(this_cpu,
4197 &busiest->curr->cpus_allowed)) { 4197 &busiest->curr->cpus_allowed)) {
4198 spin_unlock_irqrestore(&busiest->lock, flags); 4198 spin_unlock_irqrestore(&busiest->lock, flags);
4199 all_pinned = 1; 4199 all_pinned = 1;
4200 goto out_one_pinned; 4200 goto out_one_pinned;
4201 } 4201 }
4202 4202
4203 if (!busiest->active_balance) { 4203 if (!busiest->active_balance) {
4204 busiest->active_balance = 1; 4204 busiest->active_balance = 1;
4205 busiest->push_cpu = this_cpu; 4205 busiest->push_cpu = this_cpu;
4206 active_balance = 1; 4206 active_balance = 1;
4207 } 4207 }
4208 spin_unlock_irqrestore(&busiest->lock, flags); 4208 spin_unlock_irqrestore(&busiest->lock, flags);
4209 if (active_balance) 4209 if (active_balance)
4210 wake_up_process(busiest->migration_thread); 4210 wake_up_process(busiest->migration_thread);
4211 4211
4212 /* 4212 /*
4213 * We've kicked active balancing, reset the failure 4213 * We've kicked active balancing, reset the failure
4214 * counter. 4214 * counter.
4215 */ 4215 */
4216 sd->nr_balance_failed = sd->cache_nice_tries+1; 4216 sd->nr_balance_failed = sd->cache_nice_tries+1;
4217 } 4217 }
4218 } else 4218 } else
4219 sd->nr_balance_failed = 0; 4219 sd->nr_balance_failed = 0;
4220 4220
4221 if (likely(!active_balance)) { 4221 if (likely(!active_balance)) {
4222 /* We were unbalanced, so reset the balancing interval */ 4222 /* We were unbalanced, so reset the balancing interval */
4223 sd->balance_interval = sd->min_interval; 4223 sd->balance_interval = sd->min_interval;
4224 } else { 4224 } else {
4225 /* 4225 /*
4226 * If we've begun active balancing, start to back off. This 4226 * If we've begun active balancing, start to back off. This
4227 * case may not be covered by the all_pinned logic if there 4227 * case may not be covered by the all_pinned logic if there
4228 * is only 1 task on the busy runqueue (because we don't call 4228 * is only 1 task on the busy runqueue (because we don't call
4229 * move_tasks). 4229 * move_tasks).
4230 */ 4230 */
4231 if (sd->balance_interval < sd->max_interval) 4231 if (sd->balance_interval < sd->max_interval)
4232 sd->balance_interval *= 2; 4232 sd->balance_interval *= 2;
4233 } 4233 }
4234 4234
4235 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 4235 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4236 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 4236 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4237 ld_moved = -1; 4237 ld_moved = -1;
4238 4238
4239 goto out; 4239 goto out;
4240 4240
4241 out_balanced: 4241 out_balanced:
4242 schedstat_inc(sd, lb_balanced[idle]); 4242 schedstat_inc(sd, lb_balanced[idle]);
4243 4243
4244 sd->nr_balance_failed = 0; 4244 sd->nr_balance_failed = 0;
4245 4245
4246 out_one_pinned: 4246 out_one_pinned:
4247 /* tune up the balancing interval */ 4247 /* tune up the balancing interval */
4248 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 4248 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4249 (sd->balance_interval < sd->max_interval)) 4249 (sd->balance_interval < sd->max_interval))
4250 sd->balance_interval *= 2; 4250 sd->balance_interval *= 2;
4251 4251
4252 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 4252 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4253 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 4253 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4254 ld_moved = -1; 4254 ld_moved = -1;
4255 else 4255 else
4256 ld_moved = 0; 4256 ld_moved = 0;
4257 out: 4257 out:
4258 if (ld_moved) 4258 if (ld_moved)
4259 update_shares(sd); 4259 update_shares(sd);
4260 return ld_moved; 4260 return ld_moved;
4261 } 4261 }
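The balance_interval handling above is a simple exponential backoff: reset to min_interval after a successful pull, double (capped by max_interval, or MAX_PINNED_INTERVAL when everything is pinned) after a failure. A sketch of that behaviour in plain userspace C, with made-up interval values:

#include <stdio.h>

int main(void)
{
        /* invented per-domain values, in ms */
        unsigned long min_interval = 8, max_interval = 128;
        unsigned long interval = min_interval;
        int round;

        for (round = 0; round < 6; round++) {
                int moved_tasks = 0;    /* pretend every attempt fails */

                if (moved_tasks)
                        interval = min_interval;        /* success: retry soon */
                else if (interval < max_interval)
                        interval *= 2;                  /* failure: back off */

                printf("round %d: next balance in %lu ms\n", round, interval);
        }
        return 0;
}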
4262 4262
4263 /* 4263 /*
4264 * Check this_cpu to ensure it is balanced within domain. Attempt to move 4264 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4265 * tasks if there is an imbalance. 4265 * tasks if there is an imbalance.
4266 * 4266 *
4267 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). 4267 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4268 * this_rq is locked. 4268 * this_rq is locked.
4269 */ 4269 */
4270 static int 4270 static int
4271 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) 4271 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4272 { 4272 {
4273 struct sched_group *group; 4273 struct sched_group *group;
4274 struct rq *busiest = NULL; 4274 struct rq *busiest = NULL;
4275 unsigned long imbalance; 4275 unsigned long imbalance;
4276 int ld_moved = 0; 4276 int ld_moved = 0;
4277 int sd_idle = 0; 4277 int sd_idle = 0;
4278 int all_pinned = 0; 4278 int all_pinned = 0;
4279 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4279 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4280 4280
4281 cpumask_copy(cpus, cpu_active_mask); 4281 cpumask_copy(cpus, cpu_active_mask);
4282 4282
4283 /* 4283 /*
4284 * When power savings policy is enabled for the parent domain, idle 4284 * When power savings policy is enabled for the parent domain, idle
4285 * sibling can pick up load irrespective of busy siblings. In this case, 4285 * sibling can pick up load irrespective of busy siblings. In this case,
4286 * let the state of idle sibling percolate up as IDLE, instead of 4286 * let the state of idle sibling percolate up as IDLE, instead of
4287 * portraying it as CPU_NOT_IDLE. 4287 * portraying it as CPU_NOT_IDLE.
4288 */ 4288 */
4289 if (sd->flags & SD_SHARE_CPUPOWER && 4289 if (sd->flags & SD_SHARE_CPUPOWER &&
4290 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 4290 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4291 sd_idle = 1; 4291 sd_idle = 1;
4292 4292
4293 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); 4293 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4294 redo: 4294 redo:
4295 update_shares_locked(this_rq, sd); 4295 update_shares_locked(this_rq, sd);
4296 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 4296 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4297 &sd_idle, cpus, NULL); 4297 &sd_idle, cpus, NULL);
4298 if (!group) { 4298 if (!group) {
4299 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); 4299 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4300 goto out_balanced; 4300 goto out_balanced;
4301 } 4301 }
4302 4302
4303 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); 4303 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4304 if (!busiest) { 4304 if (!busiest) {
4305 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); 4305 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4306 goto out_balanced; 4306 goto out_balanced;
4307 } 4307 }
4308 4308
4309 BUG_ON(busiest == this_rq); 4309 BUG_ON(busiest == this_rq);
4310 4310
4311 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); 4311 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4312 4312
4313 ld_moved = 0; 4313 ld_moved = 0;
4314 if (busiest->nr_running > 1) { 4314 if (busiest->nr_running > 1) {
4315 /* Attempt to move tasks */ 4315 /* Attempt to move tasks */
4316 double_lock_balance(this_rq, busiest); 4316 double_lock_balance(this_rq, busiest);
4317 /* this_rq->clock is already updated */ 4317 /* this_rq->clock is already updated */
4318 update_rq_clock(busiest); 4318 update_rq_clock(busiest);
4319 ld_moved = move_tasks(this_rq, this_cpu, busiest, 4319 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4320 imbalance, sd, CPU_NEWLY_IDLE, 4320 imbalance, sd, CPU_NEWLY_IDLE,
4321 &all_pinned); 4321 &all_pinned);
4322 double_unlock_balance(this_rq, busiest); 4322 double_unlock_balance(this_rq, busiest);
4323 4323
4324 if (unlikely(all_pinned)) { 4324 if (unlikely(all_pinned)) {
4325 cpumask_clear_cpu(cpu_of(busiest), cpus); 4325 cpumask_clear_cpu(cpu_of(busiest), cpus);
4326 if (!cpumask_empty(cpus)) 4326 if (!cpumask_empty(cpus))
4327 goto redo; 4327 goto redo;
4328 } 4328 }
4329 } 4329 }
4330 4330
4331 if (!ld_moved) { 4331 if (!ld_moved) {
4332 int active_balance = 0; 4332 int active_balance = 0;
4333 4333
4334 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); 4334 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4335 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 4335 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4336 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 4336 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4337 return -1; 4337 return -1;
4338 4338
4339 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) 4339 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4340 return -1; 4340 return -1;
4341 4341
4342 if (sd->nr_balance_failed++ < 2) 4342 if (sd->nr_balance_failed++ < 2)
4343 return -1; 4343 return -1;
4344 4344
4345 /* 4345 /*
4346 * The only task running in a non-idle cpu can be moved to this 4346 * The only task running in a non-idle cpu can be moved to this
4347 * cpu in an attempt to completely free up the other CPU 4347 * cpu in an attempt to completely free up the other CPU
4348 * package. The same method used to move tasks in load_balance() 4348 * package. The same method used to move tasks in load_balance()
4349 * has been extended for load_balance_newidle() to speed up 4349 * has been extended for load_balance_newidle() to speed up
4350 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2) 4350 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
4351 * 4351 *
4352 * The package power saving logic comes from 4352 * The package power saving logic comes from
4353 * find_busiest_group(). If there is no imbalance, then 4353 * find_busiest_group(). If there is no imbalance, then
4354 * f_b_g() will return NULL. However when sched_mc={1,2} then 4354 * f_b_g() will return NULL. However when sched_mc={1,2} then
4355 * f_b_g() will select a group from which a running task may be 4355 * f_b_g() will select a group from which a running task may be
4356 * pulled to this cpu in order to make the other package idle. 4356 * pulled to this cpu in order to make the other package idle.
4357 * If there is no opportunity to make a package idle and if 4357 * If there is no opportunity to make a package idle and if
4358 * there is no imbalance, then f_b_g() will return NULL and no 4358 * there is no imbalance, then f_b_g() will return NULL and no
4359 * action will be taken in load_balance_newidle(). 4359 * action will be taken in load_balance_newidle().
4360 * 4360 *
4361 * Under normal task pull operation due to imbalance, there 4361 * Under normal task pull operation due to imbalance, there
4362 * will be more than one task in the source run queue and 4362 * will be more than one task in the source run queue and
4363 * move_tasks() will succeed. ld_moved will be true and this 4363 * move_tasks() will succeed. ld_moved will be true and this
4364 * active balance code will not be triggered. 4364 * active balance code will not be triggered.
4365 */ 4365 */
4366 4366
4367 /* Lock busiest in correct order while this_rq is held */ 4367 /* Lock busiest in correct order while this_rq is held */
4368 double_lock_balance(this_rq, busiest); 4368 double_lock_balance(this_rq, busiest);
4369 4369
4370 /* 4370 /*
4371 * don't kick the migration_thread, if the curr 4371 * don't kick the migration_thread, if the curr
4372 * task on busiest cpu can't be moved to this_cpu 4372 * task on busiest cpu can't be moved to this_cpu
4373 */ 4373 */
4374 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { 4374 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4375 double_unlock_balance(this_rq, busiest); 4375 double_unlock_balance(this_rq, busiest);
4376 all_pinned = 1; 4376 all_pinned = 1;
4377 return ld_moved; 4377 return ld_moved;
4378 } 4378 }
4379 4379
4380 if (!busiest->active_balance) { 4380 if (!busiest->active_balance) {
4381 busiest->active_balance = 1; 4381 busiest->active_balance = 1;
4382 busiest->push_cpu = this_cpu; 4382 busiest->push_cpu = this_cpu;
4383 active_balance = 1; 4383 active_balance = 1;
4384 } 4384 }
4385 4385
4386 double_unlock_balance(this_rq, busiest); 4386 double_unlock_balance(this_rq, busiest);
4387 /* 4387 /*
4388 * Should not call ttwu while holding a rq->lock 4388 * Should not call ttwu while holding a rq->lock
4389 */ 4389 */
4390 spin_unlock(&this_rq->lock); 4390 spin_unlock(&this_rq->lock);
4391 if (active_balance) 4391 if (active_balance)
4392 wake_up_process(busiest->migration_thread); 4392 wake_up_process(busiest->migration_thread);
4393 spin_lock(&this_rq->lock); 4393 spin_lock(&this_rq->lock);
4394 4394
4395 } else 4395 } else
4396 sd->nr_balance_failed = 0; 4396 sd->nr_balance_failed = 0;
4397 4397
4398 update_shares_locked(this_rq, sd); 4398 update_shares_locked(this_rq, sd);
4399 return ld_moved; 4399 return ld_moved;
4400 4400
4401 out_balanced: 4401 out_balanced:
4402 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); 4402 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4403 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 4403 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4404 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 4404 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4405 return -1; 4405 return -1;
4406 sd->nr_balance_failed = 0; 4406 sd->nr_balance_failed = 0;
4407 4407
4408 return 0; 4408 return 0;
4409 } 4409 }
4410 4410
4411 /* 4411 /*
4412 * idle_balance is called by schedule() if this_cpu is about to become 4412 * idle_balance is called by schedule() if this_cpu is about to become
4413 * idle. Attempts to pull tasks from other CPUs. 4413 * idle. Attempts to pull tasks from other CPUs.
4414 */ 4414 */
4415 static void idle_balance(int this_cpu, struct rq *this_rq) 4415 static void idle_balance(int this_cpu, struct rq *this_rq)
4416 { 4416 {
4417 struct sched_domain *sd; 4417 struct sched_domain *sd;
4418 int pulled_task = 0; 4418 int pulled_task = 0;
4419 unsigned long next_balance = jiffies + HZ; 4419 unsigned long next_balance = jiffies + HZ;
4420 4420
4421 this_rq->idle_stamp = this_rq->clock; 4421 this_rq->idle_stamp = this_rq->clock;
4422 4422
4423 if (this_rq->avg_idle < sysctl_sched_migration_cost) 4423 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4424 return; 4424 return;
4425 4425
4426 for_each_domain(this_cpu, sd) { 4426 for_each_domain(this_cpu, sd) {
4427 unsigned long interval; 4427 unsigned long interval;
4428 4428
4429 if (!(sd->flags & SD_LOAD_BALANCE)) 4429 if (!(sd->flags & SD_LOAD_BALANCE))
4430 continue; 4430 continue;
4431 4431
4432 if (sd->flags & SD_BALANCE_NEWIDLE) 4432 if (sd->flags & SD_BALANCE_NEWIDLE)
4433 /* If we've pulled tasks over stop searching: */ 4433 /* If we've pulled tasks over stop searching: */
4434 pulled_task = load_balance_newidle(this_cpu, this_rq, 4434 pulled_task = load_balance_newidle(this_cpu, this_rq,
4435 sd); 4435 sd);
4436 4436
4437 interval = msecs_to_jiffies(sd->balance_interval); 4437 interval = msecs_to_jiffies(sd->balance_interval);
4438 if (time_after(next_balance, sd->last_balance + interval)) 4438 if (time_after(next_balance, sd->last_balance + interval))
4439 next_balance = sd->last_balance + interval; 4439 next_balance = sd->last_balance + interval;
4440 if (pulled_task) { 4440 if (pulled_task) {
4441 this_rq->idle_stamp = 0; 4441 this_rq->idle_stamp = 0;
4442 break; 4442 break;
4443 } 4443 }
4444 } 4444 }
4445 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 4445 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4446 /* 4446 /*
4447 * We are going idle. next_balance may be set based on 4447 * We are going idle. next_balance may be set based on
4448 * a busy processor. So reset next_balance. 4448 * a busy processor. So reset next_balance.
4449 */ 4449 */
4450 this_rq->next_balance = next_balance; 4450 this_rq->next_balance = next_balance;
4451 } 4451 }
4452 } 4452 }
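The next_balance bookkeeping here and in rebalance_domains() relies on the kernel's wrap-safe jiffies comparisons. A minimal userspace model of the core idea behind time_after() (the real kernel macro also adds type checking; this keeps only the comparison):

#include <stdio.h>

/* core of the kernel's time_after(): true if a is later than b, wrap-safe */
#define time_after(a, b)        ((long)((b) - (a)) < 0)

int main(void)
{
        unsigned long jiffies = (unsigned long)-10;     /* counter about to wrap */
        unsigned long next_balance = jiffies + 100;     /* wraps past zero */

        /* despite the wrap, the deadline is still seen as lying in the future */
        printf("deadline already passed? %d\n",
               time_after(jiffies, next_balance));
        return 0;
}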
4453 4453
4454 /* 4454 /*
4455 * active_load_balance is run by migration threads. It pushes running tasks 4455 * active_load_balance is run by migration threads. It pushes running tasks
4456 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 4456 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4457 * running on each physical CPU where possible, and avoids physical / 4457 * running on each physical CPU where possible, and avoids physical /
4458 * logical imbalances. 4458 * logical imbalances.
4459 * 4459 *
4460 * Called with busiest_rq locked. 4460 * Called with busiest_rq locked.
4461 */ 4461 */
4462 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) 4462 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4463 { 4463 {
4464 int target_cpu = busiest_rq->push_cpu; 4464 int target_cpu = busiest_rq->push_cpu;
4465 struct sched_domain *sd; 4465 struct sched_domain *sd;
4466 struct rq *target_rq; 4466 struct rq *target_rq;
4467 4467
4468 /* Is there any task to move? */ 4468 /* Is there any task to move? */
4469 if (busiest_rq->nr_running <= 1) 4469 if (busiest_rq->nr_running <= 1)
4470 return; 4470 return;
4471 4471
4472 target_rq = cpu_rq(target_cpu); 4472 target_rq = cpu_rq(target_cpu);
4473 4473
4474 /* 4474 /*
4475 * This condition is "impossible", if it occurs 4475 * This condition is "impossible", if it occurs
4476 * we need to fix it. Originally reported by 4476 * we need to fix it. Originally reported by
4477 * Bjorn Helgaas on a 128-cpu setup. 4477 * Bjorn Helgaas on a 128-cpu setup.
4478 */ 4478 */
4479 BUG_ON(busiest_rq == target_rq); 4479 BUG_ON(busiest_rq == target_rq);
4480 4480
4481 /* move a task from busiest_rq to target_rq */ 4481 /* move a task from busiest_rq to target_rq */
4482 double_lock_balance(busiest_rq, target_rq); 4482 double_lock_balance(busiest_rq, target_rq);
4483 update_rq_clock(busiest_rq); 4483 update_rq_clock(busiest_rq);
4484 update_rq_clock(target_rq); 4484 update_rq_clock(target_rq);
4485 4485
4486 /* Search for an sd spanning us and the target CPU. */ 4486 /* Search for an sd spanning us and the target CPU. */
4487 for_each_domain(target_cpu, sd) { 4487 for_each_domain(target_cpu, sd) {
4488 if ((sd->flags & SD_LOAD_BALANCE) && 4488 if ((sd->flags & SD_LOAD_BALANCE) &&
4489 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) 4489 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4490 break; 4490 break;
4491 } 4491 }
4492 4492
4493 if (likely(sd)) { 4493 if (likely(sd)) {
4494 schedstat_inc(sd, alb_count); 4494 schedstat_inc(sd, alb_count);
4495 4495
4496 if (move_one_task(target_rq, target_cpu, busiest_rq, 4496 if (move_one_task(target_rq, target_cpu, busiest_rq,
4497 sd, CPU_IDLE)) 4497 sd, CPU_IDLE))
4498 schedstat_inc(sd, alb_pushed); 4498 schedstat_inc(sd, alb_pushed);
4499 else 4499 else
4500 schedstat_inc(sd, alb_failed); 4500 schedstat_inc(sd, alb_failed);
4501 } 4501 }
4502 double_unlock_balance(busiest_rq, target_rq); 4502 double_unlock_balance(busiest_rq, target_rq);
4503 } 4503 }
4504 4504
4505 #ifdef CONFIG_NO_HZ 4505 #ifdef CONFIG_NO_HZ
4506 static struct { 4506 static struct {
4507 atomic_t load_balancer; 4507 atomic_t load_balancer;
4508 cpumask_var_t cpu_mask; 4508 cpumask_var_t cpu_mask;
4509 cpumask_var_t ilb_grp_nohz_mask; 4509 cpumask_var_t ilb_grp_nohz_mask;
4510 } nohz ____cacheline_aligned = { 4510 } nohz ____cacheline_aligned = {
4511 .load_balancer = ATOMIC_INIT(-1), 4511 .load_balancer = ATOMIC_INIT(-1),
4512 }; 4512 };
4513 4513
4514 int get_nohz_load_balancer(void) 4514 int get_nohz_load_balancer(void)
4515 { 4515 {
4516 return atomic_read(&nohz.load_balancer); 4516 return atomic_read(&nohz.load_balancer);
4517 } 4517 }
4518 4518
4519 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 4519 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4520 /** 4520 /**
4521 * lowest_flag_domain - Return lowest sched_domain containing flag. 4521 * lowest_flag_domain - Return lowest sched_domain containing flag.
4522 * @cpu: The cpu whose lowest level of sched domain is to 4522 * @cpu: The cpu whose lowest level of sched domain is to
4523 * be returned. 4523 * be returned.
4524 * @flag: The flag to check for the lowest sched_domain 4524 * @flag: The flag to check for the lowest sched_domain
4525 * for the given cpu. 4525 * for the given cpu.
4526 * 4526 *
4527 * Returns the lowest sched_domain of a cpu which contains the given flag. 4527 * Returns the lowest sched_domain of a cpu which contains the given flag.
4528 */ 4528 */
4529 static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) 4529 static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4530 { 4530 {
4531 struct sched_domain *sd; 4531 struct sched_domain *sd;
4532 4532
4533 for_each_domain(cpu, sd) 4533 for_each_domain(cpu, sd)
4534 if (sd && (sd->flags & flag)) 4534 if (sd && (sd->flags & flag))
4535 break; 4535 break;
4536 4536
4537 return sd; 4537 return sd;
4538 } 4538 }
4539 4539
4540 /** 4540 /**
4541 * for_each_flag_domain - Iterates over sched_domains containing the flag. 4541 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4542 * @cpu: The cpu whose domains we're iterating over. 4542 * @cpu: The cpu whose domains we're iterating over.
4543 * @sd: variable holding the value of the power_savings_sd 4543 * @sd: variable holding the value of the power_savings_sd
4544 * for cpu. 4544 * for cpu.
4545 * @flag: The flag to filter the sched_domains to be iterated. 4545 * @flag: The flag to filter the sched_domains to be iterated.
4546 * 4546 *
4547 * Iterates over all the scheduler domains for a given cpu that has the 'flag' 4547 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4548 * set, starting from the lowest sched_domain to the highest. 4548 * set, starting from the lowest sched_domain to the highest.
4549 */ 4549 */
4550 #define for_each_flag_domain(cpu, sd, flag) \ 4550 #define for_each_flag_domain(cpu, sd, flag) \
4551 for (sd = lowest_flag_domain(cpu, flag); \ 4551 for (sd = lowest_flag_domain(cpu, flag); \
4552 (sd && (sd->flags & flag)); sd = sd->parent) 4552 (sd && (sd->flags & flag)); sd = sd->parent)
4553 4553
4554 /** 4554 /**
4555 * is_semi_idle_group - Checks if the given sched_group is semi-idle. 4555 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4556 * @ilb_group: group to be checked for semi-idleness 4556 * @ilb_group: group to be checked for semi-idleness
4557 * 4557 *
4558 * Returns: 1 if the group is semi-idle. 0 otherwise. 4558 * Returns: 1 if the group is semi-idle. 0 otherwise.
4559 * 4559 *
4560 * We define a sched_group to be semi-idle if it has at least one idle-CPU 4560 * We define a sched_group to be semi-idle if it has at least one idle-CPU
4561 * and at least one non-idle CPU. This helper function checks if the given 4561 * and at least one non-idle CPU. This helper function checks if the given
4562 * sched_group is semi-idle or not. 4562 * sched_group is semi-idle or not.
4563 */ 4563 */
4564 static inline int is_semi_idle_group(struct sched_group *ilb_group) 4564 static inline int is_semi_idle_group(struct sched_group *ilb_group)
4565 { 4565 {
4566 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, 4566 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4567 sched_group_cpus(ilb_group)); 4567 sched_group_cpus(ilb_group));
4568 4568
4569 /* 4569 /*
4570 * A sched_group is semi-idle when it has at least one busy cpu 4570 * A sched_group is semi-idle when it has at least one busy cpu
4571 * and at least one idle cpu. 4571 * and at least one idle cpu.
4572 */ 4572 */
4573 if (cpumask_empty(nohz.ilb_grp_nohz_mask)) 4573 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4574 return 0; 4574 return 0;
4575 4575
4576 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) 4576 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4577 return 0; 4577 return 0;
4578 4578
4579 return 1; 4579 return 1;
4580 } 4580 }
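The test above boils down to set arithmetic: a group is semi-idle iff its intersection with nohz.cpu_mask is neither empty nor the whole group. A bitmask sketch in userspace C (cpu masks modelled as plain unsigned longs, which only works for small machines but keeps the idea visible):

#include <stdio.h>

/* semi-idle iff the group has both an idle (nohz) cpu and a busy cpu */
static int is_semi_idle(unsigned long group, unsigned long nohz)
{
        unsigned long idle_in_group = group & nohz;

        if (!idle_in_group)             /* nobody in the group is idle */
                return 0;
        if (idle_in_group == group)     /* everybody in the group is idle */
                return 0;
        return 1;
}

int main(void)
{
        unsigned long group = 0x0f;     /* cpus 0-3 */

        printf("%d %d %d\n",
               is_semi_idle(group, 0x00),       /* all busy  -> 0 */
               is_semi_idle(group, 0x0f),       /* all idle  -> 0 */
               is_semi_idle(group, 0x03));      /* mixed     -> 1 */
        return 0;
}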
4581 /** 4581 /**
4582 * find_new_ilb - Finds the optimum idle load balancer for nomination. 4582 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4583 * @cpu: The cpu which is nominating a new idle_load_balancer. 4583 * @cpu: The cpu which is nominating a new idle_load_balancer.
4584 * 4584 *
4585 * Returns: Returns the id of the idle load balancer if it exists, 4585 * Returns: Returns the id of the idle load balancer if it exists,
4586 * Else, returns >= nr_cpu_ids. 4586 * Else, returns >= nr_cpu_ids.
4587 * 4587 *
4588 * This algorithm picks the idle load balancer such that it belongs to a 4588 * This algorithm picks the idle load balancer such that it belongs to a
4589 * semi-idle powersavings sched_domain. The idea is to try and avoid 4589 * semi-idle powersavings sched_domain. The idea is to try and avoid
4590 * completely idle packages/cores just for the purpose of idle load balancing 4590 * completely idle packages/cores just for the purpose of idle load balancing
4591 * when there are other idle cpus which are better suited for that job. 4591 * when there are other idle cpus which are better suited for that job.
4592 */ 4592 */
4593 static int find_new_ilb(int cpu) 4593 static int find_new_ilb(int cpu)
4594 { 4594 {
4595 struct sched_domain *sd; 4595 struct sched_domain *sd;
4596 struct sched_group *ilb_group; 4596 struct sched_group *ilb_group;
4597 4597
4598 /* 4598 /*
4599 * Pick the idle load balancer from semi-idle packages only 4599 * Pick the idle load balancer from semi-idle packages only
4600 * when power-aware load balancing is enabled 4600 * when power-aware load balancing is enabled
4601 */ 4601 */
4602 if (!(sched_smt_power_savings || sched_mc_power_savings)) 4602 if (!(sched_smt_power_savings || sched_mc_power_savings))
4603 goto out_done; 4603 goto out_done;
4604 4604
4605 /* 4605 /*
4606 * Optimize for the case when we have no idle CPUs or only one 4606 * Optimize for the case when we have no idle CPUs or only one
4607 * idle CPU. Don't walk the sched_domain hierarchy in such cases 4607 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4608 */ 4608 */
4609 if (cpumask_weight(nohz.cpu_mask) < 2) 4609 if (cpumask_weight(nohz.cpu_mask) < 2)
4610 goto out_done; 4610 goto out_done;
4611 4611
4612 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 4612 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4613 ilb_group = sd->groups; 4613 ilb_group = sd->groups;
4614 4614
4615 do { 4615 do {
4616 if (is_semi_idle_group(ilb_group)) 4616 if (is_semi_idle_group(ilb_group))
4617 return cpumask_first(nohz.ilb_grp_nohz_mask); 4617 return cpumask_first(nohz.ilb_grp_nohz_mask);
4618 4618
4619 ilb_group = ilb_group->next; 4619 ilb_group = ilb_group->next;
4620 4620
4621 } while (ilb_group != sd->groups); 4621 } while (ilb_group != sd->groups);
4622 } 4622 }
4623 4623
4624 out_done: 4624 out_done:
4625 return cpumask_first(nohz.cpu_mask); 4625 return cpumask_first(nohz.cpu_mask);
4626 } 4626 }
4627 #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 4627 #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4628 static inline int find_new_ilb(int call_cpu) 4628 static inline int find_new_ilb(int call_cpu)
4629 { 4629 {
4630 return cpumask_first(nohz.cpu_mask); 4630 return cpumask_first(nohz.cpu_mask);
4631 } 4631 }
4632 #endif 4632 #endif
4633 4633
4634 /* 4634 /*
4635 * This routine will try to nominate the ilb (idle load balancing) 4635 * This routine will try to nominate the ilb (idle load balancing)
4636 * owner among the cpus whose ticks are stopped. ilb owner will do the idle 4636 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4637 * load balancing on behalf of all those cpus. If all the cpus in the system 4637 * load balancing on behalf of all those cpus. If all the cpus in the system
4638 * go into this tickless mode, then there will be no ilb owner (as there is 4638 * go into this tickless mode, then there will be no ilb owner (as there is
4639 * no need for one) and all the cpus will sleep till the next wakeup event 4639 * no need for one) and all the cpus will sleep till the next wakeup event
4640 * arrives... 4640 * arrives...
4641 * 4641 *
4642 * For the ilb owner, tick is not stopped. And this tick will be used 4642 * For the ilb owner, tick is not stopped. And this tick will be used
4643 * for idle load balancing. ilb owner will still be part of 4643 * for idle load balancing. ilb owner will still be part of
4644 * nohz.cpu_mask.. 4644 * nohz.cpu_mask..
4645 * 4645 *
4646 * While stopping the tick, this cpu will become the ilb owner if there 4646 * While stopping the tick, this cpu will become the ilb owner if there
4647 * is no other owner. And will be the owner till that cpu becomes busy 4647 * is no other owner. And will be the owner till that cpu becomes busy
4648 * or if all cpus in the system stop their ticks at which point 4648 * or if all cpus in the system stop their ticks at which point
4649 * there is no need for ilb owner. 4649 * there is no need for ilb owner.
4650 * 4650 *
4651 * When the ilb owner becomes busy, it nominates another owner, during the 4651 * When the ilb owner becomes busy, it nominates another owner, during the
4652 * next busy scheduler_tick() 4652 * next busy scheduler_tick()
4653 */ 4653 */
4654 int select_nohz_load_balancer(int stop_tick) 4654 int select_nohz_load_balancer(int stop_tick)
4655 { 4655 {
4656 int cpu = smp_processor_id(); 4656 int cpu = smp_processor_id();
4657 4657
4658 if (stop_tick) { 4658 if (stop_tick) {
4659 cpu_rq(cpu)->in_nohz_recently = 1; 4659 cpu_rq(cpu)->in_nohz_recently = 1;
4660 4660
4661 if (!cpu_active(cpu)) { 4661 if (!cpu_active(cpu)) {
4662 if (atomic_read(&nohz.load_balancer) != cpu) 4662 if (atomic_read(&nohz.load_balancer) != cpu)
4663 return 0; 4663 return 0;
4664 4664
4665 /* 4665 /*
4666 * If we are going offline and still the leader, 4666 * If we are going offline and still the leader,
4667 * give up! 4667 * give up!
4668 */ 4668 */
4669 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 4669 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4670 BUG(); 4670 BUG();
4671 4671
4672 return 0; 4672 return 0;
4673 } 4673 }
4674 4674
4675 cpumask_set_cpu(cpu, nohz.cpu_mask); 4675 cpumask_set_cpu(cpu, nohz.cpu_mask);
4676 4676
4677 /* time for ilb owner also to sleep */ 4677 /* time for ilb owner also to sleep */
4678 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { 4678 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4679 if (atomic_read(&nohz.load_balancer) == cpu) 4679 if (atomic_read(&nohz.load_balancer) == cpu)
4680 atomic_set(&nohz.load_balancer, -1); 4680 atomic_set(&nohz.load_balancer, -1);
4681 return 0; 4681 return 0;
4682 } 4682 }
4683 4683
4684 if (atomic_read(&nohz.load_balancer) == -1) { 4684 if (atomic_read(&nohz.load_balancer) == -1) {
4685 /* make me the ilb owner */ 4685 /* make me the ilb owner */
4686 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) 4686 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4687 return 1; 4687 return 1;
4688 } else if (atomic_read(&nohz.load_balancer) == cpu) { 4688 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4689 int new_ilb; 4689 int new_ilb;
4690 4690
4691 if (!(sched_smt_power_savings || 4691 if (!(sched_smt_power_savings ||
4692 sched_mc_power_savings)) 4692 sched_mc_power_savings))
4693 return 1; 4693 return 1;
4694 /* 4694 /*
4695 * Check to see if there is a more power-efficient 4695 * Check to see if there is a more power-efficient
4696 * ilb. 4696 * ilb.
4697 */ 4697 */
4698 new_ilb = find_new_ilb(cpu); 4698 new_ilb = find_new_ilb(cpu);
4699 if (new_ilb < nr_cpu_ids && new_ilb != cpu) { 4699 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4700 atomic_set(&nohz.load_balancer, -1); 4700 atomic_set(&nohz.load_balancer, -1);
4701 resched_cpu(new_ilb); 4701 resched_cpu(new_ilb);
4702 return 0; 4702 return 0;
4703 } 4703 }
4704 return 1; 4704 return 1;
4705 } 4705 }
4706 } else { 4706 } else {
4707 if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) 4707 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4708 return 0; 4708 return 0;
4709 4709
4710 cpumask_clear_cpu(cpu, nohz.cpu_mask); 4710 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4711 4711
4712 if (atomic_read(&nohz.load_balancer) == cpu) 4712 if (atomic_read(&nohz.load_balancer) == cpu)
4713 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 4713 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4714 BUG(); 4714 BUG();
4715 } 4715 }
4716 return 0; 4716 return 0;
4717 } 4717 }
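The ownership handoff above hinges on a single atomic compare-and-swap of nohz.load_balancer. A minimal userspace model of the "claim the slot if nobody owns it" step, using C11 atomics in place of the kernel's atomic_cmpxchg() (cpu ids are sample values):

#include <stdio.h>
#include <stdatomic.h>

int main(void)
{
        atomic_int load_balancer = -1;  /* -1 means "no ilb owner" */
        int me = 3, expected = -1;

        /* become owner only if the slot still held -1 */
        if (atomic_compare_exchange_strong(&load_balancer, &expected, me))
                printf("cpu %d is now the idle load balancer\n", me);

        /* a later claim by cpu 5 fails: the slot already holds 3 */
        expected = -1;
        if (!atomic_compare_exchange_strong(&load_balancer, &expected, 5))
                printf("cpu 5 lost the race, owner is %d\n", expected);
        return 0;
}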
4718 #endif 4718 #endif
4719 4719
4720 static DEFINE_SPINLOCK(balancing); 4720 static DEFINE_SPINLOCK(balancing);
4721 4721
4722 /* 4722 /*
4723 * It checks each scheduling domain to see if it is due to be balanced, 4723 * It checks each scheduling domain to see if it is due to be balanced,
4724 * and initiates a balancing operation if so. 4724 * and initiates a balancing operation if so.
4725 * 4725 *
4726 * Balancing parameters are set up in arch_init_sched_domains. 4726 * Balancing parameters are set up in arch_init_sched_domains.
4727 */ 4727 */
4728 static void rebalance_domains(int cpu, enum cpu_idle_type idle) 4728 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4729 { 4729 {
4730 int balance = 1; 4730 int balance = 1;
4731 struct rq *rq = cpu_rq(cpu); 4731 struct rq *rq = cpu_rq(cpu);
4732 unsigned long interval; 4732 unsigned long interval;
4733 struct sched_domain *sd; 4733 struct sched_domain *sd;
4734 /* Earliest time when we have to do rebalance again */ 4734 /* Earliest time when we have to do rebalance again */
4735 unsigned long next_balance = jiffies + 60*HZ; 4735 unsigned long next_balance = jiffies + 60*HZ;
4736 int update_next_balance = 0; 4736 int update_next_balance = 0;
4737 int need_serialize; 4737 int need_serialize;
4738 4738
4739 for_each_domain(cpu, sd) { 4739 for_each_domain(cpu, sd) {
4740 if (!(sd->flags & SD_LOAD_BALANCE)) 4740 if (!(sd->flags & SD_LOAD_BALANCE))
4741 continue; 4741 continue;
4742 4742
4743 interval = sd->balance_interval; 4743 interval = sd->balance_interval;
4744 if (idle != CPU_IDLE) 4744 if (idle != CPU_IDLE)
4745 interval *= sd->busy_factor; 4745 interval *= sd->busy_factor;
4746 4746
4747 /* scale ms to jiffies */ 4747 /* scale ms to jiffies */
4748 interval = msecs_to_jiffies(interval); 4748 interval = msecs_to_jiffies(interval);
4749 if (unlikely(!interval)) 4749 if (unlikely(!interval))
4750 interval = 1; 4750 interval = 1;
4751 if (interval > HZ*NR_CPUS/10) 4751 if (interval > HZ*NR_CPUS/10)
4752 interval = HZ*NR_CPUS/10; 4752 interval = HZ*NR_CPUS/10;
4753 4753
4754 need_serialize = sd->flags & SD_SERIALIZE; 4754 need_serialize = sd->flags & SD_SERIALIZE;
4755 4755
4756 if (need_serialize) { 4756 if (need_serialize) {
4757 if (!spin_trylock(&balancing)) 4757 if (!spin_trylock(&balancing))
4758 goto out; 4758 goto out;
4759 } 4759 }
4760 4760
4761 if (time_after_eq(jiffies, sd->last_balance + interval)) { 4761 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4762 if (load_balance(cpu, rq, sd, idle, &balance)) { 4762 if (load_balance(cpu, rq, sd, idle, &balance)) {
4763 /* 4763 /*
4764 * We've pulled tasks over so either we're no 4764 * We've pulled tasks over so either we're no
4765 * longer idle, or one of our SMT siblings is 4765 * longer idle, or one of our SMT siblings is
4766 * not idle. 4766 * not idle.
4767 */ 4767 */
4768 idle = CPU_NOT_IDLE; 4768 idle = CPU_NOT_IDLE;
4769 } 4769 }
4770 sd->last_balance = jiffies; 4770 sd->last_balance = jiffies;
4771 } 4771 }
4772 if (need_serialize) 4772 if (need_serialize)
4773 spin_unlock(&balancing); 4773 spin_unlock(&balancing);
4774 out: 4774 out:
4775 if (time_after(next_balance, sd->last_balance + interval)) { 4775 if (time_after(next_balance, sd->last_balance + interval)) {
4776 next_balance = sd->last_balance + interval; 4776 next_balance = sd->last_balance + interval;
4777 update_next_balance = 1; 4777 update_next_balance = 1;
4778 } 4778 }
4779 4779
4780 /* 4780 /*
4781 * Stop the load balance at this level. There is another 4781 * Stop the load balance at this level. There is another
4782 * CPU in our sched group which is doing load balancing more 4782 * CPU in our sched group which is doing load balancing more
4783 * actively. 4783 * actively.
4784 */ 4784 */
4785 if (!balance) 4785 if (!balance)
4786 break; 4786 break;
4787 } 4787 }
4788 4788
4789 /* 4789 /*
4790 * next_balance will be updated only when there is a need. 4790 * next_balance will be updated only when there is a need.
4791 * When the cpu is attached to null domain for ex, it will not be 4791 * When the cpu is attached to null domain for ex, it will not be
4792 * updated. 4792 * updated.
4793 */ 4793 */
4794 if (likely(update_next_balance)) 4794 if (likely(update_next_balance))
4795 rq->next_balance = next_balance; 4795 rq->next_balance = next_balance;
4796 } 4796 }
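A worked example of the interval scaling at the top of rebalance_domains(): the per-domain interval is stretched by busy_factor when the cpu is not idle, converted to jiffies, then clamped. HZ, NR_CPUS and the per-domain values below are invented sample numbers, and the ms-to-jiffies conversion is a rough stand-in for the kernel helper:

#include <stdio.h>

#define HZ      250
#define NR_CPUS 8

/* rough ms -> jiffies conversion, good enough for this illustration */
static unsigned long msecs_to_jiffies_approx(unsigned long ms)
{
        return ms * HZ / 1000;
}

int main(void)
{
        unsigned long balance_interval = 64;    /* per-domain value, in ms */
        unsigned int busy_factor = 32;
        int cpu_is_idle = 0;

        unsigned long interval = balance_interval;

        if (!cpu_is_idle)
                interval *= busy_factor;        /* balance less often when busy */

        interval = msecs_to_jiffies_approx(interval);
        if (!interval)
                interval = 1;
        if (interval > HZ * NR_CPUS / 10)
                interval = HZ * NR_CPUS / 10;   /* global cap */

        printf("rebalance this domain every %lu jiffies\n", interval);
        return 0;
}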
4797 4797
4798 /* 4798 /*
4799 * run_rebalance_domains is triggered when needed from the scheduler tick. 4799 * run_rebalance_domains is triggered when needed from the scheduler tick.
4800 * In CONFIG_NO_HZ case, the idle load balance owner will do the 4800 * In CONFIG_NO_HZ case, the idle load balance owner will do the
4801 * rebalancing for all the cpus for which scheduler ticks are stopped. 4801 * rebalancing for all the cpus for which scheduler ticks are stopped.
4802 */ 4802 */
4803 static void run_rebalance_domains(struct softirq_action *h) 4803 static void run_rebalance_domains(struct softirq_action *h)
4804 { 4804 {
4805 int this_cpu = smp_processor_id(); 4805 int this_cpu = smp_processor_id();
4806 struct rq *this_rq = cpu_rq(this_cpu); 4806 struct rq *this_rq = cpu_rq(this_cpu);
4807 enum cpu_idle_type idle = this_rq->idle_at_tick ? 4807 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4808 CPU_IDLE : CPU_NOT_IDLE; 4808 CPU_IDLE : CPU_NOT_IDLE;
4809 4809
4810 rebalance_domains(this_cpu, idle); 4810 rebalance_domains(this_cpu, idle);
4811 4811
4812 #ifdef CONFIG_NO_HZ 4812 #ifdef CONFIG_NO_HZ
4813 /* 4813 /*
4814 * If this cpu is the owner for idle load balancing, then do the 4814 * If this cpu is the owner for idle load balancing, then do the
4815 * balancing on behalf of the other idle cpus whose ticks are 4815 * balancing on behalf of the other idle cpus whose ticks are
4816 * stopped. 4816 * stopped.
4817 */ 4817 */
4818 if (this_rq->idle_at_tick && 4818 if (this_rq->idle_at_tick &&
4819 atomic_read(&nohz.load_balancer) == this_cpu) { 4819 atomic_read(&nohz.load_balancer) == this_cpu) {
4820 struct rq *rq; 4820 struct rq *rq;
4821 int balance_cpu; 4821 int balance_cpu;
4822 4822
4823 for_each_cpu(balance_cpu, nohz.cpu_mask) { 4823 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4824 if (balance_cpu == this_cpu) 4824 if (balance_cpu == this_cpu)
4825 continue; 4825 continue;
4826 4826
4827 /* 4827 /*
4828 * If this cpu gets work to do, stop the load balancing 4828 * If this cpu gets work to do, stop the load balancing
4829 * work being done for other cpus. Next load 4829 * work being done for other cpus. Next load
4830 * balancing owner will pick it up. 4830 * balancing owner will pick it up.
4831 */ 4831 */
4832 if (need_resched()) 4832 if (need_resched())
4833 break; 4833 break;
4834 4834
4835 rebalance_domains(balance_cpu, CPU_IDLE); 4835 rebalance_domains(balance_cpu, CPU_IDLE);
4836 4836
4837 rq = cpu_rq(balance_cpu); 4837 rq = cpu_rq(balance_cpu);
4838 if (time_after(this_rq->next_balance, rq->next_balance)) 4838 if (time_after(this_rq->next_balance, rq->next_balance))
4839 this_rq->next_balance = rq->next_balance; 4839 this_rq->next_balance = rq->next_balance;
4840 } 4840 }
4841 } 4841 }
4842 #endif 4842 #endif
4843 } 4843 }
4844 4844
4845 static inline int on_null_domain(int cpu) 4845 static inline int on_null_domain(int cpu)
4846 { 4846 {
4847 return !rcu_dereference(cpu_rq(cpu)->sd); 4847 return !rcu_dereference(cpu_rq(cpu)->sd);
4848 } 4848 }
4849 4849
4850 /* 4850 /*
4851 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 4851 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4852 * 4852 *
4853 * In case of CONFIG_NO_HZ, this is the place where we nominate a new 4853 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4854 * idle load balancing owner or decide to stop the periodic load balancing, 4854 * idle load balancing owner or decide to stop the periodic load balancing,
4855 * if the whole system is idle. 4855 * if the whole system is idle.
4856 */ 4856 */
4857 static inline void trigger_load_balance(struct rq *rq, int cpu) 4857 static inline void trigger_load_balance(struct rq *rq, int cpu)
4858 { 4858 {
4859 #ifdef CONFIG_NO_HZ 4859 #ifdef CONFIG_NO_HZ
4860 /* 4860 /*
4861 * If we were in the nohz mode recently and busy at the current 4861 * If we were in the nohz mode recently and busy at the current
4862 * scheduler tick, then check if we need to nominate a new idle 4862 * scheduler tick, then check if we need to nominate a new idle
4863 * load balancer. 4863 * load balancer.
4864 */ 4864 */
4865 if (rq->in_nohz_recently && !rq->idle_at_tick) { 4865 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4866 rq->in_nohz_recently = 0; 4866 rq->in_nohz_recently = 0;
4867 4867
4868 if (atomic_read(&nohz.load_balancer) == cpu) { 4868 if (atomic_read(&nohz.load_balancer) == cpu) {
4869 cpumask_clear_cpu(cpu, nohz.cpu_mask); 4869 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4870 atomic_set(&nohz.load_balancer, -1); 4870 atomic_set(&nohz.load_balancer, -1);
4871 } 4871 }
4872 4872
4873 if (atomic_read(&nohz.load_balancer) == -1) { 4873 if (atomic_read(&nohz.load_balancer) == -1) {
4874 int ilb = find_new_ilb(cpu); 4874 int ilb = find_new_ilb(cpu);
4875 4875
4876 if (ilb < nr_cpu_ids) 4876 if (ilb < nr_cpu_ids)
4877 resched_cpu(ilb); 4877 resched_cpu(ilb);
4878 } 4878 }
4879 } 4879 }
4880 4880
4881 /* 4881 /*
4882 * If this cpu is idle and doing idle load balancing for all the 4882 * If this cpu is idle and doing idle load balancing for all the
4883 * cpus with ticks stopped, is it time for that to stop? 4883 * cpus with ticks stopped, is it time for that to stop?
4884 */ 4884 */
4885 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && 4885 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4886 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { 4886 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4887 resched_cpu(cpu); 4887 resched_cpu(cpu);
4888 return; 4888 return;
4889 } 4889 }
4890 4890
4891 /* 4891 /*
4892 * If this cpu is idle and the idle load balancing is done by 4892 * If this cpu is idle and the idle load balancing is done by
4893 * someone else, then there is no need to raise the SCHED_SOFTIRQ. 4893 * someone else, then there is no need to raise the SCHED_SOFTIRQ.
4894 */ 4894 */
4895 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && 4895 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4896 cpumask_test_cpu(cpu, nohz.cpu_mask)) 4896 cpumask_test_cpu(cpu, nohz.cpu_mask))
4897 return; 4897 return;
4898 #endif 4898 #endif
4899 /* Don't need to rebalance while attached to NULL domain */ 4899 /* Don't need to rebalance while attached to NULL domain */
4900 if (time_after_eq(jiffies, rq->next_balance) && 4900 if (time_after_eq(jiffies, rq->next_balance) &&
4901 likely(!on_null_domain(cpu))) 4901 likely(!on_null_domain(cpu)))
4902 raise_softirq(SCHED_SOFTIRQ); 4902 raise_softirq(SCHED_SOFTIRQ);
4903 } 4903 }
4904 4904
4905 #else /* CONFIG_SMP */ 4905 #else /* CONFIG_SMP */
4906 4906
4907 /* 4907 /*
4908 * on UP we do not need to balance between CPUs: 4908 * on UP we do not need to balance between CPUs:
4909 */ 4909 */
4910 static inline void idle_balance(int cpu, struct rq *rq) 4910 static inline void idle_balance(int cpu, struct rq *rq)
4911 { 4911 {
4912 } 4912 }
4913 4913
4914 #endif 4914 #endif
4915 4915
4916 DEFINE_PER_CPU(struct kernel_stat, kstat); 4916 DEFINE_PER_CPU(struct kernel_stat, kstat);
4917 4917
4918 EXPORT_PER_CPU_SYMBOL(kstat); 4918 EXPORT_PER_CPU_SYMBOL(kstat);
4919 4919
4920 /* 4920 /*
4921 * Return any ns on the sched_clock that have not yet been accounted in 4921 * Return any ns on the sched_clock that have not yet been accounted in
4922 * @p in case that task is currently running. 4922 * @p in case that task is currently running.
4923 * 4923 *
4924 * Called with task_rq_lock() held on @rq. 4924 * Called with task_rq_lock() held on @rq.
4925 */ 4925 */
4926 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) 4926 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
4927 { 4927 {
4928 u64 ns = 0; 4928 u64 ns = 0;
4929 4929
4930 if (task_current(rq, p)) { 4930 if (task_current(rq, p)) {
4931 update_rq_clock(rq); 4931 update_rq_clock(rq);
4932 ns = rq->clock - p->se.exec_start; 4932 ns = rq->clock - p->se.exec_start;
4933 if ((s64)ns < 0) 4933 if ((s64)ns < 0)
4934 ns = 0; 4934 ns = 0;
4935 } 4935 }
4936 4936
4937 return ns; 4937 return ns;
4938 } 4938 }
4939 4939
4940 unsigned long long task_delta_exec(struct task_struct *p) 4940 unsigned long long task_delta_exec(struct task_struct *p)
4941 { 4941 {
4942 unsigned long flags; 4942 unsigned long flags;
4943 struct rq *rq; 4943 struct rq *rq;
4944 u64 ns = 0; 4944 u64 ns = 0;
4945 4945
4946 rq = task_rq_lock(p, &flags); 4946 rq = task_rq_lock(p, &flags);
4947 ns = do_task_delta_exec(p, rq); 4947 ns = do_task_delta_exec(p, rq);
4948 task_rq_unlock(rq, &flags); 4948 task_rq_unlock(rq, &flags);
4949 4949
4950 return ns; 4950 return ns;
4951 } 4951 }
4952 4952
4953 /* 4953 /*
4954 * Return accounted runtime for the task. 4954 * Return accounted runtime for the task.
4955 * In case the task is currently running, return the runtime plus current's 4955 * In case the task is currently running, return the runtime plus current's
4956 * pending runtime that has not been accounted yet. 4956 * pending runtime that has not been accounted yet.
4957 */ 4957 */
4958 unsigned long long task_sched_runtime(struct task_struct *p) 4958 unsigned long long task_sched_runtime(struct task_struct *p)
4959 { 4959 {
4960 unsigned long flags; 4960 unsigned long flags;
4961 struct rq *rq; 4961 struct rq *rq;
4962 u64 ns = 0; 4962 u64 ns = 0;
4963 4963
4964 rq = task_rq_lock(p, &flags); 4964 rq = task_rq_lock(p, &flags);
4965 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 4965 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
4966 task_rq_unlock(rq, &flags); 4966 task_rq_unlock(rq, &flags);
4967 4967
4968 return ns; 4968 return ns;
4969 } 4969 }
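
For context, task_sched_runtime() hands back a plain nanosecond count (the accounted runtime plus any delta still pending on the runqueue for a running task). A minimal, hypothetical consumer sketch follows; report_runtime() and its printk format are invented here purely for illustration and are not part of this change:

static void report_runtime(struct task_struct *p)
{
	u64 ns = task_sched_runtime(p);		/* accounted + pending, in ns */

	do_div(ns, NSEC_PER_MSEC);		/* 64-bit divide helper, safe on 32-bit arches */
	printk(KERN_DEBUG "%s/%d ran for %llu ms\n",
	       p->comm, p->pid, (unsigned long long)ns);
}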
4970 4970
4971 /* 4971 /*
4972 * Return sum_exec_runtime for the thread group. 4972 * Return sum_exec_runtime for the thread group.
4973 * In case the task is currently running, return the sum plus current's 4973 * In case the task is currently running, return the sum plus current's
4974 * pending runtime that has not been accounted yet. 4974 * pending runtime that has not been accounted yet.
4975 * 4975 *
4976 * Note that the thread group might have other running tasks as well, 4976 * Note that the thread group might have other running tasks as well,
4977 * so the return value does not include other pending runtime that other 4977 * so the return value does not include other pending runtime that other
4978 * running tasks might have. 4978 * running tasks might have.
4979 */ 4979 */
4980 unsigned long long thread_group_sched_runtime(struct task_struct *p) 4980 unsigned long long thread_group_sched_runtime(struct task_struct *p)
4981 { 4981 {
4982 struct task_cputime totals; 4982 struct task_cputime totals;
4983 unsigned long flags; 4983 unsigned long flags;
4984 struct rq *rq; 4984 struct rq *rq;
4985 u64 ns; 4985 u64 ns;
4986 4986
4987 rq = task_rq_lock(p, &flags); 4987 rq = task_rq_lock(p, &flags);
4988 thread_group_cputime(p, &totals); 4988 thread_group_cputime(p, &totals);
4989 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 4989 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
4990 task_rq_unlock(rq, &flags); 4990 task_rq_unlock(rq, &flags);
4991 4991
4992 return ns; 4992 return ns;
4993 } 4993 }
4994 4994
4995 /* 4995 /*
4996 * Account user cpu time to a process. 4996 * Account user cpu time to a process.
4997 * @p: the process that the cpu time gets accounted to 4997 * @p: the process that the cpu time gets accounted to
4998 * @cputime: the cpu time spent in user space since the last update 4998 * @cputime: the cpu time spent in user space since the last update
4999 * @cputime_scaled: cputime scaled by cpu frequency 4999 * @cputime_scaled: cputime scaled by cpu frequency
5000 */ 5000 */
5001 void account_user_time(struct task_struct *p, cputime_t cputime, 5001 void account_user_time(struct task_struct *p, cputime_t cputime,
5002 cputime_t cputime_scaled) 5002 cputime_t cputime_scaled)
5003 { 5003 {
5004 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 5004 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5005 cputime64_t tmp; 5005 cputime64_t tmp;
5006 5006
5007 /* Add user time to process. */ 5007 /* Add user time to process. */
5008 p->utime = cputime_add(p->utime, cputime); 5008 p->utime = cputime_add(p->utime, cputime);
5009 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 5009 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
5010 account_group_user_time(p, cputime); 5010 account_group_user_time(p, cputime);
5011 5011
5012 /* Add user time to cpustat. */ 5012 /* Add user time to cpustat. */
5013 tmp = cputime_to_cputime64(cputime); 5013 tmp = cputime_to_cputime64(cputime);
5014 if (TASK_NICE(p) > 0) 5014 if (TASK_NICE(p) > 0)
5015 cpustat->nice = cputime64_add(cpustat->nice, tmp); 5015 cpustat->nice = cputime64_add(cpustat->nice, tmp);
5016 else 5016 else
5017 cpustat->user = cputime64_add(cpustat->user, tmp); 5017 cpustat->user = cputime64_add(cpustat->user, tmp);
5018 5018
5019 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); 5019 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
5020 /* Account for user time used */ 5020 /* Account for user time used */
5021 acct_update_integrals(p); 5021 acct_update_integrals(p);
5022 } 5022 }
5023 5023
5024 /* 5024 /*
5025 * Account guest cpu time to a process. 5025 * Account guest cpu time to a process.
5026 * @p: the process that the cpu time gets accounted to 5026 * @p: the process that the cpu time gets accounted to
5027 * @cputime: the cpu time spent in virtual machine since the last update 5027 * @cputime: the cpu time spent in virtual machine since the last update
5028 * @cputime_scaled: cputime scaled by cpu frequency 5028 * @cputime_scaled: cputime scaled by cpu frequency
5029 */ 5029 */
5030 static void account_guest_time(struct task_struct *p, cputime_t cputime, 5030 static void account_guest_time(struct task_struct *p, cputime_t cputime,
5031 cputime_t cputime_scaled) 5031 cputime_t cputime_scaled)
5032 { 5032 {
5033 cputime64_t tmp; 5033 cputime64_t tmp;
5034 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 5034 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5035 5035
5036 tmp = cputime_to_cputime64(cputime); 5036 tmp = cputime_to_cputime64(cputime);
5037 5037
5038 /* Add guest time to process. */ 5038 /* Add guest time to process. */
5039 p->utime = cputime_add(p->utime, cputime); 5039 p->utime = cputime_add(p->utime, cputime);
5040 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 5040 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
5041 account_group_user_time(p, cputime); 5041 account_group_user_time(p, cputime);
5042 p->gtime = cputime_add(p->gtime, cputime); 5042 p->gtime = cputime_add(p->gtime, cputime);
5043 5043
5044 /* Add guest time to cpustat. */ 5044 /* Add guest time to cpustat. */
5045 if (TASK_NICE(p) > 0) { 5045 if (TASK_NICE(p) > 0) {
5046 cpustat->nice = cputime64_add(cpustat->nice, tmp); 5046 cpustat->nice = cputime64_add(cpustat->nice, tmp);
5047 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); 5047 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
5048 } else { 5048 } else {
5049 cpustat->user = cputime64_add(cpustat->user, tmp); 5049 cpustat->user = cputime64_add(cpustat->user, tmp);
5050 cpustat->guest = cputime64_add(cpustat->guest, tmp); 5050 cpustat->guest = cputime64_add(cpustat->guest, tmp);
5051 } 5051 }
5052 } 5052 }
5053 5053
5054 /* 5054 /*
5055 * Account system cpu time to a process. 5055 * Account system cpu time to a process.
5056 * @p: the process that the cpu time gets accounted to 5056 * @p: the process that the cpu time gets accounted to
5057 * @hardirq_offset: the offset to subtract from hardirq_count() 5057 * @hardirq_offset: the offset to subtract from hardirq_count()
5058 * @cputime: the cpu time spent in kernel space since the last update 5058 * @cputime: the cpu time spent in kernel space since the last update
5059 * @cputime_scaled: cputime scaled by cpu frequency 5059 * @cputime_scaled: cputime scaled by cpu frequency
5060 */ 5060 */
5061 void account_system_time(struct task_struct *p, int hardirq_offset, 5061 void account_system_time(struct task_struct *p, int hardirq_offset,
5062 cputime_t cputime, cputime_t cputime_scaled) 5062 cputime_t cputime, cputime_t cputime_scaled)
5063 { 5063 {
5064 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 5064 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5065 cputime64_t tmp; 5065 cputime64_t tmp;
5066 5066
5067 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 5067 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
5068 account_guest_time(p, cputime, cputime_scaled); 5068 account_guest_time(p, cputime, cputime_scaled);
5069 return; 5069 return;
5070 } 5070 }
5071 5071
5072 /* Add system time to process. */ 5072 /* Add system time to process. */
5073 p->stime = cputime_add(p->stime, cputime); 5073 p->stime = cputime_add(p->stime, cputime);
5074 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); 5074 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
5075 account_group_system_time(p, cputime); 5075 account_group_system_time(p, cputime);
5076 5076
5077 /* Add system time to cpustat. */ 5077 /* Add system time to cpustat. */
5078 tmp = cputime_to_cputime64(cputime); 5078 tmp = cputime_to_cputime64(cputime);
5079 if (hardirq_count() - hardirq_offset) 5079 if (hardirq_count() - hardirq_offset)
5080 cpustat->irq = cputime64_add(cpustat->irq, tmp); 5080 cpustat->irq = cputime64_add(cpustat->irq, tmp);
5081 else if (softirq_count()) 5081 else if (softirq_count())
5082 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 5082 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
5083 else 5083 else
5084 cpustat->system = cputime64_add(cpustat->system, tmp); 5084 cpustat->system = cputime64_add(cpustat->system, tmp);
5085 5085
5086 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); 5086 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
5087 5087
5088 /* Account for system time used */ 5088 /* Account for system time used */
5089 acct_update_integrals(p); 5089 acct_update_integrals(p);
5090 } 5090 }
5091 5091
5092 /* 5092 /*
5093 * Account for involuntary wait time. 5093 * Account for involuntary wait time.
5094 * @cputime: the cpu time spent in involuntary wait 5094 * @cputime: the cpu time spent in involuntary wait
5095 */ 5095 */
5096 void account_steal_time(cputime_t cputime) 5096 void account_steal_time(cputime_t cputime)
5097 { 5097 {
5098 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 5098 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5099 cputime64_t cputime64 = cputime_to_cputime64(cputime); 5099 cputime64_t cputime64 = cputime_to_cputime64(cputime);
5100 5100
5101 cpustat->steal = cputime64_add(cpustat->steal, cputime64); 5101 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
5102 } 5102 }
5103 5103
5104 /* 5104 /*
5105 * Account for idle time. 5105 * Account for idle time.
5106 * @cputime: the cpu time spent in idle wait 5106 * @cputime: the cpu time spent in idle wait
5107 */ 5107 */
5108 void account_idle_time(cputime_t cputime) 5108 void account_idle_time(cputime_t cputime)
5109 { 5109 {
5110 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 5110 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5111 cputime64_t cputime64 = cputime_to_cputime64(cputime); 5111 cputime64_t cputime64 = cputime_to_cputime64(cputime);
5112 struct rq *rq = this_rq(); 5112 struct rq *rq = this_rq();
5113 5113
5114 if (atomic_read(&rq->nr_iowait) > 0) 5114 if (atomic_read(&rq->nr_iowait) > 0)
5115 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); 5115 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
5116 else 5116 else
5117 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 5117 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
5118 } 5118 }
5119 5119
5120 #ifndef CONFIG_VIRT_CPU_ACCOUNTING 5120 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
5121 5121
5122 /* 5122 /*
5123 * Account a single tick of cpu time. 5123 * Account a single tick of cpu time.
5124 * @p: the process that the cpu time gets accounted to 5124 * @p: the process that the cpu time gets accounted to
5125 * @user_tick: indicates if the tick is a user or a system tick 5125 * @user_tick: indicates if the tick is a user or a system tick
5126 */ 5126 */
5127 void account_process_tick(struct task_struct *p, int user_tick) 5127 void account_process_tick(struct task_struct *p, int user_tick)
5128 { 5128 {
5129 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 5129 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
5130 struct rq *rq = this_rq(); 5130 struct rq *rq = this_rq();
5131 5131
5132 if (user_tick) 5132 if (user_tick)
5133 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 5133 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
5134 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 5134 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
5135 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, 5135 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
5136 one_jiffy_scaled); 5136 one_jiffy_scaled);
5137 else 5137 else
5138 account_idle_time(cputime_one_jiffy); 5138 account_idle_time(cputime_one_jiffy);
5139 } 5139 }
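
To make the branching above concrete (a rough reading, not an addition to the change itself): with HZ=1000, a task that spends a full second in user space receives about 1000 user ticks, each adding cputime_one_jiffy to p->utime via account_user_time(); a tick that lands in kernel context, or that interrupts the idle task while it is still inside interrupt or softirq processing (irq_count() != HARDIRQ_OFFSET), is charged through account_system_time(); only a tick taken with the idle task running and nothing else pending ends up in account_idle_time().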
5140 5140
5141 /* 5141 /*
5142 * Account multiple ticks of steal time. 5142 * Account multiple ticks of steal time.
5143 * @p: the process from which the cpu time has been stolen 5143 * @p: the process from which the cpu time has been stolen
5144 * @ticks: number of stolen ticks 5144 * @ticks: number of stolen ticks
5145 */ 5145 */
5146 void account_steal_ticks(unsigned long ticks) 5146 void account_steal_ticks(unsigned long ticks)
5147 { 5147 {
5148 account_steal_time(jiffies_to_cputime(ticks)); 5148 account_steal_time(jiffies_to_cputime(ticks));
5149 } 5149 }
5150 5150
5151 /* 5151 /*
5152 * Account multiple ticks of idle time. 5152 * Account multiple ticks of idle time.
5153 * @ticks: number of idle ticks 5153 * @ticks: number of idle ticks
5154 */ 5154 */
5155 void account_idle_ticks(unsigned long ticks) 5155 void account_idle_ticks(unsigned long ticks)
5156 { 5156 {
5157 account_idle_time(jiffies_to_cputime(ticks)); 5157 account_idle_time(jiffies_to_cputime(ticks));
5158 } 5158 }
5159 5159
5160 #endif 5160 #endif
5161 5161
5162 /* 5162 /*
5163 * Use precise platform statistics if available: 5163 * Use precise platform statistics if available:
5164 */ 5164 */
5165 #ifdef CONFIG_VIRT_CPU_ACCOUNTING 5165 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
5166 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 5166 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5167 { 5167 {
5168 *ut = p->utime; 5168 *ut = p->utime;
5169 *st = p->stime; 5169 *st = p->stime;
5170 } 5170 }
5171 5171
5172 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 5172 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5173 { 5173 {
5174 struct task_cputime cputime; 5174 struct task_cputime cputime;
5175 5175
5176 thread_group_cputime(p, &cputime); 5176 thread_group_cputime(p, &cputime);
5177 5177
5178 *ut = cputime.utime; 5178 *ut = cputime.utime;
5179 *st = cputime.stime; 5179 *st = cputime.stime;
5180 } 5180 }
5181 #else 5181 #else
5182 5182
5183 #ifndef nsecs_to_cputime 5183 #ifndef nsecs_to_cputime
5184 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) 5184 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
5185 #endif 5185 #endif
5186 5186
5187 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 5187 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5188 { 5188 {
5189 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); 5189 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
5190 5190
5191 /* 5191 /*
5192 * Use CFS's precise accounting: 5192 * Use CFS's precise accounting:
5193 */ 5193 */
5194 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 5194 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
5195 5195
5196 if (total) { 5196 if (total) {
5197 u64 temp; 5197 u64 temp;
5198 5198
5199 temp = (u64)(rtime * utime); 5199 temp = (u64)(rtime * utime);
5200 do_div(temp, total); 5200 do_div(temp, total);
5201 utime = (cputime_t)temp; 5201 utime = (cputime_t)temp;
5202 } else 5202 } else
5203 utime = rtime; 5203 utime = rtime;
5204 5204
5205 /* 5205 /*
5206 * Compare with previous values, to keep monotonicity: 5206 * Compare with previous values, to keep monotonicity:
5207 */ 5207 */
5208 p->prev_utime = max(p->prev_utime, utime); 5208 p->prev_utime = max(p->prev_utime, utime);
5209 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); 5209 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
5210 5210
5211 *ut = p->prev_utime; 5211 *ut = p->prev_utime;
5212 *st = p->prev_stime; 5212 *st = p->prev_stime;
5213 } 5213 }
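
A small worked example of the scaling above (illustrative numbers only): suppose tick sampling has recorded utime = 30 and stime = 10 cputime units, so total = 40, while CFS's precise sum_exec_runtime converts to rtime = 48 units. Then temp = 48 * 30 = 1440, utime becomes 1440 / 40 = 36, prev_utime is raised to 36, and prev_stime is raised to at most rtime - prev_utime = 12. The reported 36/12 split preserves the sampled 3:1 user/system ratio while matching the precise total of 48, and because the max() comparisons only ever move the stored values forward, successive reads stay monotonic.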
5214 5214
5215 /* 5215 /*
5216 * Must be called with siglock held. 5216 * Must be called with siglock held.
5217 */ 5217 */
5218 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 5218 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5219 { 5219 {
5220 struct signal_struct *sig = p->signal; 5220 struct signal_struct *sig = p->signal;
5221 struct task_cputime cputime; 5221 struct task_cputime cputime;
5222 cputime_t rtime, utime, total; 5222 cputime_t rtime, utime, total;
5223 5223
5224 thread_group_cputime(p, &cputime); 5224 thread_group_cputime(p, &cputime);
5225 5225
5226 total = cputime_add(cputime.utime, cputime.stime); 5226 total = cputime_add(cputime.utime, cputime.stime);
5227 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 5227 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
5228 5228
5229 if (total) { 5229 if (total) {
5230 u64 temp; 5230 u64 temp;
5231 5231
5232 temp = (u64)(rtime * cputime.utime); 5232 temp = (u64)(rtime * cputime.utime);
5233 do_div(temp, total); 5233 do_div(temp, total);
5234 utime = (cputime_t)temp; 5234 utime = (cputime_t)temp;
5235 } else 5235 } else
5236 utime = rtime; 5236 utime = rtime;
5237 5237
5238 sig->prev_utime = max(sig->prev_utime, utime); 5238 sig->prev_utime = max(sig->prev_utime, utime);
5239 sig->prev_stime = max(sig->prev_stime, 5239 sig->prev_stime = max(sig->prev_stime,
5240 cputime_sub(rtime, sig->prev_utime)); 5240 cputime_sub(rtime, sig->prev_utime));
5241 5241
5242 *ut = sig->prev_utime; 5242 *ut = sig->prev_utime;
5243 *st = sig->prev_stime; 5243 *st = sig->prev_stime;
5244 } 5244 }
5245 #endif 5245 #endif
5246 5246
5247 /* 5247 /*
5248 * This function gets called by the timer code, with HZ frequency. 5248 * This function gets called by the timer code, with HZ frequency.
5249 * We call it with interrupts disabled. 5249 * We call it with interrupts disabled.
5250 * 5250 *
5251 * It also gets called by the fork code, when changing the parent's 5251 * It also gets called by the fork code, when changing the parent's
5252 * timeslices. 5252 * timeslices.
5253 */ 5253 */
5254 void scheduler_tick(void) 5254 void scheduler_tick(void)
5255 { 5255 {
5256 int cpu = smp_processor_id(); 5256 int cpu = smp_processor_id();
5257 struct rq *rq = cpu_rq(cpu); 5257 struct rq *rq = cpu_rq(cpu);
5258 struct task_struct *curr = rq->curr; 5258 struct task_struct *curr = rq->curr;
5259 5259
5260 sched_clock_tick(); 5260 sched_clock_tick();
5261 5261
5262 spin_lock(&rq->lock); 5262 spin_lock(&rq->lock);
5263 update_rq_clock(rq); 5263 update_rq_clock(rq);
5264 update_cpu_load(rq); 5264 update_cpu_load(rq);
5265 curr->sched_class->task_tick(rq, curr, 0); 5265 curr->sched_class->task_tick(rq, curr, 0);
5266 spin_unlock(&rq->lock); 5266 spin_unlock(&rq->lock);
5267 5267
5268 perf_event_task_tick(curr, cpu); 5268 perf_event_task_tick(curr, cpu);
5269 5269
5270 #ifdef CONFIG_SMP 5270 #ifdef CONFIG_SMP
5271 rq->idle_at_tick = idle_cpu(cpu); 5271 rq->idle_at_tick = idle_cpu(cpu);
5272 trigger_load_balance(rq, cpu); 5272 trigger_load_balance(rq, cpu);
5273 #endif 5273 #endif
5274 } 5274 }
5275 5275
5276 notrace unsigned long get_parent_ip(unsigned long addr) 5276 notrace unsigned long get_parent_ip(unsigned long addr)
5277 { 5277 {
5278 if (in_lock_functions(addr)) { 5278 if (in_lock_functions(addr)) {
5279 addr = CALLER_ADDR2; 5279 addr = CALLER_ADDR2;
5280 if (in_lock_functions(addr)) 5280 if (in_lock_functions(addr))
5281 addr = CALLER_ADDR3; 5281 addr = CALLER_ADDR3;
5282 } 5282 }
5283 return addr; 5283 return addr;
5284 } 5284 }
5285 5285
5286 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 5286 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
5287 defined(CONFIG_PREEMPT_TRACER)) 5287 defined(CONFIG_PREEMPT_TRACER))
5288 5288
5289 void __kprobes add_preempt_count(int val) 5289 void __kprobes add_preempt_count(int val)
5290 { 5290 {
5291 #ifdef CONFIG_DEBUG_PREEMPT 5291 #ifdef CONFIG_DEBUG_PREEMPT
5292 /* 5292 /*
5293 * Underflow? 5293 * Underflow?
5294 */ 5294 */
5295 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 5295 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
5296 return; 5296 return;
5297 #endif 5297 #endif
5298 preempt_count() += val; 5298 preempt_count() += val;
5299 #ifdef CONFIG_DEBUG_PREEMPT 5299 #ifdef CONFIG_DEBUG_PREEMPT
5300 /* 5300 /*
5301 * Spinlock count overflowing soon? 5301 * Spinlock count overflowing soon?
5302 */ 5302 */
5303 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 5303 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
5304 PREEMPT_MASK - 10); 5304 PREEMPT_MASK - 10);
5305 #endif 5305 #endif
5306 if (preempt_count() == val) 5306 if (preempt_count() == val)
5307 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 5307 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
5308 } 5308 }
5309 EXPORT_SYMBOL(add_preempt_count); 5309 EXPORT_SYMBOL(add_preempt_count);
5310 5310
5311 void __kprobes sub_preempt_count(int val) 5311 void __kprobes sub_preempt_count(int val)
5312 { 5312 {
5313 #ifdef CONFIG_DEBUG_PREEMPT 5313 #ifdef CONFIG_DEBUG_PREEMPT
5314 /* 5314 /*
5315 * Underflow? 5315 * Underflow?
5316 */ 5316 */
5317 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 5317 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
5318 return; 5318 return;
5319 /* 5319 /*
5320 * Is the spinlock portion underflowing? 5320 * Is the spinlock portion underflowing?
5321 */ 5321 */
5322 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 5322 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
5323 !(preempt_count() & PREEMPT_MASK))) 5323 !(preempt_count() & PREEMPT_MASK)))
5324 return; 5324 return;
5325 #endif 5325 #endif
5326 5326
5327 if (preempt_count() == val) 5327 if (preempt_count() == val)
5328 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 5328 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
5329 preempt_count() -= val; 5329 preempt_count() -= val;
5330 } 5330 }
5331 EXPORT_SYMBOL(sub_preempt_count); 5331 EXPORT_SYMBOL(sub_preempt_count);
5332 5332
5333 #endif 5333 #endif
5334 5334
5335 /* 5335 /*
5336 * Print scheduling while atomic bug: 5336 * Print scheduling while atomic bug:
5337 */ 5337 */
5338 static noinline void __schedule_bug(struct task_struct *prev) 5338 static noinline void __schedule_bug(struct task_struct *prev)
5339 { 5339 {
5340 struct pt_regs *regs = get_irq_regs(); 5340 struct pt_regs *regs = get_irq_regs();
5341 5341
5342 pr_err("BUG: scheduling while atomic: %s/%d/0x%08x\n", 5342 pr_err("BUG: scheduling while atomic: %s/%d/0x%08x\n",
5343 prev->comm, prev->pid, preempt_count()); 5343 prev->comm, prev->pid, preempt_count());
5344 5344
5345 debug_show_held_locks(prev); 5345 debug_show_held_locks(prev);
5346 print_modules(); 5346 print_modules();
5347 if (irqs_disabled()) 5347 if (irqs_disabled())
5348 print_irqtrace_events(prev); 5348 print_irqtrace_events(prev);
5349 5349
5350 if (regs) 5350 if (regs)
5351 show_regs(regs); 5351 show_regs(regs);
5352 else 5352 else
5353 dump_stack(); 5353 dump_stack();
5354 } 5354 }
5355 5355
5356 /* 5356 /*
5357 * Various schedule()-time debugging checks and statistics: 5357 * Various schedule()-time debugging checks and statistics:
5358 */ 5358 */
5359 static inline void schedule_debug(struct task_struct *prev) 5359 static inline void schedule_debug(struct task_struct *prev)
5360 { 5360 {
5361 /* 5361 /*
5362 * Test if we are atomic. Since do_exit() needs to call into 5362 * Test if we are atomic. Since do_exit() needs to call into
5363 * schedule() atomically, we ignore that path for now. 5363 * schedule() atomically, we ignore that path for now.
5364 * Otherwise, whine if we are scheduling when we should not be. 5364 * Otherwise, whine if we are scheduling when we should not be.
5365 */ 5365 */
5366 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 5366 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
5367 __schedule_bug(prev); 5367 __schedule_bug(prev);
5368 5368
5369 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 5369 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
5370 5370
5371 schedstat_inc(this_rq(), sched_count); 5371 schedstat_inc(this_rq(), sched_count);
5372 #ifdef CONFIG_SCHEDSTATS 5372 #ifdef CONFIG_SCHEDSTATS
5373 if (unlikely(prev->lock_depth >= 0)) { 5373 if (unlikely(prev->lock_depth >= 0)) {
5374 schedstat_inc(this_rq(), bkl_count); 5374 schedstat_inc(this_rq(), bkl_count);
5375 schedstat_inc(prev, sched_info.bkl_count); 5375 schedstat_inc(prev, sched_info.bkl_count);
5376 } 5376 }
5377 #endif 5377 #endif
5378 } 5378 }
5379 5379
5380 static void put_prev_task(struct rq *rq, struct task_struct *prev) 5380 static void put_prev_task(struct rq *rq, struct task_struct *prev)
5381 { 5381 {
5382 if (prev->state == TASK_RUNNING) { 5382 if (prev->state == TASK_RUNNING) {
5383 u64 runtime = prev->se.sum_exec_runtime; 5383 u64 runtime = prev->se.sum_exec_runtime;
5384 5384
5385 runtime -= prev->se.prev_sum_exec_runtime; 5385 runtime -= prev->se.prev_sum_exec_runtime;
5386 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); 5386 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5387 5387
5388 /* 5388 /*
5389 * In order to avoid avg_overlap growing stale when we are 5389 * In order to avoid avg_overlap growing stale when we are
5390 * indeed overlapping and hence not getting put to sleep, grow 5390 * indeed overlapping and hence not getting put to sleep, grow
5391 * the avg_overlap on preemption. 5391 * the avg_overlap on preemption.
5392 * 5392 *
5393 * We use the average preemption runtime because that 5393 * We use the average preemption runtime because that
5394 * correlates to the amount of cache footprint a task can 5394 * correlates to the amount of cache footprint a task can
5395 * build up. 5395 * build up.
5396 */ 5396 */
5397 update_avg(&prev->se.avg_overlap, runtime); 5397 update_avg(&prev->se.avg_overlap, runtime);
5398 } 5398 }
5399 prev->sched_class->put_prev_task(rq, prev); 5399 prev->sched_class->put_prev_task(rq, prev);
5400 } 5400 }
5401 5401
5402 /* 5402 /*
5403 * Pick up the highest-prio task: 5403 * Pick up the highest-prio task:
5404 */ 5404 */
5405 static inline struct task_struct * 5405 static inline struct task_struct *
5406 pick_next_task(struct rq *rq) 5406 pick_next_task(struct rq *rq)
5407 { 5407 {
5408 const struct sched_class *class; 5408 const struct sched_class *class;
5409 struct task_struct *p; 5409 struct task_struct *p;
5410 5410
5411 /* 5411 /*
5412 * Optimization: we know that if all tasks are in 5412 * Optimization: we know that if all tasks are in
5413 * the fair class we can call that function directly: 5413 * the fair class we can call that function directly:
5414 */ 5414 */
5415 if (likely(rq->nr_running == rq->cfs.nr_running)) { 5415 if (likely(rq->nr_running == rq->cfs.nr_running)) {
5416 p = fair_sched_class.pick_next_task(rq); 5416 p = fair_sched_class.pick_next_task(rq);
5417 if (likely(p)) 5417 if (likely(p))
5418 return p; 5418 return p;
5419 } 5419 }
5420 5420
5421 class = sched_class_highest; 5421 class = sched_class_highest;
5422 for ( ; ; ) { 5422 for ( ; ; ) {
5423 p = class->pick_next_task(rq); 5423 p = class->pick_next_task(rq);
5424 if (p) 5424 if (p)
5425 return p; 5425 return p;
5426 /* 5426 /*
5427 * Will never be NULL as the idle class always 5427 * Will never be NULL as the idle class always
5428 * returns a non-NULL p: 5428 * returns a non-NULL p:
5429 */ 5429 */
5430 class = class->next; 5430 class = class->next;
5431 } 5431 }
5432 } 5432 }
5433 5433
5434 /* 5434 /*
5435 * schedule() is the main scheduler function. 5435 * schedule() is the main scheduler function.
5436 */ 5436 */
5437 asmlinkage void __sched schedule(void) 5437 asmlinkage void __sched schedule(void)
5438 { 5438 {
5439 struct task_struct *prev, *next; 5439 struct task_struct *prev, *next;
5440 unsigned long *switch_count; 5440 unsigned long *switch_count;
5441 struct rq *rq; 5441 struct rq *rq;
5442 int cpu; 5442 int cpu;
5443 5443
5444 need_resched: 5444 need_resched:
5445 preempt_disable(); 5445 preempt_disable();
5446 cpu = smp_processor_id(); 5446 cpu = smp_processor_id();
5447 rq = cpu_rq(cpu); 5447 rq = cpu_rq(cpu);
5448 rcu_sched_qs(cpu); 5448 rcu_sched_qs(cpu);
5449 prev = rq->curr; 5449 prev = rq->curr;
5450 switch_count = &prev->nivcsw; 5450 switch_count = &prev->nivcsw;
5451 5451
5452 release_kernel_lock(prev); 5452 release_kernel_lock(prev);
5453 need_resched_nonpreemptible: 5453 need_resched_nonpreemptible:
5454 5454
5455 schedule_debug(prev); 5455 schedule_debug(prev);
5456 5456
5457 if (sched_feat(HRTICK)) 5457 if (sched_feat(HRTICK))
5458 hrtick_clear(rq); 5458 hrtick_clear(rq);
5459 5459
5460 spin_lock_irq(&rq->lock); 5460 spin_lock_irq(&rq->lock);
5461 update_rq_clock(rq); 5461 update_rq_clock(rq);
5462 clear_tsk_need_resched(prev); 5462 clear_tsk_need_resched(prev);
5463 5463
5464 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 5464 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
5465 if (unlikely(signal_pending_state(prev->state, prev))) 5465 if (unlikely(signal_pending_state(prev->state, prev)))
5466 prev->state = TASK_RUNNING; 5466 prev->state = TASK_RUNNING;
5467 else 5467 else
5468 deactivate_task(rq, prev, 1); 5468 deactivate_task(rq, prev, 1);
5469 switch_count = &prev->nvcsw; 5469 switch_count = &prev->nvcsw;
5470 } 5470 }
5471 5471
5472 pre_schedule(rq, prev); 5472 pre_schedule(rq, prev);
5473 5473
5474 if (unlikely(!rq->nr_running)) 5474 if (unlikely(!rq->nr_running))
5475 idle_balance(cpu, rq); 5475 idle_balance(cpu, rq);
5476 5476
5477 put_prev_task(rq, prev); 5477 put_prev_task(rq, prev);
5478 next = pick_next_task(rq); 5478 next = pick_next_task(rq);
5479 5479
5480 if (likely(prev != next)) { 5480 if (likely(prev != next)) {
5481 sched_info_switch(prev, next); 5481 sched_info_switch(prev, next);
5482 perf_event_task_sched_out(prev, next, cpu); 5482 perf_event_task_sched_out(prev, next, cpu);
5483 5483
5484 rq->nr_switches++; 5484 rq->nr_switches++;
5485 rq->curr = next; 5485 rq->curr = next;
5486 ++*switch_count; 5486 ++*switch_count;
5487 5487
5488 context_switch(rq, prev, next); /* unlocks the rq */ 5488 context_switch(rq, prev, next); /* unlocks the rq */
5489 /* 5489 /*
5490 * the context switch might have flipped the stack from under 5490 * the context switch might have flipped the stack from under
5491 * us, hence refresh the local variables. 5491 * us, hence refresh the local variables.
5492 */ 5492 */
5493 cpu = smp_processor_id(); 5493 cpu = smp_processor_id();
5494 rq = cpu_rq(cpu); 5494 rq = cpu_rq(cpu);
5495 } else 5495 } else
5496 spin_unlock_irq(&rq->lock); 5496 spin_unlock_irq(&rq->lock);
5497 5497
5498 post_schedule(rq); 5498 post_schedule(rq);
5499 5499
5500 if (unlikely(reacquire_kernel_lock(current) < 0)) 5500 if (unlikely(reacquire_kernel_lock(current) < 0))
5501 goto need_resched_nonpreemptible; 5501 goto need_resched_nonpreemptible;
5502 5502
5503 preempt_enable_no_resched(); 5503 preempt_enable_no_resched();
5504 if (need_resched()) 5504 if (need_resched())
5505 goto need_resched; 5505 goto need_resched;
5506 } 5506 }
5507 EXPORT_SYMBOL(schedule); 5507 EXPORT_SYMBOL(schedule);
5508 5508
5509 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER 5509 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
5510 /* 5510 /*
5511 * Look out! "owner" is an entirely speculative pointer 5511 * Look out! "owner" is an entirely speculative pointer
5512 * access and not reliable. 5512 * access and not reliable.
5513 */ 5513 */
5514 int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) 5514 int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5515 { 5515 {
5516 unsigned int cpu; 5516 unsigned int cpu;
5517 struct rq *rq; 5517 struct rq *rq;
5518 5518
5519 if (!sched_feat(OWNER_SPIN)) 5519 if (!sched_feat(OWNER_SPIN))
5520 return 0; 5520 return 0;
5521 5521
5522 #ifdef CONFIG_DEBUG_PAGEALLOC 5522 #ifdef CONFIG_DEBUG_PAGEALLOC
5523 /* 5523 /*
5524 * Need to access the cpu field knowing that 5524 * Need to access the cpu field knowing that
5525 * DEBUG_PAGEALLOC could have unmapped it if 5525 * DEBUG_PAGEALLOC could have unmapped it if
5526 * the mutex owner just released it and exited. 5526 * the mutex owner just released it and exited.
5527 */ 5527 */
5528 if (probe_kernel_address(&owner->cpu, cpu)) 5528 if (probe_kernel_address(&owner->cpu, cpu))
5529 goto out; 5529 goto out;
5530 #else 5530 #else
5531 cpu = owner->cpu; 5531 cpu = owner->cpu;
5532 #endif 5532 #endif
5533 5533
5534 /* 5534 /*
5535 * Even if the access succeeded (likely case), 5535 * Even if the access succeeded (likely case),
5536 * the cpu field may no longer be valid. 5536 * the cpu field may no longer be valid.
5537 */ 5537 */
5538 if (cpu >= nr_cpumask_bits) 5538 if (cpu >= nr_cpumask_bits)
5539 goto out; 5539 goto out;
5540 5540
5541 /* 5541 /*
5542 * We need to validate that we can do a 5542 * We need to validate that we can do a
5543 * get_cpu() and that we have the percpu area. 5543 * get_cpu() and that we have the percpu area.
5544 */ 5544 */
5545 if (!cpu_online(cpu)) 5545 if (!cpu_online(cpu))
5546 goto out; 5546 goto out;
5547 5547
5548 rq = cpu_rq(cpu); 5548 rq = cpu_rq(cpu);
5549 5549
5550 for (;;) { 5550 for (;;) {
5551 /* 5551 /*
5552 * Owner changed, break to re-assess state. 5552 * Owner changed, break to re-assess state.
5553 */ 5553 */
5554 if (lock->owner != owner) 5554 if (lock->owner != owner)
5555 break; 5555 break;
5556 5556
5557 /* 5557 /*
5558 * Is that owner really running on that cpu? 5558 * Is that owner really running on that cpu?
5559 */ 5559 */
5560 if (task_thread_info(rq->curr) != owner || need_resched()) 5560 if (task_thread_info(rq->curr) != owner || need_resched())
5561 return 0; 5561 return 0;
5562 5562
5563 cpu_relax(); 5563 cpu_relax();
5564 } 5564 }
5565 out: 5565 out:
5566 return 1; 5566 return 1;
5567 } 5567 }
5568 #endif 5568 #endif
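
To place mutex_spin_on_owner() in context: it is meant to be polled from a mutex acquisition slowpath while the lock looks contended. The sketch below is a simplified, hypothetical caller; mutex_optimistic_spin() is an invented name, the real loop lives in kernel/mutex.c and differs in detail, and it assumes the struct mutex layout of this era where count == 1 means unlocked:

static inline int mutex_optimistic_spin(struct mutex *lock)
{
	for (;;) {
		struct thread_info *owner;

		/* Snapshot the owner; it may change or exit under us. */
		owner = ACCESS_ONCE(lock->owner);

		/* Stop spinning once the owner is no longer running on a cpu. */
		if (owner && !mutex_spin_on_owner(lock, owner))
			return 0;

		/* Opportunistic trylock: count == 1 means unlocked. */
		if (atomic_cmpxchg(&lock->count, 1, 0) == 1)
			return 1;

		if (need_resched())
			return 0;

		cpu_relax();	/* be polite to the sibling hardware thread */
	}
}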
5569 5569
5570 #ifdef CONFIG_PREEMPT 5570 #ifdef CONFIG_PREEMPT
5571 /* 5571 /*
5572 * this is the entry point to schedule() from in-kernel preemption 5572 * this is the entry point to schedule() from in-kernel preemption
5573 * off of preempt_enable. Kernel preemptions off the return-from-interrupt 5573 * off of preempt_enable. Kernel preemptions off the return-from-interrupt
5574 * path occur in preempt_schedule_irq() below, which calls schedule() directly. 5574 * path occur in preempt_schedule_irq() below, which calls schedule() directly.
5575 */ 5575 */
5576 asmlinkage void __sched preempt_schedule(void) 5576 asmlinkage void __sched preempt_schedule(void)
5577 { 5577 {
5578 struct thread_info *ti = current_thread_info(); 5578 struct thread_info *ti = current_thread_info();
5579 5579
5580 /* 5580 /*
5581 * If there is a non-zero preempt_count or interrupts are disabled, 5581 * If there is a non-zero preempt_count or interrupts are disabled,
5582 * we do not want to preempt the current task. Just return.. 5582 * we do not want to preempt the current task. Just return..
5583 */ 5583 */
5584 if (likely(ti->preempt_count || irqs_disabled())) 5584 if (likely(ti->preempt_count || irqs_disabled()))
5585 return; 5585 return;
5586 5586
5587 do { 5587 do {
5588 add_preempt_count(PREEMPT_ACTIVE); 5588 add_preempt_count(PREEMPT_ACTIVE);
5589 schedule(); 5589 schedule();
5590 sub_preempt_count(PREEMPT_ACTIVE); 5590 sub_preempt_count(PREEMPT_ACTIVE);
5591 5591
5592 /* 5592 /*
5593 * Check again in case we missed a preemption opportunity 5593 * Check again in case we missed a preemption opportunity
5594 * between schedule and now. 5594 * between schedule and now.
5595 */ 5595 */
5596 barrier(); 5596 barrier();
5597 } while (need_resched()); 5597 } while (need_resched());
5598 } 5598 }
5599 EXPORT_SYMBOL(preempt_schedule); 5599 EXPORT_SYMBOL(preempt_schedule);
5600 5600
5601 /* 5601 /*
5602 * this is the entry point to schedule() from kernel preemption 5602 * this is the entry point to schedule() from kernel preemption
5603 * off of irq context. 5603 * off of irq context.
5604 * Note that this is called and returns with irqs disabled. This 5604 * Note that this is called and returns with irqs disabled. This
5605 * protects us against recursive calls from irq context. 5605 * protects us against recursive calls from irq context.
5606 */ 5606 */
5607 asmlinkage void __sched preempt_schedule_irq(void) 5607 asmlinkage void __sched preempt_schedule_irq(void)
5608 { 5608 {
5609 struct thread_info *ti = current_thread_info(); 5609 struct thread_info *ti = current_thread_info();
5610 5610
5611 /* Catch callers which need to be fixed */ 5611 /* Catch callers which need to be fixed */
5612 BUG_ON(ti->preempt_count || !irqs_disabled()); 5612 BUG_ON(ti->preempt_count || !irqs_disabled());
5613 5613
5614 do { 5614 do {
5615 add_preempt_count(PREEMPT_ACTIVE); 5615 add_preempt_count(PREEMPT_ACTIVE);
5616 local_irq_enable(); 5616 local_irq_enable();
5617 schedule(); 5617 schedule();
5618 local_irq_disable(); 5618 local_irq_disable();
5619 sub_preempt_count(PREEMPT_ACTIVE); 5619 sub_preempt_count(PREEMPT_ACTIVE);
5620 5620
5621 /* 5621 /*
5622 * Check again in case we missed a preemption opportunity 5622 * Check again in case we missed a preemption opportunity
5623 * between schedule and now. 5623 * between schedule and now.
5624 */ 5624 */
5625 barrier(); 5625 barrier();
5626 } while (need_resched()); 5626 } while (need_resched());
5627 } 5627 }
5628 5628
5629 #endif /* CONFIG_PREEMPT */ 5629 #endif /* CONFIG_PREEMPT */
5630 5630
5631 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 5631 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
5632 void *key) 5632 void *key)
5633 { 5633 {
5634 return try_to_wake_up(curr->private, mode, wake_flags); 5634 return try_to_wake_up(curr->private, mode, wake_flags);
5635 } 5635 }
5636 EXPORT_SYMBOL(default_wake_function); 5636 EXPORT_SYMBOL(default_wake_function);
5637 5637
5638 /* 5638 /*
5639 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 5639 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
5640 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 5640 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
5641 * number) then we wake all the non-exclusive tasks and one exclusive task. 5641 * number) then we wake all the non-exclusive tasks and one exclusive task.
5642 * 5642 *
5643 * There are circumstances in which we can try to wake a task which has already 5643 * There are circumstances in which we can try to wake a task which has already
5644 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 5644 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
5645 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5645 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5646 */ 5646 */
5647 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5647 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5648 int nr_exclusive, int wake_flags, void *key) 5648 int nr_exclusive, int wake_flags, void *key)
5649 { 5649 {
5650 wait_queue_t *curr, *next; 5650 wait_queue_t *curr, *next;
5651 5651
5652 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 5652 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
5653 unsigned flags = curr->flags; 5653 unsigned flags = curr->flags;
5654 5654
5655 if (curr->func(curr, mode, wake_flags, key) && 5655 if (curr->func(curr, mode, wake_flags, key) &&
5656 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 5656 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
5657 break; 5657 break;
5658 } 5658 }
5659 } 5659 }
5660 5660
5661 /** 5661 /**
5662 * __wake_up - wake up threads blocked on a waitqueue. 5662 * __wake_up - wake up threads blocked on a waitqueue.
5663 * @q: the waitqueue 5663 * @q: the waitqueue
5664 * @mode: which threads 5664 * @mode: which threads
5665 * @nr_exclusive: how many wake-one or wake-many threads to wake up 5665 * @nr_exclusive: how many wake-one or wake-many threads to wake up
5666 * @key: is directly passed to the wakeup function 5666 * @key: is directly passed to the wakeup function
5667 * 5667 *
5668 * It may be assumed that this function implies a write memory barrier before 5668 * It may be assumed that this function implies a write memory barrier before
5669 * changing the task state if and only if any tasks are woken up. 5669 * changing the task state if and only if any tasks are woken up.
5670 */ 5670 */
5671 void __wake_up(wait_queue_head_t *q, unsigned int mode, 5671 void __wake_up(wait_queue_head_t *q, unsigned int mode,
5672 int nr_exclusive, void *key) 5672 int nr_exclusive, void *key)
5673 { 5673 {
5674 unsigned long flags; 5674 unsigned long flags;
5675 5675
5676 spin_lock_irqsave(&q->lock, flags); 5676 spin_lock_irqsave(&q->lock, flags);
5677 __wake_up_common(q, mode, nr_exclusive, 0, key); 5677 __wake_up_common(q, mode, nr_exclusive, 0, key);
5678 spin_unlock_irqrestore(&q->lock, flags); 5678 spin_unlock_irqrestore(&q->lock, flags);
5679 } 5679 }
5680 EXPORT_SYMBOL(__wake_up); 5680 EXPORT_SYMBOL(__wake_up);
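
As a usage note: callers rarely invoke __wake_up() directly; they typically go through the wait_event_*() and wake_up() wrappers, which funnel into __wake_up_common() above. A minimal sketch, with my_wq and my_cond as hypothetical names and assuming the usual <linux/wait.h> declarations:

static DECLARE_WAIT_QUEUE_HEAD(my_wq);		/* hypothetical wait queue */
static int my_cond;				/* hypothetical condition */

static int consumer(void)
{
	/* Sleeps until my_cond is set, or returns -ERESTARTSYS on a signal. */
	return wait_event_interruptible(my_wq, my_cond != 0);
}

static void producer(void)
{
	my_cond = 1;
	wake_up(&my_wq);	/* wakes all non-exclusive waiters and at most one exclusive one */
}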
5681 5681
5682 /* 5682 /*
5683 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 5683 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
5684 */ 5684 */
5685 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 5685 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
5686 { 5686 {
5687 __wake_up_common(q, mode, 1, 0, NULL); 5687 __wake_up_common(q, mode, 1, 0, NULL);
5688 } 5688 }
5689 5689
5690 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 5690 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
5691 { 5691 {
5692 __wake_up_common(q, mode, 1, 0, key); 5692 __wake_up_common(q, mode, 1, 0, key);
5693 } 5693 }
5694 5694
5695 /** 5695 /**
5696 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 5696 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
5697 * @q: the waitqueue 5697 * @q: the waitqueue
5698 * @mode: which threads 5698 * @mode: which threads
5699 * @nr_exclusive: how many wake-one or wake-many threads to wake up 5699 * @nr_exclusive: how many wake-one or wake-many threads to wake up
5700 * @key: opaque value to be passed to wakeup targets 5700 * @key: opaque value to be passed to wakeup targets
5701 * 5701 *
5702 * The sync wakeup differs in that the waker knows that it will schedule 5702 * The sync wakeup differs in that the waker knows that it will schedule
5703 * away soon, so while the target thread will be woken up, it will not 5703 * away soon, so while the target thread will be woken up, it will not
5704 * be migrated to another CPU - ie. the two threads are 'synchronized' 5704 * be migrated to another CPU - ie. the two threads are 'synchronized'
5705 * with each other. This can prevent needless bouncing between CPUs. 5705 * with each other. This can prevent needless bouncing between CPUs.
5706 * 5706 *
5707 * On UP it can prevent extra preemption. 5707 * On UP it can prevent extra preemption.
5708 * 5708 *
5709 * It may be assumed that this function implies a write memory barrier before 5709 * It may be assumed that this function implies a write memory barrier before
5710 * changing the task state if and only if any tasks are woken up. 5710 * changing the task state if and only if any tasks are woken up.
5711 */ 5711 */
5712 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 5712 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5713 int nr_exclusive, void *key) 5713 int nr_exclusive, void *key)
5714 { 5714 {
5715 unsigned long flags; 5715 unsigned long flags;
5716 int wake_flags = WF_SYNC; 5716 int wake_flags = WF_SYNC;
5717 5717
5718 if (unlikely(!q)) 5718 if (unlikely(!q))
5719 return; 5719 return;
5720 5720
5721 if (unlikely(!nr_exclusive)) 5721 if (unlikely(!nr_exclusive))
5722 wake_flags = 0; 5722 wake_flags = 0;
5723 5723
5724 spin_lock_irqsave(&q->lock, flags); 5724 spin_lock_irqsave(&q->lock, flags);
5725 __wake_up_common(q, mode, nr_exclusive, wake_flags, key); 5725 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
5726 spin_unlock_irqrestore(&q->lock, flags); 5726 spin_unlock_irqrestore(&q->lock, flags);
5727 } 5727 }
5728 EXPORT_SYMBOL_GPL(__wake_up_sync_key); 5728 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
5729 5729
5730 /* 5730 /*
5731 * __wake_up_sync - see __wake_up_sync_key() 5731 * __wake_up_sync - see __wake_up_sync_key()
5732 */ 5732 */
5733 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 5733 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
5734 { 5734 {
5735 __wake_up_sync_key(q, mode, nr_exclusive, NULL); 5735 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
5736 } 5736 }
5737 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 5737 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
5738 5738
5739 /** 5739 /**
5740 * complete: - signals a single thread waiting on this completion 5740 * complete: - signals a single thread waiting on this completion
5741 * @x: holds the state of this particular completion 5741 * @x: holds the state of this particular completion
5742 * 5742 *
5743 * This will wake up a single thread waiting on this completion. Threads will be 5743 * This will wake up a single thread waiting on this completion. Threads will be
5744 * awakened in the same order in which they were queued. 5744 * awakened in the same order in which they were queued.
5745 * 5745 *
5746 * See also complete_all(), wait_for_completion() and related routines. 5746 * See also complete_all(), wait_for_completion() and related routines.
5747 * 5747 *
5748 * It may be assumed that this function implies a write memory barrier before 5748 * It may be assumed that this function implies a write memory barrier before
5749 * changing the task state if and only if any tasks are woken up. 5749 * changing the task state if and only if any tasks are woken up.
5750 */ 5750 */
5751 void complete(struct completion *x) 5751 void complete(struct completion *x)
5752 { 5752 {
5753 unsigned long flags; 5753 unsigned long flags;
5754 5754
5755 spin_lock_irqsave(&x->wait.lock, flags); 5755 spin_lock_irqsave(&x->wait.lock, flags);
5756 x->done++; 5756 x->done++;
5757 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); 5757 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
5758 spin_unlock_irqrestore(&x->wait.lock, flags); 5758 spin_unlock_irqrestore(&x->wait.lock, flags);
5759 } 5759 }
5760 EXPORT_SYMBOL(complete); 5760 EXPORT_SYMBOL(complete);
5761 5761
5762 /** 5762 /**
5763 * complete_all: - signals all threads waiting on this completion 5763 * complete_all: - signals all threads waiting on this completion
5764 * @x: holds the state of this particular completion 5764 * @x: holds the state of this particular completion
5765 * 5765 *
5766 * This will wake up all threads waiting on this particular completion event. 5766 * This will wake up all threads waiting on this particular completion event.
5767 * 5767 *
5768 * It may be assumed that this function implies a write memory barrier before 5768 * It may be assumed that this function implies a write memory barrier before
5769 * changing the task state if and only if any tasks are woken up. 5769 * changing the task state if and only if any tasks are woken up.
5770 */ 5770 */
5771 void complete_all(struct completion *x) 5771 void complete_all(struct completion *x)
5772 { 5772 {
5773 unsigned long flags; 5773 unsigned long flags;
5774 5774
5775 spin_lock_irqsave(&x->wait.lock, flags); 5775 spin_lock_irqsave(&x->wait.lock, flags);
5776 x->done += UINT_MAX/2; 5776 x->done += UINT_MAX/2;
5777 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); 5777 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
5778 spin_unlock_irqrestore(&x->wait.lock, flags); 5778 spin_unlock_irqrestore(&x->wait.lock, flags);
5779 } 5779 }
5780 EXPORT_SYMBOL(complete_all); 5780 EXPORT_SYMBOL(complete_all);
5781 5781
5782 static inline long __sched 5782 static inline long __sched
5783 do_wait_for_common(struct completion *x, long timeout, int state) 5783 do_wait_for_common(struct completion *x, long timeout, int state)
5784 { 5784 {
5785 if (!x->done) { 5785 if (!x->done) {
5786 DECLARE_WAITQUEUE(wait, current); 5786 DECLARE_WAITQUEUE(wait, current);
5787 5787
5788 wait.flags |= WQ_FLAG_EXCLUSIVE; 5788 wait.flags |= WQ_FLAG_EXCLUSIVE;
5789 __add_wait_queue_tail(&x->wait, &wait); 5789 __add_wait_queue_tail(&x->wait, &wait);
5790 do { 5790 do {
5791 if (signal_pending_state(state, current)) { 5791 if (signal_pending_state(state, current)) {
5792 timeout = -ERESTARTSYS; 5792 timeout = -ERESTARTSYS;
5793 break; 5793 break;
5794 } 5794 }
5795 __set_current_state(state); 5795 __set_current_state(state);
5796 spin_unlock_irq(&x->wait.lock); 5796 spin_unlock_irq(&x->wait.lock);
5797 timeout = schedule_timeout(timeout); 5797 timeout = schedule_timeout(timeout);
5798 spin_lock_irq(&x->wait.lock); 5798 spin_lock_irq(&x->wait.lock);
5799 } while (!x->done && timeout); 5799 } while (!x->done && timeout);
5800 __remove_wait_queue(&x->wait, &wait); 5800 __remove_wait_queue(&x->wait, &wait);
5801 if (!x->done) 5801 if (!x->done)
5802 return timeout; 5802 return timeout;
5803 } 5803 }
5804 x->done--; 5804 x->done--;
5805 return timeout ?: 1; 5805 return timeout ?: 1;
5806 } 5806 }
5807 5807
5808 static long __sched 5808 static long __sched
5809 wait_for_common(struct completion *x, long timeout, int state) 5809 wait_for_common(struct completion *x, long timeout, int state)
5810 { 5810 {
5811 might_sleep(); 5811 might_sleep();
5812 5812
5813 spin_lock_irq(&x->wait.lock); 5813 spin_lock_irq(&x->wait.lock);
5814 timeout = do_wait_for_common(x, timeout, state); 5814 timeout = do_wait_for_common(x, timeout, state);
5815 spin_unlock_irq(&x->wait.lock); 5815 spin_unlock_irq(&x->wait.lock);
5816 return timeout; 5816 return timeout;
5817 } 5817 }
5818 5818
5819 /** 5819 /**
5820 * wait_for_completion: - waits for completion of a task 5820 * wait_for_completion: - waits for completion of a task
5821 * @x: holds the state of this particular completion 5821 * @x: holds the state of this particular completion
5822 * 5822 *
5823 * This waits to be signaled for completion of a specific task. It is NOT 5823 * This waits to be signaled for completion of a specific task. It is NOT
5824 * interruptible and there is no timeout. 5824 * interruptible and there is no timeout.
5825 * 5825 *
5826 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout 5826 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
5827 * and interrupt capability. Also see complete(). 5827 * and interrupt capability. Also see complete().
5828 */ 5828 */
5829 void __sched wait_for_completion(struct completion *x) 5829 void __sched wait_for_completion(struct completion *x)
5830 { 5830 {
5831 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 5831 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
5832 } 5832 }
5833 EXPORT_SYMBOL(wait_for_completion); 5833 EXPORT_SYMBOL(wait_for_completion);
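A minimal usage sketch, not part of this file: pairing wait_for_completion() with complete() to wait for a helper kthread. The names setup_done, worker_fn and start_and_wait are invented for the example, which assumes <linux/completion.h>, <linux/kthread.h> and <linux/err.h> are included.

	static struct completion setup_done;		/* hypothetical */

	static int worker_fn(void *unused)
	{
		/* ... perform the setup work ... */
		complete(&setup_done);			/* wake one waiter */
		return 0;
	}

	static int start_and_wait(void)
	{
		struct task_struct *t;

		init_completion(&setup_done);
		t = kthread_run(worker_fn, NULL, "setup-worker");
		if (IS_ERR(t))
			return PTR_ERR(t);
		wait_for_completion(&setup_done);	/* uninterruptible, no timeout */
		return 0;
	}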
5834 5834
5835 /** 5835 /**
5836 * wait_for_completion_timeout: - waits for completion of a task (w/timeout) 5836 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
5837 * @x: holds the state of this particular completion 5837 * @x: holds the state of this particular completion
5838 * @timeout: timeout value in jiffies 5838 * @timeout: timeout value in jiffies
5839 * 5839 *
5840 * This waits for either completion of a specific task to be signaled or for a 5840 * This waits for either completion of a specific task to be signaled or for a
5841 * specified timeout to expire. The timeout is in jiffies. It is not 5841 * specified timeout to expire. The timeout is in jiffies. It is not
5842 * interruptible. 5842 * interruptible.
5843 */ 5843 */
5844 unsigned long __sched 5844 unsigned long __sched
5845 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 5845 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
5846 { 5846 {
5847 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); 5847 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
5848 } 5848 }
5849 EXPORT_SYMBOL(wait_for_completion_timeout); 5849 EXPORT_SYMBOL(wait_for_completion_timeout);
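A sketch of the timeout variant, reusing the hypothetical setup_done completion from the previous example and msecs_to_jiffies() from <linux/jiffies.h>: the return value is 0 if the timeout expired, otherwise the number of jiffies remaining (at least 1).

	unsigned long left;

	left = wait_for_completion_timeout(&setup_done, msecs_to_jiffies(100));
	if (!left)
		pr_warn("setup did not complete within 100ms\n");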
5850 5850
5851 /** 5851 /**
5852 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 5852 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
5853 * @x: holds the state of this particular completion 5853 * @x: holds the state of this particular completion
5854 * 5854 *
5855 * This waits for completion of a specific task to be signaled. It is 5855 * This waits for completion of a specific task to be signaled. It is
5856 * interruptible. 5856 * interruptible.
5857 */ 5857 */
5858 int __sched wait_for_completion_interruptible(struct completion *x) 5858 int __sched wait_for_completion_interruptible(struct completion *x)
5859 { 5859 {
5860 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 5860 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
5861 if (t == -ERESTARTSYS) 5861 if (t == -ERESTARTSYS)
5862 return t; 5862 return t;
5863 return 0; 5863 return 0;
5864 } 5864 }
5865 EXPORT_SYMBOL(wait_for_completion_interruptible); 5865 EXPORT_SYMBOL(wait_for_completion_interruptible);
5866 5866
5867 /** 5867 /**
5868 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) 5868 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
5869 * @x: holds the state of this particular completion 5869 * @x: holds the state of this particular completion
5870 * @timeout: timeout value in jiffies 5870 * @timeout: timeout value in jiffies
5871 * 5871 *
5872 * This waits for either completion of a specific task to be signaled or for a 5872 * This waits for either completion of a specific task to be signaled or for a
5873 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 5873 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
5874 */ 5874 */
5875 unsigned long __sched 5875 unsigned long __sched
5876 wait_for_completion_interruptible_timeout(struct completion *x, 5876 wait_for_completion_interruptible_timeout(struct completion *x,
5877 unsigned long timeout) 5877 unsigned long timeout)
5878 { 5878 {
5879 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); 5879 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
5880 } 5880 }
5881 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 5881 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
5882 5882
5883 /** 5883 /**
5884 * wait_for_completion_killable: - waits for completion of a task (killable) 5884 * wait_for_completion_killable: - waits for completion of a task (killable)
5885 * @x: holds the state of this particular completion 5885 * @x: holds the state of this particular completion
5886 * 5886 *
5887 * This waits to be signaled for completion of a specific task. It can be 5887 * This waits to be signaled for completion of a specific task. It can be
5888 * interrupted by a kill signal. 5888 * interrupted by a kill signal.
5889 */ 5889 */
5890 int __sched wait_for_completion_killable(struct completion *x) 5890 int __sched wait_for_completion_killable(struct completion *x)
5891 { 5891 {
5892 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 5892 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
5893 if (t == -ERESTARTSYS) 5893 if (t == -ERESTARTSYS)
5894 return t; 5894 return t;
5895 return 0; 5895 return 0;
5896 } 5896 }
5897 EXPORT_SYMBOL(wait_for_completion_killable); 5897 EXPORT_SYMBOL(wait_for_completion_killable);
5898 5898
5899 /** 5899 /**
5900 * try_wait_for_completion - try to decrement a completion without blocking 5900 * try_wait_for_completion - try to decrement a completion without blocking
5901 * @x: completion structure 5901 * @x: completion structure
5902 * 5902 *
5903 * Returns: 0 if a decrement cannot be done without blocking 5903 * Returns: 0 if a decrement cannot be done without blocking
5904 * 1 if a decrement succeeded. 5904 * 1 if a decrement succeeded.
5905 * 5905 *
5906 * If a completion is being used as a counting completion, 5906 * If a completion is being used as a counting completion,
5907 * attempt to decrement the counter without blocking. This 5907 * attempt to decrement the counter without blocking. This
5908 * enables us to avoid waiting if the resource the completion 5908 * enables us to avoid waiting if the resource the completion
5909 * is protecting is not available. 5909 * is protecting is not available.
5910 */ 5910 */
5911 bool try_wait_for_completion(struct completion *x) 5911 bool try_wait_for_completion(struct completion *x)
5912 { 5912 {
5913 unsigned long flags; 5913 unsigned long flags;
5914 int ret = 1; 5914 int ret = 1;
5915 5915
5916 spin_lock_irqsave(&x->wait.lock, flags); 5916 spin_lock_irqsave(&x->wait.lock, flags);
5917 if (!x->done) 5917 if (!x->done)
5918 ret = 0; 5918 ret = 0;
5919 else 5919 else
5920 x->done--; 5920 x->done--;
5921 spin_unlock_irqrestore(&x->wait.lock, flags); 5921 spin_unlock_irqrestore(&x->wait.lock, flags);
5922 return ret; 5922 return ret;
5923 } 5923 }
5924 EXPORT_SYMBOL(try_wait_for_completion); 5924 EXPORT_SYMBOL(try_wait_for_completion);
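A sketch of the counting use mentioned above: if complete() has been called once per free slot of some resource, try_wait_for_completion() claims a slot without ever sleeping. slots_available is a hypothetical completion used as the counter.

	if (try_wait_for_completion(&slots_available)) {
		/* claimed a slot without blocking */
	} else {
		/* nothing free right now: back off or fall back to a blocking wait */
	}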
5925 5925
5926 /** 5926 /**
5927 * completion_done - Test to see if a completion has any waiters 5927 * completion_done - Test to see if a completion has any waiters
5928 * @x: completion structure 5928 * @x: completion structure
5929 * 5929 *
5930 * Returns: 0 if there are waiters (wait_for_completion() in progress) 5930 * Returns: 0 if there are waiters (wait_for_completion() in progress)
5931 * 1 if there are no waiters. 5931 * 1 if there are no waiters.
5932 * 5932 *
5933 */ 5933 */
5934 bool completion_done(struct completion *x) 5934 bool completion_done(struct completion *x)
5935 { 5935 {
5936 unsigned long flags; 5936 unsigned long flags;
5937 int ret = 1; 5937 int ret = 1;
5938 5938
5939 spin_lock_irqsave(&x->wait.lock, flags); 5939 spin_lock_irqsave(&x->wait.lock, flags);
5940 if (!x->done) 5940 if (!x->done)
5941 ret = 0; 5941 ret = 0;
5942 spin_unlock_irqrestore(&x->wait.lock, flags); 5942 spin_unlock_irqrestore(&x->wait.lock, flags);
5943 return ret; 5943 return ret;
5944 } 5944 }
5945 EXPORT_SYMBOL(completion_done); 5945 EXPORT_SYMBOL(completion_done);
5946 5946
5947 static long __sched 5947 static long __sched
5948 sleep_on_common(wait_queue_head_t *q, int state, long timeout) 5948 sleep_on_common(wait_queue_head_t *q, int state, long timeout)
5949 { 5949 {
5950 unsigned long flags; 5950 unsigned long flags;
5951 wait_queue_t wait; 5951 wait_queue_t wait;
5952 5952
5953 init_waitqueue_entry(&wait, current); 5953 init_waitqueue_entry(&wait, current);
5954 5954
5955 __set_current_state(state); 5955 __set_current_state(state);
5956 5956
5957 spin_lock_irqsave(&q->lock, flags); 5957 spin_lock_irqsave(&q->lock, flags);
5958 __add_wait_queue(q, &wait); 5958 __add_wait_queue(q, &wait);
5959 spin_unlock(&q->lock); 5959 spin_unlock(&q->lock);
5960 timeout = schedule_timeout(timeout); 5960 timeout = schedule_timeout(timeout);
5961 spin_lock_irq(&q->lock); 5961 spin_lock_irq(&q->lock);
5962 __remove_wait_queue(q, &wait); 5962 __remove_wait_queue(q, &wait);
5963 spin_unlock_irqrestore(&q->lock, flags); 5963 spin_unlock_irqrestore(&q->lock, flags);
5964 5964
5965 return timeout; 5965 return timeout;
5966 } 5966 }
5967 5967
5968 void __sched interruptible_sleep_on(wait_queue_head_t *q) 5968 void __sched interruptible_sleep_on(wait_queue_head_t *q)
5969 { 5969 {
5970 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 5970 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
5971 } 5971 }
5972 EXPORT_SYMBOL(interruptible_sleep_on); 5972 EXPORT_SYMBOL(interruptible_sleep_on);
5973 5973
5974 long __sched 5974 long __sched
5975 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 5975 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
5976 { 5976 {
5977 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); 5977 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
5978 } 5978 }
5979 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 5979 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
5980 5980
5981 void __sched sleep_on(wait_queue_head_t *q) 5981 void __sched sleep_on(wait_queue_head_t *q)
5982 { 5982 {
5983 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 5983 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
5984 } 5984 }
5985 EXPORT_SYMBOL(sleep_on); 5985 EXPORT_SYMBOL(sleep_on);
5986 5986
5987 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 5987 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
5988 { 5988 {
5989 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); 5989 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
5990 } 5990 }
5991 EXPORT_SYMBOL(sleep_on_timeout); 5991 EXPORT_SYMBOL(sleep_on_timeout);
5992 5992
5993 #ifdef CONFIG_RT_MUTEXES 5993 #ifdef CONFIG_RT_MUTEXES
5994 5994
5995 /* 5995 /*
5996 * rt_mutex_setprio - set the current priority of a task 5996 * rt_mutex_setprio - set the current priority of a task
5997 * @p: task 5997 * @p: task
5998 * @prio: prio value (kernel-internal form) 5998 * @prio: prio value (kernel-internal form)
5999 * 5999 *
6000 * This function changes the 'effective' priority of a task. It does 6000 * This function changes the 'effective' priority of a task. It does
6001 * not touch ->normal_prio like __setscheduler(). 6001 * not touch ->normal_prio like __setscheduler().
6002 * 6002 *
6003 * Used by the rt_mutex code to implement priority inheritance logic. 6003 * Used by the rt_mutex code to implement priority inheritance logic.
6004 */ 6004 */
6005 void rt_mutex_setprio(struct task_struct *p, int prio) 6005 void rt_mutex_setprio(struct task_struct *p, int prio)
6006 { 6006 {
6007 unsigned long flags; 6007 unsigned long flags;
6008 int oldprio, on_rq, running; 6008 int oldprio, on_rq, running;
6009 struct rq *rq; 6009 struct rq *rq;
6010 const struct sched_class *prev_class = p->sched_class; 6010 const struct sched_class *prev_class = p->sched_class;
6011 6011
6012 BUG_ON(prio < 0 || prio > MAX_PRIO); 6012 BUG_ON(prio < 0 || prio > MAX_PRIO);
6013 6013
6014 rq = task_rq_lock(p, &flags); 6014 rq = task_rq_lock(p, &flags);
6015 update_rq_clock(rq); 6015 update_rq_clock(rq);
6016 6016
6017 oldprio = p->prio; 6017 oldprio = p->prio;
6018 on_rq = p->se.on_rq; 6018 on_rq = p->se.on_rq;
6019 running = task_current(rq, p); 6019 running = task_current(rq, p);
6020 if (on_rq) 6020 if (on_rq)
6021 dequeue_task(rq, p, 0); 6021 dequeue_task(rq, p, 0);
6022 if (running) 6022 if (running)
6023 p->sched_class->put_prev_task(rq, p); 6023 p->sched_class->put_prev_task(rq, p);
6024 6024
6025 if (rt_prio(prio)) 6025 if (rt_prio(prio))
6026 p->sched_class = &rt_sched_class; 6026 p->sched_class = &rt_sched_class;
6027 else 6027 else
6028 p->sched_class = &fair_sched_class; 6028 p->sched_class = &fair_sched_class;
6029 6029
6030 p->prio = prio; 6030 p->prio = prio;
6031 6031
6032 if (running) 6032 if (running)
6033 p->sched_class->set_curr_task(rq); 6033 p->sched_class->set_curr_task(rq);
6034 if (on_rq) { 6034 if (on_rq) {
6035 enqueue_task(rq, p, 0); 6035 enqueue_task(rq, p, 0);
6036 6036
6037 check_class_changed(rq, p, prev_class, oldprio, running); 6037 check_class_changed(rq, p, prev_class, oldprio, running);
6038 } 6038 }
6039 task_rq_unlock(rq, &flags); 6039 task_rq_unlock(rq, &flags);
6040 } 6040 }
6041 6041
6042 #endif 6042 #endif
6043 6043
6044 void set_user_nice(struct task_struct *p, long nice) 6044 void set_user_nice(struct task_struct *p, long nice)
6045 { 6045 {
6046 int old_prio, delta, on_rq; 6046 int old_prio, delta, on_rq;
6047 unsigned long flags; 6047 unsigned long flags;
6048 struct rq *rq; 6048 struct rq *rq;
6049 6049
6050 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 6050 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
6051 return; 6051 return;
6052 /* 6052 /*
6053 * We have to be careful: if called from sys_setpriority(), 6053 * We have to be careful: if called from sys_setpriority(),
6054 * the task might be in the middle of scheduling on another CPU. 6054 * the task might be in the middle of scheduling on another CPU.
6055 */ 6055 */
6056 rq = task_rq_lock(p, &flags); 6056 rq = task_rq_lock(p, &flags);
6057 update_rq_clock(rq); 6057 update_rq_clock(rq);
6058 /* 6058 /*
6059 * The RT priorities are set via sched_setscheduler(), but we still 6059 * The RT priorities are set via sched_setscheduler(), but we still
6060 * allow the 'normal' nice value to be set - but as expected 6060 * allow the 'normal' nice value to be set - but as expected
6061 * it won't have any effect on scheduling until the task is 6061 * it won't have any effect on scheduling until the task is
6062 * SCHED_FIFO/SCHED_RR: 6062 * SCHED_FIFO/SCHED_RR:
6063 */ 6063 */
6064 if (task_has_rt_policy(p)) { 6064 if (task_has_rt_policy(p)) {
6065 p->static_prio = NICE_TO_PRIO(nice); 6065 p->static_prio = NICE_TO_PRIO(nice);
6066 goto out_unlock; 6066 goto out_unlock;
6067 } 6067 }
6068 on_rq = p->se.on_rq; 6068 on_rq = p->se.on_rq;
6069 if (on_rq) 6069 if (on_rq)
6070 dequeue_task(rq, p, 0); 6070 dequeue_task(rq, p, 0);
6071 6071
6072 p->static_prio = NICE_TO_PRIO(nice); 6072 p->static_prio = NICE_TO_PRIO(nice);
6073 set_load_weight(p); 6073 set_load_weight(p);
6074 old_prio = p->prio; 6074 old_prio = p->prio;
6075 p->prio = effective_prio(p); 6075 p->prio = effective_prio(p);
6076 delta = p->prio - old_prio; 6076 delta = p->prio - old_prio;
6077 6077
6078 if (on_rq) { 6078 if (on_rq) {
6079 enqueue_task(rq, p, 0); 6079 enqueue_task(rq, p, 0);
6080 /* 6080 /*
6081 * If the task increased its priority or is running and 6081 * If the task increased its priority or is running and
6082 * lowered its priority, then reschedule its CPU: 6082 * lowered its priority, then reschedule its CPU:
6083 */ 6083 */
6084 if (delta < 0 || (delta > 0 && task_running(rq, p))) 6084 if (delta < 0 || (delta > 0 && task_running(rq, p)))
6085 resched_task(rq->curr); 6085 resched_task(rq->curr);
6086 } 6086 }
6087 out_unlock: 6087 out_unlock:
6088 task_rq_unlock(rq, &flags); 6088 task_rq_unlock(rq, &flags);
6089 } 6089 }
6090 EXPORT_SYMBOL(set_user_nice); 6090 EXPORT_SYMBOL(set_user_nice);
6091 6091
6092 /* 6092 /*
6093 * can_nice - check if a task can reduce its nice value 6093 * can_nice - check if a task can reduce its nice value
6094 * @p: task 6094 * @p: task
6095 * @nice: nice value 6095 * @nice: nice value
6096 */ 6096 */
6097 int can_nice(const struct task_struct *p, const int nice) 6097 int can_nice(const struct task_struct *p, const int nice)
6098 { 6098 {
6099 /* convert nice value [19,-20] to rlimit style value [1,40] */ 6099 /* convert nice value [19,-20] to rlimit style value [1,40] */
6100 int nice_rlim = 20 - nice; 6100 int nice_rlim = 20 - nice;
6101 6101
6102 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 6102 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
6103 capable(CAP_SYS_NICE)); 6103 capable(CAP_SYS_NICE));
6104 } 6104 }
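For example, a request for nice -5 maps to nice_rlim = 20 - (-5) = 25, so an unprivileged caller needs RLIMIT_NICE of at least 25 (or CAP_SYS_NICE); nice 19 maps to 1 and nice -20 to 40, the two ends of the rlimit-style range noted in the comment above.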
6105 6105
6106 #ifdef __ARCH_WANT_SYS_NICE 6106 #ifdef __ARCH_WANT_SYS_NICE
6107 6107
6108 /* 6108 /*
6109 * sys_nice - change the priority of the current process. 6109 * sys_nice - change the priority of the current process.
6110 * @increment: priority increment 6110 * @increment: priority increment
6111 * 6111 *
6112 * sys_setpriority is a more generic, but much slower function that 6112 * sys_setpriority is a more generic, but much slower function that
6113 * does similar things. 6113 * does similar things.
6114 */ 6114 */
6115 SYSCALL_DEFINE1(nice, int, increment) 6115 SYSCALL_DEFINE1(nice, int, increment)
6116 { 6116 {
6117 long nice, retval; 6117 long nice, retval;
6118 6118
6119 /* 6119 /*
6120 * Setpriority might change our priority at the same moment. 6120 * Setpriority might change our priority at the same moment.
6121 * We don't have to worry. Conceptually one call occurs first 6121 * We don't have to worry. Conceptually one call occurs first
6122 * and we have a single winner. 6122 * and we have a single winner.
6123 */ 6123 */
6124 if (increment < -40) 6124 if (increment < -40)
6125 increment = -40; 6125 increment = -40;
6126 if (increment > 40) 6126 if (increment > 40)
6127 increment = 40; 6127 increment = 40;
6128 6128
6129 nice = TASK_NICE(current) + increment; 6129 nice = TASK_NICE(current) + increment;
6130 if (nice < -20) 6130 if (nice < -20)
6131 nice = -20; 6131 nice = -20;
6132 if (nice > 19) 6132 if (nice > 19)
6133 nice = 19; 6133 nice = 19;
6134 6134
6135 if (increment < 0 && !can_nice(current, nice)) 6135 if (increment < 0 && !can_nice(current, nice))
6136 return -EPERM; 6136 return -EPERM;
6137 6137
6138 retval = security_task_setnice(current, nice); 6138 retval = security_task_setnice(current, nice);
6139 if (retval) 6139 if (retval)
6140 return retval; 6140 return retval;
6141 6141
6142 set_user_nice(current, nice); 6142 set_user_nice(current, nice);
6143 return 0; 6143 return 0;
6144 } 6144 }
6145 6145
6146 #endif 6146 #endif
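A userspace sketch of the clamping described above, using the glibc nice() wrapper; the increment is limited to [-40, 40] and the resulting nice value to [-20, 19].

	#include <errno.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int newnice;

		errno = 0;
		newnice = nice(5);	/* returns the new nice value; -1 with errno set on error */
		if (newnice == -1 && errno)
			perror("nice");
		else
			printf("nice value is now %d\n", newnice);
		return 0;
	}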
6147 6147
6148 /** 6148 /**
6149 * task_prio - return the priority value of a given task. 6149 * task_prio - return the priority value of a given task.
6150 * @p: the task in question. 6150 * @p: the task in question.
6151 * 6151 *
6152 * This is the priority value as seen by users in /proc. 6152 * This is the priority value as seen by users in /proc.
6153 * RT tasks are offset by -200. Normal tasks are centered 6153 * RT tasks are offset by -200. Normal tasks are centered
6154 * around 0, value goes from -16 to +15. 6154 * around 0, value goes from -16 to +15.
6155 */ 6155 */
6156 int task_prio(const struct task_struct *p) 6156 int task_prio(const struct task_struct *p)
6157 { 6157 {
6158 return p->prio - MAX_RT_PRIO; 6158 return p->prio - MAX_RT_PRIO;
6159 } 6159 }
6160 6160
6161 /** 6161 /**
6162 * task_nice - return the nice value of a given task. 6162 * task_nice - return the nice value of a given task.
6163 * @p: the task in question. 6163 * @p: the task in question.
6164 */ 6164 */
6165 int task_nice(const struct task_struct *p) 6165 int task_nice(const struct task_struct *p)
6166 { 6166 {
6167 return TASK_NICE(p); 6167 return TASK_NICE(p);
6168 } 6168 }
6169 EXPORT_SYMBOL(task_nice); 6169 EXPORT_SYMBOL(task_nice);
6170 6170
6171 /** 6171 /**
6172 * idle_cpu - is a given cpu idle currently? 6172 * idle_cpu - is a given cpu idle currently?
6173 * @cpu: the processor in question. 6173 * @cpu: the processor in question.
6174 */ 6174 */
6175 int idle_cpu(int cpu) 6175 int idle_cpu(int cpu)
6176 { 6176 {
6177 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 6177 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
6178 } 6178 }
6179 6179
6180 /** 6180 /**
6181 * idle_task - return the idle task for a given cpu. 6181 * idle_task - return the idle task for a given cpu.
6182 * @cpu: the processor in question. 6182 * @cpu: the processor in question.
6183 */ 6183 */
6184 struct task_struct *idle_task(int cpu) 6184 struct task_struct *idle_task(int cpu)
6185 { 6185 {
6186 return cpu_rq(cpu)->idle; 6186 return cpu_rq(cpu)->idle;
6187 } 6187 }
6188 6188
6189 /** 6189 /**
6190 * find_process_by_pid - find a process with a matching PID value. 6190 * find_process_by_pid - find a process with a matching PID value.
6191 * @pid: the pid in question. 6191 * @pid: the pid in question.
6192 */ 6192 */
6193 static struct task_struct *find_process_by_pid(pid_t pid) 6193 static struct task_struct *find_process_by_pid(pid_t pid)
6194 { 6194 {
6195 return pid ? find_task_by_vpid(pid) : current; 6195 return pid ? find_task_by_vpid(pid) : current;
6196 } 6196 }
6197 6197
6198 /* Actually do priority change: must hold rq lock. */ 6198 /* Actually do priority change: must hold rq lock. */
6199 static void 6199 static void
6200 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 6200 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
6201 { 6201 {
6202 BUG_ON(p->se.on_rq); 6202 BUG_ON(p->se.on_rq);
6203 6203
6204 p->policy = policy; 6204 p->policy = policy;
6205 p->rt_priority = prio; 6205 p->rt_priority = prio;
6206 p->normal_prio = normal_prio(p); 6206 p->normal_prio = normal_prio(p);
6207 /* we are holding p->pi_lock already */ 6207 /* we are holding p->pi_lock already */
6208 p->prio = rt_mutex_getprio(p); 6208 p->prio = rt_mutex_getprio(p);
6209 if (rt_prio(p->prio)) 6209 if (rt_prio(p->prio))
6210 p->sched_class = &rt_sched_class; 6210 p->sched_class = &rt_sched_class;
6211 else 6211 else
6212 p->sched_class = &fair_sched_class; 6212 p->sched_class = &fair_sched_class;
6213 set_load_weight(p); 6213 set_load_weight(p);
6214 } 6214 }
6215 6215
6216 /* 6216 /*
6217 * check the target process has a UID that matches the current process's 6217 * check the target process has a UID that matches the current process's
6218 */ 6218 */
6219 static bool check_same_owner(struct task_struct *p) 6219 static bool check_same_owner(struct task_struct *p)
6220 { 6220 {
6221 const struct cred *cred = current_cred(), *pcred; 6221 const struct cred *cred = current_cred(), *pcred;
6222 bool match; 6222 bool match;
6223 6223
6224 rcu_read_lock(); 6224 rcu_read_lock();
6225 pcred = __task_cred(p); 6225 pcred = __task_cred(p);
6226 match = (cred->euid == pcred->euid || 6226 match = (cred->euid == pcred->euid ||
6227 cred->euid == pcred->uid); 6227 cred->euid == pcred->uid);
6228 rcu_read_unlock(); 6228 rcu_read_unlock();
6229 return match; 6229 return match;
6230 } 6230 }
6231 6231
6232 static int __sched_setscheduler(struct task_struct *p, int policy, 6232 static int __sched_setscheduler(struct task_struct *p, int policy,
6233 struct sched_param *param, bool user) 6233 struct sched_param *param, bool user)
6234 { 6234 {
6235 int retval, oldprio, oldpolicy = -1, on_rq, running; 6235 int retval, oldprio, oldpolicy = -1, on_rq, running;
6236 unsigned long flags; 6236 unsigned long flags;
6237 const struct sched_class *prev_class = p->sched_class; 6237 const struct sched_class *prev_class = p->sched_class;
6238 struct rq *rq; 6238 struct rq *rq;
6239 int reset_on_fork; 6239 int reset_on_fork;
6240 6240
6241 /* may grab non-irq protected spin_locks */ 6241 /* may grab non-irq protected spin_locks */
6242 BUG_ON(in_interrupt()); 6242 BUG_ON(in_interrupt());
6243 recheck: 6243 recheck:
6244 /* double check policy once rq lock held */ 6244 /* double check policy once rq lock held */
6245 if (policy < 0) { 6245 if (policy < 0) {
6246 reset_on_fork = p->sched_reset_on_fork; 6246 reset_on_fork = p->sched_reset_on_fork;
6247 policy = oldpolicy = p->policy; 6247 policy = oldpolicy = p->policy;
6248 } else { 6248 } else {
6249 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 6249 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
6250 policy &= ~SCHED_RESET_ON_FORK; 6250 policy &= ~SCHED_RESET_ON_FORK;
6251 6251
6252 if (policy != SCHED_FIFO && policy != SCHED_RR && 6252 if (policy != SCHED_FIFO && policy != SCHED_RR &&
6253 policy != SCHED_NORMAL && policy != SCHED_BATCH && 6253 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
6254 policy != SCHED_IDLE) 6254 policy != SCHED_IDLE)
6255 return -EINVAL; 6255 return -EINVAL;
6256 } 6256 }
6257 6257
6258 /* 6258 /*
6259 * Valid priorities for SCHED_FIFO and SCHED_RR are 6259 * Valid priorities for SCHED_FIFO and SCHED_RR are
6260 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 6260 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
6261 * SCHED_BATCH and SCHED_IDLE is 0. 6261 * SCHED_BATCH and SCHED_IDLE is 0.
6262 */ 6262 */
6263 if (param->sched_priority < 0 || 6263 if (param->sched_priority < 0 ||
6264 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 6264 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
6265 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 6265 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
6266 return -EINVAL; 6266 return -EINVAL;
6267 if (rt_policy(policy) != (param->sched_priority != 0)) 6267 if (rt_policy(policy) != (param->sched_priority != 0))
6268 return -EINVAL; 6268 return -EINVAL;
6269 6269
6270 /* 6270 /*
6271 * Allow unprivileged RT tasks to decrease priority: 6271 * Allow unprivileged RT tasks to decrease priority:
6272 */ 6272 */
6273 if (user && !capable(CAP_SYS_NICE)) { 6273 if (user && !capable(CAP_SYS_NICE)) {
6274 if (rt_policy(policy)) { 6274 if (rt_policy(policy)) {
6275 unsigned long rlim_rtprio; 6275 unsigned long rlim_rtprio;
6276 6276
6277 if (!lock_task_sighand(p, &flags)) 6277 if (!lock_task_sighand(p, &flags))
6278 return -ESRCH; 6278 return -ESRCH;
6279 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 6279 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
6280 unlock_task_sighand(p, &flags); 6280 unlock_task_sighand(p, &flags);
6281 6281
6282 /* can't set/change the rt policy */ 6282 /* can't set/change the rt policy */
6283 if (policy != p->policy && !rlim_rtprio) 6283 if (policy != p->policy && !rlim_rtprio)
6284 return -EPERM; 6284 return -EPERM;
6285 6285
6286 /* can't increase priority */ 6286 /* can't increase priority */
6287 if (param->sched_priority > p->rt_priority && 6287 if (param->sched_priority > p->rt_priority &&
6288 param->sched_priority > rlim_rtprio) 6288 param->sched_priority > rlim_rtprio)
6289 return -EPERM; 6289 return -EPERM;
6290 } 6290 }
6291 /* 6291 /*
6292 * Like positive nice levels, don't allow tasks to 6292 * Like positive nice levels, don't allow tasks to
6293 * move out of SCHED_IDLE either: 6293 * move out of SCHED_IDLE either:
6294 */ 6294 */
6295 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 6295 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
6296 return -EPERM; 6296 return -EPERM;
6297 6297
6298 /* can't change other user's priorities */ 6298 /* can't change other user's priorities */
6299 if (!check_same_owner(p)) 6299 if (!check_same_owner(p))
6300 return -EPERM; 6300 return -EPERM;
6301 6301
6302 /* Normal users shall not reset the sched_reset_on_fork flag */ 6302 /* Normal users shall not reset the sched_reset_on_fork flag */
6303 if (p->sched_reset_on_fork && !reset_on_fork) 6303 if (p->sched_reset_on_fork && !reset_on_fork)
6304 return -EPERM; 6304 return -EPERM;
6305 } 6305 }
6306 6306
6307 if (user) { 6307 if (user) {
6308 #ifdef CONFIG_RT_GROUP_SCHED 6308 #ifdef CONFIG_RT_GROUP_SCHED
6309 /* 6309 /*
6310 * Do not allow realtime tasks into groups that have no runtime 6310 * Do not allow realtime tasks into groups that have no runtime
6311 * assigned. 6311 * assigned.
6312 */ 6312 */
6313 if (rt_bandwidth_enabled() && rt_policy(policy) && 6313 if (rt_bandwidth_enabled() && rt_policy(policy) &&
6314 task_group(p)->rt_bandwidth.rt_runtime == 0) 6314 task_group(p)->rt_bandwidth.rt_runtime == 0)
6315 return -EPERM; 6315 return -EPERM;
6316 #endif 6316 #endif
6317 6317
6318 retval = security_task_setscheduler(p, policy, param); 6318 retval = security_task_setscheduler(p, policy, param);
6319 if (retval) 6319 if (retval)
6320 return retval; 6320 return retval;
6321 } 6321 }
6322 6322
6323 /* 6323 /*
6324 * make sure no PI-waiters arrive (or leave) while we are 6324 * make sure no PI-waiters arrive (or leave) while we are
6325 * changing the priority of the task: 6325 * changing the priority of the task:
6326 */ 6326 */
6327 spin_lock_irqsave(&p->pi_lock, flags); 6327 spin_lock_irqsave(&p->pi_lock, flags);
6328 /* 6328 /*
6329 * To be able to change p->policy safely, the appropriate 6329 * To be able to change p->policy safely, the appropriate
6330 * runqueue lock must be held. 6330 * runqueue lock must be held.
6331 */ 6331 */
6332 rq = __task_rq_lock(p); 6332 rq = __task_rq_lock(p);
6333 /* recheck policy now with rq lock held */ 6333 /* recheck policy now with rq lock held */
6334 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 6334 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
6335 policy = oldpolicy = -1; 6335 policy = oldpolicy = -1;
6336 __task_rq_unlock(rq); 6336 __task_rq_unlock(rq);
6337 spin_unlock_irqrestore(&p->pi_lock, flags); 6337 spin_unlock_irqrestore(&p->pi_lock, flags);
6338 goto recheck; 6338 goto recheck;
6339 } 6339 }
6340 update_rq_clock(rq); 6340 update_rq_clock(rq);
6341 on_rq = p->se.on_rq; 6341 on_rq = p->se.on_rq;
6342 running = task_current(rq, p); 6342 running = task_current(rq, p);
6343 if (on_rq) 6343 if (on_rq)
6344 deactivate_task(rq, p, 0); 6344 deactivate_task(rq, p, 0);
6345 if (running) 6345 if (running)
6346 p->sched_class->put_prev_task(rq, p); 6346 p->sched_class->put_prev_task(rq, p);
6347 6347
6348 p->sched_reset_on_fork = reset_on_fork; 6348 p->sched_reset_on_fork = reset_on_fork;
6349 6349
6350 oldprio = p->prio; 6350 oldprio = p->prio;
6351 __setscheduler(rq, p, policy, param->sched_priority); 6351 __setscheduler(rq, p, policy, param->sched_priority);
6352 6352
6353 if (running) 6353 if (running)
6354 p->sched_class->set_curr_task(rq); 6354 p->sched_class->set_curr_task(rq);
6355 if (on_rq) { 6355 if (on_rq) {
6356 activate_task(rq, p, 0); 6356 activate_task(rq, p, 0);
6357 6357
6358 check_class_changed(rq, p, prev_class, oldprio, running); 6358 check_class_changed(rq, p, prev_class, oldprio, running);
6359 } 6359 }
6360 __task_rq_unlock(rq); 6360 __task_rq_unlock(rq);
6361 spin_unlock_irqrestore(&p->pi_lock, flags); 6361 spin_unlock_irqrestore(&p->pi_lock, flags);
6362 6362
6363 rt_mutex_adjust_pi(p); 6363 rt_mutex_adjust_pi(p);
6364 6364
6365 return 0; 6365 return 0;
6366 } 6366 }
6367 6367
6368 /** 6368 /**
6369 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 6369 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
6370 * @p: the task in question. 6370 * @p: the task in question.
6371 * @policy: new policy. 6371 * @policy: new policy.
6372 * @param: structure containing the new RT priority. 6372 * @param: structure containing the new RT priority.
6373 * 6373 *
6374 * NOTE that the task may already be dead. 6374 * NOTE that the task may already be dead.
6375 */ 6375 */
6376 int sched_setscheduler(struct task_struct *p, int policy, 6376 int sched_setscheduler(struct task_struct *p, int policy,
6377 struct sched_param *param) 6377 struct sched_param *param)
6378 { 6378 {
6379 return __sched_setscheduler(p, policy, param, true); 6379 return __sched_setscheduler(p, policy, param, true);
6380 } 6380 }
6381 EXPORT_SYMBOL_GPL(sched_setscheduler); 6381 EXPORT_SYMBOL_GPL(sched_setscheduler);
6382 6382
6383 /** 6383 /**
6384 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 6384 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
6385 * @p: the task in question. 6385 * @p: the task in question.
6386 * @policy: new policy. 6386 * @policy: new policy.
6387 * @param: structure containing the new RT priority. 6387 * @param: structure containing the new RT priority.
6388 * 6388 *
6389 * Just like sched_setscheduler, only don't bother checking if the 6389 * Just like sched_setscheduler, only don't bother checking if the
6390 * current context has permission. For example, this is needed in 6390 * current context has permission. For example, this is needed in
6391 * stop_machine(): we create temporary high priority worker threads, 6391 * stop_machine(): we create temporary high priority worker threads,
6392 * but our caller might not have that capability. 6392 * but our caller might not have that capability.
6393 */ 6393 */
6394 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 6394 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
6395 struct sched_param *param) 6395 struct sched_param *param)
6396 { 6396 {
6397 return __sched_setscheduler(p, policy, param, false); 6397 return __sched_setscheduler(p, policy, param, false);
6398 } 6398 }
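A sketch of the in-kernel caller pattern the comment above describes, e.g. promoting a worker thread to SCHED_FIFO; tsk is a hypothetical struct task_struct pointer and priority 50 is an arbitrary choice for the example.

	struct sched_param sp = { .sched_priority = 50 };
	int err;

	err = sched_setscheduler_nocheck(tsk, SCHED_FIFO, &sp);
	if (err)
		pr_err("failed to make %s realtime: %d\n", tsk->comm, err);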
6399 6399
6400 static int 6400 static int
6401 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 6401 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
6402 { 6402 {
6403 struct sched_param lparam; 6403 struct sched_param lparam;
6404 struct task_struct *p; 6404 struct task_struct *p;
6405 int retval; 6405 int retval;
6406 6406
6407 if (!param || pid < 0) 6407 if (!param || pid < 0)
6408 return -EINVAL; 6408 return -EINVAL;
6409 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 6409 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
6410 return -EFAULT; 6410 return -EFAULT;
6411 6411
6412 rcu_read_lock(); 6412 rcu_read_lock();
6413 retval = -ESRCH; 6413 retval = -ESRCH;
6414 p = find_process_by_pid(pid); 6414 p = find_process_by_pid(pid);
6415 if (p != NULL) 6415 if (p != NULL)
6416 retval = sched_setscheduler(p, policy, &lparam); 6416 retval = sched_setscheduler(p, policy, &lparam);
6417 rcu_read_unlock(); 6417 rcu_read_unlock();
6418 6418
6419 return retval; 6419 return retval;
6420 } 6420 }
6421 6421
6422 /** 6422 /**
6423 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 6423 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
6424 * @pid: the pid in question. 6424 * @pid: the pid in question.
6425 * @policy: new policy. 6425 * @policy: new policy.
6426 * @param: structure containing the new RT priority. 6426 * @param: structure containing the new RT priority.
6427 */ 6427 */
6428 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 6428 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
6429 struct sched_param __user *, param) 6429 struct sched_param __user *, param)
6430 { 6430 {
6431 /* negative values for policy are not valid */ 6431 /* negative values for policy are not valid */
6432 if (policy < 0) 6432 if (policy < 0)
6433 return -EINVAL; 6433 return -EINVAL;
6434 6434
6435 return do_sched_setscheduler(pid, policy, param); 6435 return do_sched_setscheduler(pid, policy, param);
6436 } 6436 }
6437 6437
6438 /** 6438 /**
6439 * sys_sched_setparam - set/change the RT priority of a thread 6439 * sys_sched_setparam - set/change the RT priority of a thread
6440 * @pid: the pid in question. 6440 * @pid: the pid in question.
6441 * @param: structure containing the new RT priority. 6441 * @param: structure containing the new RT priority.
6442 */ 6442 */
6443 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 6443 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
6444 { 6444 {
6445 return do_sched_setscheduler(pid, -1, param); 6445 return do_sched_setscheduler(pid, -1, param);
6446 } 6446 }
6447 6447
6448 /** 6448 /**
6449 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 6449 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
6450 * @pid: the pid in question. 6450 * @pid: the pid in question.
6451 */ 6451 */
6452 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 6452 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6453 { 6453 {
6454 struct task_struct *p; 6454 struct task_struct *p;
6455 int retval; 6455 int retval;
6456 6456
6457 if (pid < 0) 6457 if (pid < 0)
6458 return -EINVAL; 6458 return -EINVAL;
6459 6459
6460 retval = -ESRCH; 6460 retval = -ESRCH;
6461 rcu_read_lock(); 6461 rcu_read_lock();
6462 p = find_process_by_pid(pid); 6462 p = find_process_by_pid(pid);
6463 if (p) { 6463 if (p) {
6464 retval = security_task_getscheduler(p); 6464 retval = security_task_getscheduler(p);
6465 if (!retval) 6465 if (!retval)
6466 retval = p->policy 6466 retval = p->policy
6467 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 6467 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6468 } 6468 }
6469 rcu_read_unlock(); 6469 rcu_read_unlock();
6470 return retval; 6470 return retval;
6471 } 6471 }
6472 6472
6473 /** 6473 /**
6474 * sys_sched_getparam - get the RT priority of a thread 6474 * sys_sched_getparam - get the RT priority of a thread
6475 * @pid: the pid in question. 6475 * @pid: the pid in question.
6476 * @param: structure containing the RT priority. 6476 * @param: structure containing the RT priority.
6477 */ 6477 */
6478 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 6478 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6479 { 6479 {
6480 struct sched_param lp; 6480 struct sched_param lp;
6481 struct task_struct *p; 6481 struct task_struct *p;
6482 int retval; 6482 int retval;
6483 6483
6484 if (!param || pid < 0) 6484 if (!param || pid < 0)
6485 return -EINVAL; 6485 return -EINVAL;
6486 6486
6487 rcu_read_lock(); 6487 rcu_read_lock();
6488 p = find_process_by_pid(pid); 6488 p = find_process_by_pid(pid);
6489 retval = -ESRCH; 6489 retval = -ESRCH;
6490 if (!p) 6490 if (!p)
6491 goto out_unlock; 6491 goto out_unlock;
6492 6492
6493 retval = security_task_getscheduler(p); 6493 retval = security_task_getscheduler(p);
6494 if (retval) 6494 if (retval)
6495 goto out_unlock; 6495 goto out_unlock;
6496 6496
6497 lp.sched_priority = p->rt_priority; 6497 lp.sched_priority = p->rt_priority;
6498 rcu_read_unlock(); 6498 rcu_read_unlock();
6499 6499
6500 /* 6500 /*
6501 * This one might sleep; we cannot do it with a spinlock held ... 6501 * This one might sleep; we cannot do it with a spinlock held ...
6502 */ 6502 */
6503 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 6503 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
6504 6504
6505 return retval; 6505 return retval;
6506 6506
6507 out_unlock: 6507 out_unlock:
6508 rcu_read_unlock(); 6508 rcu_read_unlock();
6509 return retval; 6509 return retval;
6510 } 6510 }
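A userspace sketch querying both of the above for the calling process via the glibc wrappers sched_getscheduler() and sched_getparam().

	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		struct sched_param sp;
		int policy = sched_getscheduler(0);	/* 0 == calling process */

		if (policy == -1 || sched_getparam(0, &sp) == -1) {
			perror("sched_get*");
			return 1;
		}
		printf("policy=%d rt_priority=%d\n", policy, sp.sched_priority);
		return 0;
	}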
6511 6511
6512 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 6512 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
6513 { 6513 {
6514 cpumask_var_t cpus_allowed, new_mask; 6514 cpumask_var_t cpus_allowed, new_mask;
6515 struct task_struct *p; 6515 struct task_struct *p;
6516 int retval; 6516 int retval;
6517 6517
6518 get_online_cpus(); 6518 get_online_cpus();
6519 rcu_read_lock(); 6519 rcu_read_lock();
6520 6520
6521 p = find_process_by_pid(pid); 6521 p = find_process_by_pid(pid);
6522 if (!p) { 6522 if (!p) {
6523 rcu_read_unlock(); 6523 rcu_read_unlock();
6524 put_online_cpus(); 6524 put_online_cpus();
6525 return -ESRCH; 6525 return -ESRCH;
6526 } 6526 }
6527 6527
6528 /* Prevent p going away */ 6528 /* Prevent p going away */
6529 get_task_struct(p); 6529 get_task_struct(p);
6530 rcu_read_unlock(); 6530 rcu_read_unlock();
6531 6531
6532 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 6532 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
6533 retval = -ENOMEM; 6533 retval = -ENOMEM;
6534 goto out_put_task; 6534 goto out_put_task;
6535 } 6535 }
6536 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 6536 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
6537 retval = -ENOMEM; 6537 retval = -ENOMEM;
6538 goto out_free_cpus_allowed; 6538 goto out_free_cpus_allowed;
6539 } 6539 }
6540 retval = -EPERM; 6540 retval = -EPERM;
6541 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 6541 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
6542 goto out_unlock; 6542 goto out_unlock;
6543 6543
6544 retval = security_task_setscheduler(p, 0, NULL); 6544 retval = security_task_setscheduler(p, 0, NULL);
6545 if (retval) 6545 if (retval)
6546 goto out_unlock; 6546 goto out_unlock;
6547 6547
6548 cpuset_cpus_allowed(p, cpus_allowed); 6548 cpuset_cpus_allowed(p, cpus_allowed);
6549 cpumask_and(new_mask, in_mask, cpus_allowed); 6549 cpumask_and(new_mask, in_mask, cpus_allowed);
6550 again: 6550 again:
6551 retval = set_cpus_allowed_ptr(p, new_mask); 6551 retval = set_cpus_allowed_ptr(p, new_mask);
6552 6552
6553 if (!retval) { 6553 if (!retval) {
6554 cpuset_cpus_allowed(p, cpus_allowed); 6554 cpuset_cpus_allowed(p, cpus_allowed);
6555 if (!cpumask_subset(new_mask, cpus_allowed)) { 6555 if (!cpumask_subset(new_mask, cpus_allowed)) {
6556 /* 6556 /*
6557 * We must have raced with a concurrent cpuset 6557 * We must have raced with a concurrent cpuset
6558 * update. Just reset the cpus_allowed to the 6558 * update. Just reset the cpus_allowed to the
6559 * cpuset's cpus_allowed 6559 * cpuset's cpus_allowed
6560 */ 6560 */
6561 cpumask_copy(new_mask, cpus_allowed); 6561 cpumask_copy(new_mask, cpus_allowed);
6562 goto again; 6562 goto again;
6563 } 6563 }
6564 } 6564 }
6565 out_unlock: 6565 out_unlock:
6566 free_cpumask_var(new_mask); 6566 free_cpumask_var(new_mask);
6567 out_free_cpus_allowed: 6567 out_free_cpus_allowed:
6568 free_cpumask_var(cpus_allowed); 6568 free_cpumask_var(cpus_allowed);
6569 out_put_task: 6569 out_put_task:
6570 put_task_struct(p); 6570 put_task_struct(p);
6571 put_online_cpus(); 6571 put_online_cpus();
6572 return retval; 6572 return retval;
6573 } 6573 }
6574 6574
6575 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 6575 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
6576 struct cpumask *new_mask) 6576 struct cpumask *new_mask)
6577 { 6577 {
6578 if (len < cpumask_size()) 6578 if (len < cpumask_size())
6579 cpumask_clear(new_mask); 6579 cpumask_clear(new_mask);
6580 else if (len > cpumask_size()) 6580 else if (len > cpumask_size())
6581 len = cpumask_size(); 6581 len = cpumask_size();
6582 6582
6583 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 6583 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
6584 } 6584 }
6585 6585
6586 /** 6586 /**
6587 * sys_sched_setaffinity - set the cpu affinity of a process 6587 * sys_sched_setaffinity - set the cpu affinity of a process
6588 * @pid: pid of the process 6588 * @pid: pid of the process
6589 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 6589 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
6590 * @user_mask_ptr: user-space pointer to the new cpu mask 6590 * @user_mask_ptr: user-space pointer to the new cpu mask
6591 */ 6591 */
6592 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 6592 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6593 unsigned long __user *, user_mask_ptr) 6593 unsigned long __user *, user_mask_ptr)
6594 { 6594 {
6595 cpumask_var_t new_mask; 6595 cpumask_var_t new_mask;
6596 int retval; 6596 int retval;
6597 6597
6598 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 6598 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
6599 return -ENOMEM; 6599 return -ENOMEM;
6600 6600
6601 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 6601 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
6602 if (retval == 0) 6602 if (retval == 0)
6603 retval = sched_setaffinity(pid, new_mask); 6603 retval = sched_setaffinity(pid, new_mask);
6604 free_cpumask_var(new_mask); 6604 free_cpumask_var(new_mask);
6605 return retval; 6605 return retval;
6606 } 6606 }
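A userspace sketch of calling this syscall through the glibc wrapper, pinning the calling process to CPU 2 (an arbitrary choice for the example).

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		cpu_set_t set;

		CPU_ZERO(&set);
		CPU_SET(2, &set);
		if (sched_setaffinity(0, sizeof(set), &set) == -1) {
			perror("sched_setaffinity");
			return 1;
		}
		return 0;
	}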
6607 6607
6608 long sched_getaffinity(pid_t pid, struct cpumask *mask) 6608 long sched_getaffinity(pid_t pid, struct cpumask *mask)
6609 { 6609 {
6610 struct task_struct *p; 6610 struct task_struct *p;
6611 unsigned long flags; 6611 unsigned long flags;
6612 struct rq *rq; 6612 struct rq *rq;
6613 int retval; 6613 int retval;
6614 6614
6615 get_online_cpus(); 6615 get_online_cpus();
6616 rcu_read_lock(); 6616 rcu_read_lock();
6617 6617
6618 retval = -ESRCH; 6618 retval = -ESRCH;
6619 p = find_process_by_pid(pid); 6619 p = find_process_by_pid(pid);
6620 if (!p) 6620 if (!p)
6621 goto out_unlock; 6621 goto out_unlock;
6622 6622
6623 retval = security_task_getscheduler(p); 6623 retval = security_task_getscheduler(p);
6624 if (retval) 6624 if (retval)
6625 goto out_unlock; 6625 goto out_unlock;
6626 6626
6627 rq = task_rq_lock(p, &flags); 6627 rq = task_rq_lock(p, &flags);
6628 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 6628 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
6629 task_rq_unlock(rq, &flags); 6629 task_rq_unlock(rq, &flags);
6630 6630
6631 out_unlock: 6631 out_unlock:
6632 rcu_read_unlock(); 6632 rcu_read_unlock();
6633 put_online_cpus(); 6633 put_online_cpus();
6634 6634
6635 return retval; 6635 return retval;
6636 } 6636 }
6637 6637
6638 /** 6638 /**
6639 * sys_sched_getaffinity - get the cpu affinity of a process 6639 * sys_sched_getaffinity - get the cpu affinity of a process
6640 * @pid: pid of the process 6640 * @pid: pid of the process
6641 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 6641 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
6642 * @user_mask_ptr: user-space pointer to hold the current cpu mask 6642 * @user_mask_ptr: user-space pointer to hold the current cpu mask
6643 */ 6643 */
6644 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 6644 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6645 unsigned long __user *, user_mask_ptr) 6645 unsigned long __user *, user_mask_ptr)
6646 { 6646 {
6647 int ret; 6647 int ret;
6648 cpumask_var_t mask; 6648 cpumask_var_t mask;
6649 6649
6650 if (len < cpumask_size()) 6650 if (len < cpumask_size())
6651 return -EINVAL; 6651 return -EINVAL;
6652 6652
6653 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 6653 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
6654 return -ENOMEM; 6654 return -ENOMEM;
6655 6655
6656 ret = sched_getaffinity(pid, mask); 6656 ret = sched_getaffinity(pid, mask);
6657 if (ret == 0) { 6657 if (ret == 0) {
6658 if (copy_to_user(user_mask_ptr, mask, cpumask_size())) 6658 if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
6659 ret = -EFAULT; 6659 ret = -EFAULT;
6660 else 6660 else
6661 ret = cpumask_size(); 6661 ret = cpumask_size();
6662 } 6662 }
6663 free_cpumask_var(mask); 6663 free_cpumask_var(mask);
6664 6664
6665 return ret; 6665 return ret;
6666 } 6666 }
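The read-back counterpart, sketched with the glibc wrapper: print every CPU the calling process is currently allowed to run on.

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		cpu_set_t set;
		int cpu;

		if (sched_getaffinity(0, sizeof(set), &set) == -1) {
			perror("sched_getaffinity");
			return 1;
		}
		for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
			if (CPU_ISSET(cpu, &set))
				printf("cpu %d allowed\n", cpu);
		return 0;
	}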
6667 6667
6668 /** 6668 /**
6669 * sys_sched_yield - yield the current processor to other threads. 6669 * sys_sched_yield - yield the current processor to other threads.
6670 * 6670 *
6671 * This function yields the current CPU to other tasks. If there are no 6671 * This function yields the current CPU to other tasks. If there are no
6672 * other threads running on this CPU then this function will return. 6672 * other threads running on this CPU then this function will return.
6673 */ 6673 */
6674 SYSCALL_DEFINE0(sched_yield) 6674 SYSCALL_DEFINE0(sched_yield)
6675 { 6675 {
6676 struct rq *rq = this_rq_lock(); 6676 struct rq *rq = this_rq_lock();
6677 6677
6678 schedstat_inc(rq, yld_count); 6678 schedstat_inc(rq, yld_count);
6679 current->sched_class->yield_task(rq); 6679 current->sched_class->yield_task(rq);
6680 6680
6681 /* 6681 /*
6682 * Since we are going to call schedule() anyway, there's 6682 * Since we are going to call schedule() anyway, there's
6683 * no need to preempt or enable interrupts: 6683 * no need to preempt or enable interrupts:
6684 */ 6684 */
6685 __release(rq->lock); 6685 __release(rq->lock);
6686 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 6686 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
6687 _raw_spin_unlock(&rq->lock); 6687 _raw_spin_unlock(&rq->lock);
6688 preempt_enable_no_resched(); 6688 preempt_enable_no_resched();
6689 6689
6690 schedule(); 6690 schedule();
6691 6691
6692 return 0; 6692 return 0;
6693 } 6693 }
6694 6694
6695 static inline int should_resched(void) 6695 static inline int should_resched(void)
6696 { 6696 {
6697 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); 6697 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
6698 } 6698 }
6699 6699
6700 static void __cond_resched(void) 6700 static void __cond_resched(void)
6701 { 6701 {
6702 add_preempt_count(PREEMPT_ACTIVE); 6702 add_preempt_count(PREEMPT_ACTIVE);
6703 schedule(); 6703 schedule();
6704 sub_preempt_count(PREEMPT_ACTIVE); 6704 sub_preempt_count(PREEMPT_ACTIVE);
6705 } 6705 }
6706 6706
6707 int __sched _cond_resched(void) 6707 int __sched _cond_resched(void)
6708 { 6708 {
6709 if (should_resched()) { 6709 if (should_resched()) {
6710 __cond_resched(); 6710 __cond_resched();
6711 return 1; 6711 return 1;
6712 } 6712 }
6713 return 0; 6713 return 0;
6714 } 6714 }
6715 EXPORT_SYMBOL(_cond_resched); 6715 EXPORT_SYMBOL(_cond_resched);
6716 6716
6717 /* 6717 /*
6718 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 6718 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
6719 * call schedule, and on return reacquire the lock. 6719 * call schedule, and on return reacquire the lock.
6720 * 6720 *
6721 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 6721 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
6722 * operations here to prevent schedule() from being called twice (once via 6722 * operations here to prevent schedule() from being called twice (once via
6723 * spin_unlock(), once by hand). 6723 * spin_unlock(), once by hand).
6724 */ 6724 */
6725 int __cond_resched_lock(spinlock_t *lock) 6725 int __cond_resched_lock(spinlock_t *lock)
6726 { 6726 {
6727 int resched = should_resched(); 6727 int resched = should_resched();
6728 int ret = 0; 6728 int ret = 0;
6729 6729
6730 lockdep_assert_held(lock); 6730 lockdep_assert_held(lock);
6731 6731
6732 if (spin_needbreak(lock) || resched) { 6732 if (spin_needbreak(lock) || resched) {
6733 spin_unlock(lock); 6733 spin_unlock(lock);
6734 if (resched) 6734 if (resched)
6735 __cond_resched(); 6735 __cond_resched();
6736 else 6736 else
6737 cpu_relax(); 6737 cpu_relax();
6738 ret = 1; 6738 ret = 1;
6739 spin_lock(lock); 6739 spin_lock(lock);
6740 } 6740 }
6741 return ret; 6741 return ret;
6742 } 6742 }
6743 EXPORT_SYMBOL(__cond_resched_lock); 6743 EXPORT_SYMBOL(__cond_resched_lock);
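A sketch of the caller-side pattern, via the cond_resched_lock() wrapper around the helper above; table_lock, nr_entries, table and process_entry() are hypothetical, and the loop body must tolerate the lock having been dropped and retaken.

	int i;

	spin_lock(&table_lock);
	for (i = 0; i < nr_entries; i++) {
		process_entry(&table[i]);
		cond_resched_lock(&table_lock);	/* may unlock, schedule, relock */
	}
	spin_unlock(&table_lock);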
6744 6744
6745 int __sched __cond_resched_softirq(void) 6745 int __sched __cond_resched_softirq(void)
6746 { 6746 {
6747 BUG_ON(!in_softirq()); 6747 BUG_ON(!in_softirq());
6748 6748
6749 if (should_resched()) { 6749 if (should_resched()) {
6750 local_bh_enable(); 6750 local_bh_enable();
6751 __cond_resched(); 6751 __cond_resched();
6752 local_bh_disable(); 6752 local_bh_disable();
6753 return 1; 6753 return 1;
6754 } 6754 }
6755 return 0; 6755 return 0;
6756 } 6756 }
6757 EXPORT_SYMBOL(__cond_resched_softirq); 6757 EXPORT_SYMBOL(__cond_resched_softirq);
6758 6758
6759 /** 6759 /**
6760 * yield - yield the current processor to other threads. 6760 * yield - yield the current processor to other threads.
6761 * 6761 *
6762 * This is a shortcut for kernel-space yielding - it marks the 6762 * This is a shortcut for kernel-space yielding - it marks the
6763 * thread runnable and calls sys_sched_yield(). 6763 * thread runnable and calls sys_sched_yield().
6764 */ 6764 */
6765 void __sched yield(void) 6765 void __sched yield(void)
6766 { 6766 {
6767 set_current_state(TASK_RUNNING); 6767 set_current_state(TASK_RUNNING);
6768 sys_sched_yield(); 6768 sys_sched_yield();
6769 } 6769 }
6770 EXPORT_SYMBOL(yield); 6770 EXPORT_SYMBOL(yield);
6771 6771
6772 /* 6772 /*
6773 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 6773 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
6774 * that process accounting knows that this is a task in IO wait state. 6774 * that process accounting knows that this is a task in IO wait state.
6775 */ 6775 */
6776 void __sched io_schedule(void) 6776 void __sched io_schedule(void)
6777 { 6777 {
6778 struct rq *rq = raw_rq(); 6778 struct rq *rq = raw_rq();
6779 6779
6780 delayacct_blkio_start(); 6780 delayacct_blkio_start();
6781 atomic_inc(&rq->nr_iowait); 6781 atomic_inc(&rq->nr_iowait);
6782 current->in_iowait = 1; 6782 current->in_iowait = 1;
6783 schedule(); 6783 schedule();
6784 current->in_iowait = 0; 6784 current->in_iowait = 0;
6785 atomic_dec(&rq->nr_iowait); 6785 atomic_dec(&rq->nr_iowait);
6786 delayacct_blkio_end(); 6786 delayacct_blkio_end();
6787 } 6787 }
6788 EXPORT_SYMBOL(io_schedule); 6788 EXPORT_SYMBOL(io_schedule);
6789 6789
6790 long __sched io_schedule_timeout(long timeout) 6790 long __sched io_schedule_timeout(long timeout)
6791 { 6791 {
6792 struct rq *rq = raw_rq(); 6792 struct rq *rq = raw_rq();
6793 long ret; 6793 long ret;
6794 6794
6795 delayacct_blkio_start(); 6795 delayacct_blkio_start();
6796 atomic_inc(&rq->nr_iowait); 6796 atomic_inc(&rq->nr_iowait);
6797 current->in_iowait = 1; 6797 current->in_iowait = 1;
6798 ret = schedule_timeout(timeout); 6798 ret = schedule_timeout(timeout);
6799 current->in_iowait = 0; 6799 current->in_iowait = 0;
6800 atomic_dec(&rq->nr_iowait); 6800 atomic_dec(&rq->nr_iowait);
6801 delayacct_blkio_end(); 6801 delayacct_blkio_end();
6802 return ret; 6802 return ret;
6803 } 6803 }
6804 6804
6805 /** 6805 /**
6806 * sys_sched_get_priority_max - return maximum RT priority. 6806 * sys_sched_get_priority_max - return maximum RT priority.
6807 * @policy: scheduling class. 6807 * @policy: scheduling class.
6808 * 6808 *
6809 * this syscall returns the maximum rt_priority that can be used 6809 * this syscall returns the maximum rt_priority that can be used
6810 * by a given scheduling class. 6810 * by a given scheduling class.
6811 */ 6811 */
6812 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 6812 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
6813 { 6813 {
6814 int ret = -EINVAL; 6814 int ret = -EINVAL;
6815 6815
6816 switch (policy) { 6816 switch (policy) {
6817 case SCHED_FIFO: 6817 case SCHED_FIFO:
6818 case SCHED_RR: 6818 case SCHED_RR:
6819 ret = MAX_USER_RT_PRIO-1; 6819 ret = MAX_USER_RT_PRIO-1;
6820 break; 6820 break;
6821 case SCHED_NORMAL: 6821 case SCHED_NORMAL:
6822 case SCHED_BATCH: 6822 case SCHED_BATCH:
6823 case SCHED_IDLE: 6823 case SCHED_IDLE:
6824 ret = 0; 6824 ret = 0;
6825 break; 6825 break;
6826 } 6826 }
6827 return ret; 6827 return ret;
6828 } 6828 }
6829 6829
6830 /** 6830 /**
6831 * sys_sched_get_priority_min - return minimum RT priority. 6831 * sys_sched_get_priority_min - return minimum RT priority.
6832 * @policy: scheduling class. 6832 * @policy: scheduling class.
6833 * 6833 *
6834 * this syscall returns the minimum rt_priority that can be used 6834 * this syscall returns the minimum rt_priority that can be used
6835 * by a given scheduling class. 6835 * by a given scheduling class.
6836 */ 6836 */
6837 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 6837 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
6838 { 6838 {
6839 int ret = -EINVAL; 6839 int ret = -EINVAL;
6840 6840
6841 switch (policy) { 6841 switch (policy) {
6842 case SCHED_FIFO: 6842 case SCHED_FIFO:
6843 case SCHED_RR: 6843 case SCHED_RR:
6844 ret = 1; 6844 ret = 1;
6845 break; 6845 break;
6846 case SCHED_NORMAL: 6846 case SCHED_NORMAL:
6847 case SCHED_BATCH: 6847 case SCHED_BATCH:
6848 case SCHED_IDLE: 6848 case SCHED_IDLE:
6849 ret = 0; 6849 ret = 0;
6850 } 6850 }
6851 return ret; 6851 return ret;
6852 } 6852 }
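A userspace sketch using the matching glibc wrappers to discover the valid range; on Linux this typically prints 1..99 for SCHED_FIFO, consistent with the switch statements above.

	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		int min = sched_get_priority_min(SCHED_FIFO);
		int max = sched_get_priority_max(SCHED_FIFO);

		printf("SCHED_FIFO priorities: %d..%d\n", min, max);
		return 0;
	}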
6853 6853
6854 /** 6854 /**
6855 * sys_sched_rr_get_interval - return the default timeslice of a process. 6855 * sys_sched_rr_get_interval - return the default timeslice of a process.
6856 * @pid: pid of the process. 6856 * @pid: pid of the process.
6857 * @interval: userspace pointer to the timeslice value. 6857 * @interval: userspace pointer to the timeslice value.
6858 * 6858 *
6859 * this syscall writes the default timeslice value of a given process 6859 * this syscall writes the default timeslice value of a given process
6860 * into the user-space timespec buffer. A value of '0' means infinity. 6860 * into the user-space timespec buffer. A value of '0' means infinity.
6861 */ 6861 */
6862 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 6862 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6863 struct timespec __user *, interval) 6863 struct timespec __user *, interval)
6864 { 6864 {
6865 struct task_struct *p; 6865 struct task_struct *p;
6866 unsigned int time_slice; 6866 unsigned int time_slice;
6867 unsigned long flags; 6867 unsigned long flags;
6868 struct rq *rq; 6868 struct rq *rq;
6869 int retval; 6869 int retval;
6870 struct timespec t; 6870 struct timespec t;
6871 6871
6872 if (pid < 0) 6872 if (pid < 0)
6873 return -EINVAL; 6873 return -EINVAL;
6874 6874
6875 retval = -ESRCH; 6875 retval = -ESRCH;
6876 read_lock(&tasklist_lock); 6876 rcu_read_lock();
6877 p = find_process_by_pid(pid); 6877 p = find_process_by_pid(pid);
6878 if (!p) 6878 if (!p)
6879 goto out_unlock; 6879 goto out_unlock;
6880 6880
6881 retval = security_task_getscheduler(p); 6881 retval = security_task_getscheduler(p);
6882 if (retval) 6882 if (retval)
6883 goto out_unlock; 6883 goto out_unlock;
6884 6884
6885 rq = task_rq_lock(p, &flags); 6885 rq = task_rq_lock(p, &flags);
6886 time_slice = p->sched_class->get_rr_interval(rq, p); 6886 time_slice = p->sched_class->get_rr_interval(rq, p);
6887 task_rq_unlock(rq, &flags); 6887 task_rq_unlock(rq, &flags);
6888 6888
6889 read_unlock(&tasklist_lock); 6889 rcu_read_unlock();
6890 jiffies_to_timespec(time_slice, &t); 6890 jiffies_to_timespec(time_slice, &t);
6891 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 6891 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
6892 return retval; 6892 return retval;
6893 6893
6894 out_unlock: 6894 out_unlock:
6895 read_unlock(&tasklist_lock); 6895 rcu_read_unlock();
6896 return retval; 6896 return retval;
6897 } 6897 }
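
The syscall above is the one this commit converts from tasklist_lock to RCU. A minimal user-space sketch (not part of this file) of the matching glibc wrapper; pid 0 means the calling thread, and a returned timespec of 0 means an infinite timeslice:

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) != 0) {
		perror("sched_rr_get_interval");
		return 1;
	}
	printf("round-robin timeslice: %ld.%09ld s\n",
	       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
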
6898 6898
6899 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 6899 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
6900 6900
6901 void sched_show_task(struct task_struct *p) 6901 void sched_show_task(struct task_struct *p)
6902 { 6902 {
6903 unsigned long free = 0; 6903 unsigned long free = 0;
6904 unsigned state; 6904 unsigned state;
6905 6905
6906 state = p->state ? __ffs(p->state) + 1 : 0; 6906 state = p->state ? __ffs(p->state) + 1 : 0;
6907 pr_info("%-13.13s %c", p->comm, 6907 pr_info("%-13.13s %c", p->comm,
6908 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 6908 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
6909 #if BITS_PER_LONG == 32 6909 #if BITS_PER_LONG == 32
6910 if (state == TASK_RUNNING) 6910 if (state == TASK_RUNNING)
6911 pr_cont(" running "); 6911 pr_cont(" running ");
6912 else 6912 else
6913 pr_cont(" %08lx ", thread_saved_pc(p)); 6913 pr_cont(" %08lx ", thread_saved_pc(p));
6914 #else 6914 #else
6915 if (state == TASK_RUNNING) 6915 if (state == TASK_RUNNING)
6916 pr_cont(" running task "); 6916 pr_cont(" running task ");
6917 else 6917 else
6918 pr_cont(" %016lx ", thread_saved_pc(p)); 6918 pr_cont(" %016lx ", thread_saved_pc(p));
6919 #endif 6919 #endif
6920 #ifdef CONFIG_DEBUG_STACK_USAGE 6920 #ifdef CONFIG_DEBUG_STACK_USAGE
6921 free = stack_not_used(p); 6921 free = stack_not_used(p);
6922 #endif 6922 #endif
6923 pr_cont("%5lu %5d %6d 0x%08lx\n", free, 6923 pr_cont("%5lu %5d %6d 0x%08lx\n", free,
6924 task_pid_nr(p), task_pid_nr(p->real_parent), 6924 task_pid_nr(p), task_pid_nr(p->real_parent),
6925 (unsigned long)task_thread_info(p)->flags); 6925 (unsigned long)task_thread_info(p)->flags);
6926 6926
6927 show_stack(p, NULL); 6927 show_stack(p, NULL);
6928 } 6928 }
6929 6929
6930 void show_state_filter(unsigned long state_filter) 6930 void show_state_filter(unsigned long state_filter)
6931 { 6931 {
6932 struct task_struct *g, *p; 6932 struct task_struct *g, *p;
6933 6933
6934 #if BITS_PER_LONG == 32 6934 #if BITS_PER_LONG == 32
6935 pr_info(" task PC stack pid father\n"); 6935 pr_info(" task PC stack pid father\n");
6936 #else 6936 #else
6937 pr_info(" task PC stack pid father\n"); 6937 pr_info(" task PC stack pid father\n");
6938 #endif 6938 #endif
6939 read_lock(&tasklist_lock); 6939 read_lock(&tasklist_lock);
6940 do_each_thread(g, p) { 6940 do_each_thread(g, p) {
6941 /* 6941 /*
6942 * reset the NMI-timeout, listing all files on a slow 6942 * reset the NMI-timeout, listing all files on a slow
 6943 * console might take a lot of time: 6943 * console might take a lot of time:
6944 */ 6944 */
6945 touch_nmi_watchdog(); 6945 touch_nmi_watchdog();
6946 if (!state_filter || (p->state & state_filter)) 6946 if (!state_filter || (p->state & state_filter))
6947 sched_show_task(p); 6947 sched_show_task(p);
6948 } while_each_thread(g, p); 6948 } while_each_thread(g, p);
6949 6949
6950 touch_all_softlockup_watchdogs(); 6950 touch_all_softlockup_watchdogs();
6951 6951
6952 #ifdef CONFIG_SCHED_DEBUG 6952 #ifdef CONFIG_SCHED_DEBUG
6953 sysrq_sched_debug_show(); 6953 sysrq_sched_debug_show();
6954 #endif 6954 #endif
6955 read_unlock(&tasklist_lock); 6955 read_unlock(&tasklist_lock);
6956 /* 6956 /*
6957 * Only show locks if all tasks are dumped: 6957 * Only show locks if all tasks are dumped:
6958 */ 6958 */
6959 if (!state_filter) 6959 if (!state_filter)
6960 debug_show_all_locks(); 6960 debug_show_all_locks();
6961 } 6961 }
6962 6962
6963 void __cpuinit init_idle_bootup_task(struct task_struct *idle) 6963 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
6964 { 6964 {
6965 idle->sched_class = &idle_sched_class; 6965 idle->sched_class = &idle_sched_class;
6966 } 6966 }
6967 6967
6968 /** 6968 /**
6969 * init_idle - set up an idle thread for a given CPU 6969 * init_idle - set up an idle thread for a given CPU
6970 * @idle: task in question 6970 * @idle: task in question
6971 * @cpu: cpu the idle task belongs to 6971 * @cpu: cpu the idle task belongs to
6972 * 6972 *
6973 * NOTE: this function does not set the idle thread's NEED_RESCHED 6973 * NOTE: this function does not set the idle thread's NEED_RESCHED
6974 * flag, to make booting more robust. 6974 * flag, to make booting more robust.
6975 */ 6975 */
6976 void __cpuinit init_idle(struct task_struct *idle, int cpu) 6976 void __cpuinit init_idle(struct task_struct *idle, int cpu)
6977 { 6977 {
6978 struct rq *rq = cpu_rq(cpu); 6978 struct rq *rq = cpu_rq(cpu);
6979 unsigned long flags; 6979 unsigned long flags;
6980 6980
6981 spin_lock_irqsave(&rq->lock, flags); 6981 spin_lock_irqsave(&rq->lock, flags);
6982 6982
6983 __sched_fork(idle); 6983 __sched_fork(idle);
6984 idle->se.exec_start = sched_clock(); 6984 idle->se.exec_start = sched_clock();
6985 6985
6986 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 6986 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
6987 __set_task_cpu(idle, cpu); 6987 __set_task_cpu(idle, cpu);
6988 6988
6989 rq->curr = rq->idle = idle; 6989 rq->curr = rq->idle = idle;
6990 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 6990 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
6991 idle->oncpu = 1; 6991 idle->oncpu = 1;
6992 #endif 6992 #endif
6993 spin_unlock_irqrestore(&rq->lock, flags); 6993 spin_unlock_irqrestore(&rq->lock, flags);
6994 6994
6995 /* Set the preempt count _outside_ the spinlocks! */ 6995 /* Set the preempt count _outside_ the spinlocks! */
6996 #if defined(CONFIG_PREEMPT) 6996 #if defined(CONFIG_PREEMPT)
6997 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); 6997 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
6998 #else 6998 #else
6999 task_thread_info(idle)->preempt_count = 0; 6999 task_thread_info(idle)->preempt_count = 0;
7000 #endif 7000 #endif
7001 /* 7001 /*
7002 * The idle tasks have their own, simple scheduling class: 7002 * The idle tasks have their own, simple scheduling class:
7003 */ 7003 */
7004 idle->sched_class = &idle_sched_class; 7004 idle->sched_class = &idle_sched_class;
7005 ftrace_graph_init_task(idle); 7005 ftrace_graph_init_task(idle);
7006 } 7006 }
7007 7007
7008 /* 7008 /*
7009 * In a system that switches off the HZ timer nohz_cpu_mask 7009 * In a system that switches off the HZ timer nohz_cpu_mask
7010 * indicates which cpus entered this state. This is used 7010 * indicates which cpus entered this state. This is used
7011 * in the rcu update to wait only for active cpus. For system 7011 * in the rcu update to wait only for active cpus. For system
7012 * which do not switch off the HZ timer nohz_cpu_mask should 7012 * which do not switch off the HZ timer nohz_cpu_mask should
7013 * always be CPU_BITS_NONE. 7013 * always be CPU_BITS_NONE.
7014 */ 7014 */
7015 cpumask_var_t nohz_cpu_mask; 7015 cpumask_var_t nohz_cpu_mask;
7016 7016
7017 /* 7017 /*
7018 * Increase the granularity value when there are more CPUs, 7018 * Increase the granularity value when there are more CPUs,
7019 * because with more CPUs the 'effective latency' as visible 7019 * because with more CPUs the 'effective latency' as visible
7020 * to users decreases. But the relationship is not linear, 7020 * to users decreases. But the relationship is not linear,
7021 * so pick a second-best guess by going with the log2 of the 7021 * so pick a second-best guess by going with the log2 of the
7022 * number of CPUs. 7022 * number of CPUs.
7023 * 7023 *
7024 * This idea comes from the SD scheduler of Con Kolivas: 7024 * This idea comes from the SD scheduler of Con Kolivas:
7025 */ 7025 */
7026 static int get_update_sysctl_factor(void) 7026 static int get_update_sysctl_factor(void)
7027 { 7027 {
7028 unsigned int cpus = min_t(int, num_online_cpus(), 8); 7028 unsigned int cpus = min_t(int, num_online_cpus(), 8);
7029 unsigned int factor; 7029 unsigned int factor;
7030 7030
7031 switch (sysctl_sched_tunable_scaling) { 7031 switch (sysctl_sched_tunable_scaling) {
7032 case SCHED_TUNABLESCALING_NONE: 7032 case SCHED_TUNABLESCALING_NONE:
7033 factor = 1; 7033 factor = 1;
7034 break; 7034 break;
7035 case SCHED_TUNABLESCALING_LINEAR: 7035 case SCHED_TUNABLESCALING_LINEAR:
7036 factor = cpus; 7036 factor = cpus;
7037 break; 7037 break;
7038 case SCHED_TUNABLESCALING_LOG: 7038 case SCHED_TUNABLESCALING_LOG:
7039 default: 7039 default:
7040 factor = 1 + ilog2(cpus); 7040 factor = 1 + ilog2(cpus);
7041 break; 7041 break;
7042 } 7042 }
7043 7043
7044 return factor; 7044 return factor;
7045 } 7045 }
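
As a worked example of the scaling rule above (illustrative, not from the source): with the default SCHED_TUNABLESCALING_LOG policy and 4 online CPUs the factor is 1 + ilog2(4) = 3, and because cpus is clamped to 8 the factor never exceeds 4. A small stand-alone sketch, assuming ilog2() is the usual floor-of-log2:

#include <stdio.h>

static unsigned int ilog2_u(unsigned int x)
{
	unsigned int r = 0;

	while (x >>= 1)		/* floor(log2(x)) for x >= 1 */
		r++;
	return r;
}

int main(void)
{
	unsigned int online;

	for (online = 1; online <= 16; online *= 2) {
		unsigned int cpus = online < 8 ? online : 8;

		printf("%2u CPUs -> factor %u\n", online, 1 + ilog2_u(cpus));
	}
	return 0;
}
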
7046 7046
7047 static void update_sysctl(void) 7047 static void update_sysctl(void)
7048 { 7048 {
7049 unsigned int factor = get_update_sysctl_factor(); 7049 unsigned int factor = get_update_sysctl_factor();
7050 7050
7051 #define SET_SYSCTL(name) \ 7051 #define SET_SYSCTL(name) \
7052 (sysctl_##name = (factor) * normalized_sysctl_##name) 7052 (sysctl_##name = (factor) * normalized_sysctl_##name)
7053 SET_SYSCTL(sched_min_granularity); 7053 SET_SYSCTL(sched_min_granularity);
7054 SET_SYSCTL(sched_latency); 7054 SET_SYSCTL(sched_latency);
7055 SET_SYSCTL(sched_wakeup_granularity); 7055 SET_SYSCTL(sched_wakeup_granularity);
7056 SET_SYSCTL(sched_shares_ratelimit); 7056 SET_SYSCTL(sched_shares_ratelimit);
7057 #undef SET_SYSCTL 7057 #undef SET_SYSCTL
7058 } 7058 }
7059 7059
7060 static inline void sched_init_granularity(void) 7060 static inline void sched_init_granularity(void)
7061 { 7061 {
7062 update_sysctl(); 7062 update_sysctl();
7063 } 7063 }
7064 7064
7065 #ifdef CONFIG_SMP 7065 #ifdef CONFIG_SMP
7066 /* 7066 /*
7067 * This is how migration works: 7067 * This is how migration works:
7068 * 7068 *
7069 * 1) we queue a struct migration_req structure in the source CPU's 7069 * 1) we queue a struct migration_req structure in the source CPU's
7070 * runqueue and wake up that CPU's migration thread. 7070 * runqueue and wake up that CPU's migration thread.
7071 * 2) we down() the locked semaphore => thread blocks. 7071 * 2) we down() the locked semaphore => thread blocks.
7072 * 3) migration thread wakes up (implicitly it forces the migrated 7072 * 3) migration thread wakes up (implicitly it forces the migrated
7073 * thread off the CPU) 7073 * thread off the CPU)
7074 * 4) it gets the migration request and checks whether the migrated 7074 * 4) it gets the migration request and checks whether the migrated
7075 * task is still in the wrong runqueue. 7075 * task is still in the wrong runqueue.
7076 * 5) if it's in the wrong runqueue then the migration thread removes 7076 * 5) if it's in the wrong runqueue then the migration thread removes
7077 * it and puts it into the right queue. 7077 * it and puts it into the right queue.
7078 * 6) migration thread up()s the semaphore. 7078 * 6) migration thread up()s the semaphore.
7079 * 7) we wake up and the migration is done. 7079 * 7) we wake up and the migration is done.
7080 */ 7080 */
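
For reference, the request that step 1 above queues on the source runqueue is defined earlier in this file; its shape can be inferred from how set_cpus_allowed_ptr() and migration_thread() below use it (req->list, req->task, req->dest_cpu, req->done) and looks roughly like this:

#include <linux/list.h>
#include <linux/completion.h>

struct task_struct;

struct migration_req {
	struct list_head list;		/* linked into rq->migration_queue */
	struct task_struct *task;	/* task to move; NULL for RCU sync requests */
	int dest_cpu;			/* target CPU, or RCU_MIGRATION_* state */
	struct completion done;	/* completed once the request is handled */
};
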
7081 7081
7082 /* 7082 /*
7083 * Change a given task's CPU affinity. Migrate the thread to a 7083 * Change a given task's CPU affinity. Migrate the thread to a
7084 * proper CPU and schedule it away if the CPU it's executing on 7084 * proper CPU and schedule it away if the CPU it's executing on
7085 * is removed from the allowed bitmask. 7085 * is removed from the allowed bitmask.
7086 * 7086 *
7087 * NOTE: the caller must have a valid reference to the task, the 7087 * NOTE: the caller must have a valid reference to the task, the
7088 * task must not exit() & deallocate itself prematurely. The 7088 * task must not exit() & deallocate itself prematurely. The
7089 * call is not atomic; no spinlocks may be held. 7089 * call is not atomic; no spinlocks may be held.
7090 */ 7090 */
7091 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 7091 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7092 { 7092 {
7093 struct migration_req req; 7093 struct migration_req req;
7094 unsigned long flags; 7094 unsigned long flags;
7095 struct rq *rq; 7095 struct rq *rq;
7096 int ret = 0; 7096 int ret = 0;
7097 7097
7098 rq = task_rq_lock(p, &flags); 7098 rq = task_rq_lock(p, &flags);
7099 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 7099 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7100 ret = -EINVAL; 7100 ret = -EINVAL;
7101 goto out; 7101 goto out;
7102 } 7102 }
7103 7103
7104 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 7104 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
7105 !cpumask_equal(&p->cpus_allowed, new_mask))) { 7105 !cpumask_equal(&p->cpus_allowed, new_mask))) {
7106 ret = -EINVAL; 7106 ret = -EINVAL;
7107 goto out; 7107 goto out;
7108 } 7108 }
7109 7109
7110 if (p->sched_class->set_cpus_allowed) 7110 if (p->sched_class->set_cpus_allowed)
7111 p->sched_class->set_cpus_allowed(p, new_mask); 7111 p->sched_class->set_cpus_allowed(p, new_mask);
7112 else { 7112 else {
7113 cpumask_copy(&p->cpus_allowed, new_mask); 7113 cpumask_copy(&p->cpus_allowed, new_mask);
7114 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 7114 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
7115 } 7115 }
7116 7116
7117 /* Can the task run on the task's current CPU? If so, we're done */ 7117 /* Can the task run on the task's current CPU? If so, we're done */
7118 if (cpumask_test_cpu(task_cpu(p), new_mask)) 7118 if (cpumask_test_cpu(task_cpu(p), new_mask))
7119 goto out; 7119 goto out;
7120 7120
7121 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { 7121 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
7122 /* Need help from migration thread: drop lock and wait. */ 7122 /* Need help from migration thread: drop lock and wait. */
7123 struct task_struct *mt = rq->migration_thread; 7123 struct task_struct *mt = rq->migration_thread;
7124 7124
7125 get_task_struct(mt); 7125 get_task_struct(mt);
7126 task_rq_unlock(rq, &flags); 7126 task_rq_unlock(rq, &flags);
7127 wake_up_process(rq->migration_thread); 7127 wake_up_process(rq->migration_thread);
7128 put_task_struct(mt); 7128 put_task_struct(mt);
7129 wait_for_completion(&req.done); 7129 wait_for_completion(&req.done);
7130 tlb_migrate_finish(p->mm); 7130 tlb_migrate_finish(p->mm);
7131 return 0; 7131 return 0;
7132 } 7132 }
7133 out: 7133 out:
7134 task_rq_unlock(rq, &flags); 7134 task_rq_unlock(rq, &flags);
7135 7135
7136 return ret; 7136 return ret;
7137 } 7137 }
7138 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 7138 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
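
A hedged module-side sketch of using the exported helper above to pin a freshly created kthread; the thread name and CPU number are made up, and error handling is trimmed:

#include <linux/kthread.h>
#include <linux/cpumask.h>
#include <linux/sched.h>
#include <linux/err.h>

static int example_pin_worker(int (*fn)(void *), void *data)
{
	struct task_struct *tsk = kthread_create(fn, data, "example/1");

	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	/* May sleep; must not be called with spinlocks held. */
	if (set_cpus_allowed_ptr(tsk, cpumask_of(1)))
		pr_info("example: could not restrict affinity to CPU 1\n");

	wake_up_process(tsk);
	return 0;
}
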
7139 7139
7140 /* 7140 /*
7141 * Move (not current) task off this cpu, onto dest cpu. We're doing 7141 * Move (not current) task off this cpu, onto dest cpu. We're doing
7142 * this because either it can't run here any more (set_cpus_allowed() 7142 * this because either it can't run here any more (set_cpus_allowed()
7143 * away from this CPU, or CPU going down), or because we're 7143 * away from this CPU, or CPU going down), or because we're
7144 * attempting to rebalance this task on exec (sched_exec). 7144 * attempting to rebalance this task on exec (sched_exec).
7145 * 7145 *
7146 * So we race with normal scheduler movements, but that's OK, as long 7146 * So we race with normal scheduler movements, but that's OK, as long
7147 * as the task is no longer on this CPU. 7147 * as the task is no longer on this CPU.
7148 * 7148 *
7149 * Returns non-zero if task was successfully migrated. 7149 * Returns non-zero if task was successfully migrated.
7150 */ 7150 */
7151 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 7151 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7152 { 7152 {
7153 struct rq *rq_dest, *rq_src; 7153 struct rq *rq_dest, *rq_src;
7154 int ret = 0, on_rq; 7154 int ret = 0, on_rq;
7155 7155
7156 if (unlikely(!cpu_active(dest_cpu))) 7156 if (unlikely(!cpu_active(dest_cpu)))
7157 return ret; 7157 return ret;
7158 7158
7159 rq_src = cpu_rq(src_cpu); 7159 rq_src = cpu_rq(src_cpu);
7160 rq_dest = cpu_rq(dest_cpu); 7160 rq_dest = cpu_rq(dest_cpu);
7161 7161
7162 double_rq_lock(rq_src, rq_dest); 7162 double_rq_lock(rq_src, rq_dest);
7163 /* Already moved. */ 7163 /* Already moved. */
7164 if (task_cpu(p) != src_cpu) 7164 if (task_cpu(p) != src_cpu)
7165 goto done; 7165 goto done;
7166 /* Affinity changed (again). */ 7166 /* Affinity changed (again). */
7167 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 7167 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7168 goto fail; 7168 goto fail;
7169 7169
7170 on_rq = p->se.on_rq; 7170 on_rq = p->se.on_rq;
7171 if (on_rq) 7171 if (on_rq)
7172 deactivate_task(rq_src, p, 0); 7172 deactivate_task(rq_src, p, 0);
7173 7173
7174 set_task_cpu(p, dest_cpu); 7174 set_task_cpu(p, dest_cpu);
7175 if (on_rq) { 7175 if (on_rq) {
7176 activate_task(rq_dest, p, 0); 7176 activate_task(rq_dest, p, 0);
7177 check_preempt_curr(rq_dest, p, 0); 7177 check_preempt_curr(rq_dest, p, 0);
7178 } 7178 }
7179 done: 7179 done:
7180 ret = 1; 7180 ret = 1;
7181 fail: 7181 fail:
7182 double_rq_unlock(rq_src, rq_dest); 7182 double_rq_unlock(rq_src, rq_dest);
7183 return ret; 7183 return ret;
7184 } 7184 }
7185 7185
7186 #define RCU_MIGRATION_IDLE 0 7186 #define RCU_MIGRATION_IDLE 0
7187 #define RCU_MIGRATION_NEED_QS 1 7187 #define RCU_MIGRATION_NEED_QS 1
7188 #define RCU_MIGRATION_GOT_QS 2 7188 #define RCU_MIGRATION_GOT_QS 2
7189 #define RCU_MIGRATION_MUST_SYNC 3 7189 #define RCU_MIGRATION_MUST_SYNC 3
7190 7190
7191 /* 7191 /*
7192 * migration_thread - this is a highprio system thread that performs 7192 * migration_thread - this is a highprio system thread that performs
7193 * thread migration by bumping thread off CPU then 'pushing' onto 7193 * thread migration by bumping thread off CPU then 'pushing' onto
7194 * another runqueue. 7194 * another runqueue.
7195 */ 7195 */
7196 static int migration_thread(void *data) 7196 static int migration_thread(void *data)
7197 { 7197 {
7198 int badcpu; 7198 int badcpu;
7199 int cpu = (long)data; 7199 int cpu = (long)data;
7200 struct rq *rq; 7200 struct rq *rq;
7201 7201
7202 rq = cpu_rq(cpu); 7202 rq = cpu_rq(cpu);
7203 BUG_ON(rq->migration_thread != current); 7203 BUG_ON(rq->migration_thread != current);
7204 7204
7205 set_current_state(TASK_INTERRUPTIBLE); 7205 set_current_state(TASK_INTERRUPTIBLE);
7206 while (!kthread_should_stop()) { 7206 while (!kthread_should_stop()) {
7207 struct migration_req *req; 7207 struct migration_req *req;
7208 struct list_head *head; 7208 struct list_head *head;
7209 7209
7210 spin_lock_irq(&rq->lock); 7210 spin_lock_irq(&rq->lock);
7211 7211
7212 if (cpu_is_offline(cpu)) { 7212 if (cpu_is_offline(cpu)) {
7213 spin_unlock_irq(&rq->lock); 7213 spin_unlock_irq(&rq->lock);
7214 break; 7214 break;
7215 } 7215 }
7216 7216
7217 if (rq->active_balance) { 7217 if (rq->active_balance) {
7218 active_load_balance(rq, cpu); 7218 active_load_balance(rq, cpu);
7219 rq->active_balance = 0; 7219 rq->active_balance = 0;
7220 } 7220 }
7221 7221
7222 head = &rq->migration_queue; 7222 head = &rq->migration_queue;
7223 7223
7224 if (list_empty(head)) { 7224 if (list_empty(head)) {
7225 spin_unlock_irq(&rq->lock); 7225 spin_unlock_irq(&rq->lock);
7226 schedule(); 7226 schedule();
7227 set_current_state(TASK_INTERRUPTIBLE); 7227 set_current_state(TASK_INTERRUPTIBLE);
7228 continue; 7228 continue;
7229 } 7229 }
7230 req = list_entry(head->next, struct migration_req, list); 7230 req = list_entry(head->next, struct migration_req, list);
7231 list_del_init(head->next); 7231 list_del_init(head->next);
7232 7232
7233 if (req->task != NULL) { 7233 if (req->task != NULL) {
7234 spin_unlock(&rq->lock); 7234 spin_unlock(&rq->lock);
7235 __migrate_task(req->task, cpu, req->dest_cpu); 7235 __migrate_task(req->task, cpu, req->dest_cpu);
7236 } else if (likely(cpu == (badcpu = smp_processor_id()))) { 7236 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7237 req->dest_cpu = RCU_MIGRATION_GOT_QS; 7237 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7238 spin_unlock(&rq->lock); 7238 spin_unlock(&rq->lock);
7239 } else { 7239 } else {
7240 req->dest_cpu = RCU_MIGRATION_MUST_SYNC; 7240 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7241 spin_unlock(&rq->lock); 7241 spin_unlock(&rq->lock);
7242 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); 7242 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7243 } 7243 }
7244 local_irq_enable(); 7244 local_irq_enable();
7245 7245
7246 complete(&req->done); 7246 complete(&req->done);
7247 } 7247 }
7248 __set_current_state(TASK_RUNNING); 7248 __set_current_state(TASK_RUNNING);
7249 7249
7250 return 0; 7250 return 0;
7251 } 7251 }
7252 7252
7253 #ifdef CONFIG_HOTPLUG_CPU 7253 #ifdef CONFIG_HOTPLUG_CPU
7254 7254
7255 static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) 7255 static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
7256 { 7256 {
7257 int ret; 7257 int ret;
7258 7258
7259 local_irq_disable(); 7259 local_irq_disable();
7260 ret = __migrate_task(p, src_cpu, dest_cpu); 7260 ret = __migrate_task(p, src_cpu, dest_cpu);
7261 local_irq_enable(); 7261 local_irq_enable();
7262 return ret; 7262 return ret;
7263 } 7263 }
7264 7264
7265 /* 7265 /*
7266 * Figure out where task on dead CPU should go, use force if necessary. 7266 * Figure out where task on dead CPU should go, use force if necessary.
7267 */ 7267 */
7268 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 7268 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
7269 { 7269 {
7270 int dest_cpu; 7270 int dest_cpu;
7271 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu)); 7271 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
7272 7272
7273 again: 7273 again:
7274 /* Look for allowed, online CPU in same node. */ 7274 /* Look for allowed, online CPU in same node. */
7275 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 7275 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
7276 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 7276 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7277 goto move; 7277 goto move;
7278 7278
7279 /* Any allowed, online CPU? */ 7279 /* Any allowed, online CPU? */
7280 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); 7280 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
7281 if (dest_cpu < nr_cpu_ids) 7281 if (dest_cpu < nr_cpu_ids)
7282 goto move; 7282 goto move;
7283 7283
7284 /* No more Mr. Nice Guy. */ 7284 /* No more Mr. Nice Guy. */
7285 if (dest_cpu >= nr_cpu_ids) { 7285 if (dest_cpu >= nr_cpu_ids) {
7286 cpuset_cpus_allowed_locked(p, &p->cpus_allowed); 7286 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
7287 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); 7287 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
7288 7288
7289 /* 7289 /*
7290 * Don't tell them about moving exiting tasks or 7290 * Don't tell them about moving exiting tasks or
7291 * kernel threads (both mm NULL), since they never 7291 * kernel threads (both mm NULL), since they never
7292 * leave kernel. 7292 * leave kernel.
7293 */ 7293 */
7294 if (p->mm && printk_ratelimit()) { 7294 if (p->mm && printk_ratelimit()) {
7295 pr_info("process %d (%s) no longer affine to cpu%d\n", 7295 pr_info("process %d (%s) no longer affine to cpu%d\n",
7296 task_pid_nr(p), p->comm, dead_cpu); 7296 task_pid_nr(p), p->comm, dead_cpu);
7297 } 7297 }
7298 } 7298 }
7299 7299
7300 move: 7300 move:
7301 /* It can have affinity changed while we were choosing. */ 7301 /* It can have affinity changed while we were choosing. */
7302 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 7302 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
7303 goto again; 7303 goto again;
7304 } 7304 }
7305 7305
7306 /* 7306 /*
7307 * While a dead CPU has no uninterruptible tasks queued at this point, 7307 * While a dead CPU has no uninterruptible tasks queued at this point,
7308 * it might still have a nonzero ->nr_uninterruptible counter, because 7308 * it might still have a nonzero ->nr_uninterruptible counter, because
 7309 * for performance reasons the counter is not strictly tracking tasks to 7309 * for performance reasons the counter is not strictly tracking tasks to
7310 * their home CPUs. So we just add the counter to another CPU's counter, 7310 * their home CPUs. So we just add the counter to another CPU's counter,
7311 * to keep the global sum constant after CPU-down: 7311 * to keep the global sum constant after CPU-down:
7312 */ 7312 */
7313 static void migrate_nr_uninterruptible(struct rq *rq_src) 7313 static void migrate_nr_uninterruptible(struct rq *rq_src)
7314 { 7314 {
7315 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 7315 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
7316 unsigned long flags; 7316 unsigned long flags;
7317 7317
7318 local_irq_save(flags); 7318 local_irq_save(flags);
7319 double_rq_lock(rq_src, rq_dest); 7319 double_rq_lock(rq_src, rq_dest);
7320 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 7320 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
7321 rq_src->nr_uninterruptible = 0; 7321 rq_src->nr_uninterruptible = 0;
7322 double_rq_unlock(rq_src, rq_dest); 7322 double_rq_unlock(rq_src, rq_dest);
7323 local_irq_restore(flags); 7323 local_irq_restore(flags);
7324 } 7324 }
7325 7325
7326 /* Run through task list and migrate tasks from the dead cpu. */ 7326 /* Run through task list and migrate tasks from the dead cpu. */
7327 static void migrate_live_tasks(int src_cpu) 7327 static void migrate_live_tasks(int src_cpu)
7328 { 7328 {
7329 struct task_struct *p, *t; 7329 struct task_struct *p, *t;
7330 7330
7331 read_lock(&tasklist_lock); 7331 read_lock(&tasklist_lock);
7332 7332
7333 do_each_thread(t, p) { 7333 do_each_thread(t, p) {
7334 if (p == current) 7334 if (p == current)
7335 continue; 7335 continue;
7336 7336
7337 if (task_cpu(p) == src_cpu) 7337 if (task_cpu(p) == src_cpu)
7338 move_task_off_dead_cpu(src_cpu, p); 7338 move_task_off_dead_cpu(src_cpu, p);
7339 } while_each_thread(t, p); 7339 } while_each_thread(t, p);
7340 7340
7341 read_unlock(&tasklist_lock); 7341 read_unlock(&tasklist_lock);
7342 } 7342 }
7343 7343
7344 /* 7344 /*
7345 * Schedules idle task to be the next runnable task on current CPU. 7345 * Schedules idle task to be the next runnable task on current CPU.
7346 * It does so by boosting its priority to highest possible. 7346 * It does so by boosting its priority to highest possible.
7347 * Used by CPU offline code. 7347 * Used by CPU offline code.
7348 */ 7348 */
7349 void sched_idle_next(void) 7349 void sched_idle_next(void)
7350 { 7350 {
7351 int this_cpu = smp_processor_id(); 7351 int this_cpu = smp_processor_id();
7352 struct rq *rq = cpu_rq(this_cpu); 7352 struct rq *rq = cpu_rq(this_cpu);
7353 struct task_struct *p = rq->idle; 7353 struct task_struct *p = rq->idle;
7354 unsigned long flags; 7354 unsigned long flags;
7355 7355
7356 /* cpu has to be offline */ 7356 /* cpu has to be offline */
7357 BUG_ON(cpu_online(this_cpu)); 7357 BUG_ON(cpu_online(this_cpu));
7358 7358
7359 /* 7359 /*
7360 * Strictly not necessary since rest of the CPUs are stopped by now 7360 * Strictly not necessary since rest of the CPUs are stopped by now
7361 * and interrupts disabled on the current cpu. 7361 * and interrupts disabled on the current cpu.
7362 */ 7362 */
7363 spin_lock_irqsave(&rq->lock, flags); 7363 spin_lock_irqsave(&rq->lock, flags);
7364 7364
7365 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 7365 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7366 7366
7367 update_rq_clock(rq); 7367 update_rq_clock(rq);
7368 activate_task(rq, p, 0); 7368 activate_task(rq, p, 0);
7369 7369
7370 spin_unlock_irqrestore(&rq->lock, flags); 7370 spin_unlock_irqrestore(&rq->lock, flags);
7371 } 7371 }
7372 7372
7373 /* 7373 /*
7374 * Ensures that the idle task is using init_mm right before its cpu goes 7374 * Ensures that the idle task is using init_mm right before its cpu goes
7375 * offline. 7375 * offline.
7376 */ 7376 */
7377 void idle_task_exit(void) 7377 void idle_task_exit(void)
7378 { 7378 {
7379 struct mm_struct *mm = current->active_mm; 7379 struct mm_struct *mm = current->active_mm;
7380 7380
7381 BUG_ON(cpu_online(smp_processor_id())); 7381 BUG_ON(cpu_online(smp_processor_id()));
7382 7382
7383 if (mm != &init_mm) 7383 if (mm != &init_mm)
7384 switch_mm(mm, &init_mm, current); 7384 switch_mm(mm, &init_mm, current);
7385 mmdrop(mm); 7385 mmdrop(mm);
7386 } 7386 }
7387 7387
7388 /* called under rq->lock with disabled interrupts */ 7388 /* called under rq->lock with disabled interrupts */
7389 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) 7389 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
7390 { 7390 {
7391 struct rq *rq = cpu_rq(dead_cpu); 7391 struct rq *rq = cpu_rq(dead_cpu);
7392 7392
7393 /* Must be exiting, otherwise would be on tasklist. */ 7393 /* Must be exiting, otherwise would be on tasklist. */
7394 BUG_ON(!p->exit_state); 7394 BUG_ON(!p->exit_state);
7395 7395
7396 /* Cannot have done final schedule yet: would have vanished. */ 7396 /* Cannot have done final schedule yet: would have vanished. */
7397 BUG_ON(p->state == TASK_DEAD); 7397 BUG_ON(p->state == TASK_DEAD);
7398 7398
7399 get_task_struct(p); 7399 get_task_struct(p);
7400 7400
7401 /* 7401 /*
7402 * Drop lock around migration; if someone else moves it, 7402 * Drop lock around migration; if someone else moves it,
7403 * that's OK. No task can be added to this CPU, so iteration is 7403 * that's OK. No task can be added to this CPU, so iteration is
7404 * fine. 7404 * fine.
7405 */ 7405 */
7406 spin_unlock_irq(&rq->lock); 7406 spin_unlock_irq(&rq->lock);
7407 move_task_off_dead_cpu(dead_cpu, p); 7407 move_task_off_dead_cpu(dead_cpu, p);
7408 spin_lock_irq(&rq->lock); 7408 spin_lock_irq(&rq->lock);
7409 7409
7410 put_task_struct(p); 7410 put_task_struct(p);
7411 } 7411 }
7412 7412
7413 /* release_task() removes task from tasklist, so we won't find dead tasks. */ 7413 /* release_task() removes task from tasklist, so we won't find dead tasks. */
7414 static void migrate_dead_tasks(unsigned int dead_cpu) 7414 static void migrate_dead_tasks(unsigned int dead_cpu)
7415 { 7415 {
7416 struct rq *rq = cpu_rq(dead_cpu); 7416 struct rq *rq = cpu_rq(dead_cpu);
7417 struct task_struct *next; 7417 struct task_struct *next;
7418 7418
7419 for ( ; ; ) { 7419 for ( ; ; ) {
7420 if (!rq->nr_running) 7420 if (!rq->nr_running)
7421 break; 7421 break;
7422 update_rq_clock(rq); 7422 update_rq_clock(rq);
7423 next = pick_next_task(rq); 7423 next = pick_next_task(rq);
7424 if (!next) 7424 if (!next)
7425 break; 7425 break;
7426 next->sched_class->put_prev_task(rq, next); 7426 next->sched_class->put_prev_task(rq, next);
7427 migrate_dead(dead_cpu, next); 7427 migrate_dead(dead_cpu, next);
7428 7428
7429 } 7429 }
7430 } 7430 }
7431 7431
7432 /* 7432 /*
7433 * remove the tasks which were accounted by rq from calc_load_tasks. 7433 * remove the tasks which were accounted by rq from calc_load_tasks.
7434 */ 7434 */
7435 static void calc_global_load_remove(struct rq *rq) 7435 static void calc_global_load_remove(struct rq *rq)
7436 { 7436 {
7437 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 7437 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
7438 rq->calc_load_active = 0; 7438 rq->calc_load_active = 0;
7439 } 7439 }
7440 #endif /* CONFIG_HOTPLUG_CPU */ 7440 #endif /* CONFIG_HOTPLUG_CPU */
7441 7441
7442 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 7442 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
7443 7443
7444 static struct ctl_table sd_ctl_dir[] = { 7444 static struct ctl_table sd_ctl_dir[] = {
7445 { 7445 {
7446 .procname = "sched_domain", 7446 .procname = "sched_domain",
7447 .mode = 0555, 7447 .mode = 0555,
7448 }, 7448 },
7449 {} 7449 {}
7450 }; 7450 };
7451 7451
7452 static struct ctl_table sd_ctl_root[] = { 7452 static struct ctl_table sd_ctl_root[] = {
7453 { 7453 {
7454 .procname = "kernel", 7454 .procname = "kernel",
7455 .mode = 0555, 7455 .mode = 0555,
7456 .child = sd_ctl_dir, 7456 .child = sd_ctl_dir,
7457 }, 7457 },
7458 {} 7458 {}
7459 }; 7459 };
7460 7460
7461 static struct ctl_table *sd_alloc_ctl_entry(int n) 7461 static struct ctl_table *sd_alloc_ctl_entry(int n)
7462 { 7462 {
7463 struct ctl_table *entry = 7463 struct ctl_table *entry =
7464 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 7464 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
7465 7465
7466 return entry; 7466 return entry;
7467 } 7467 }
7468 7468
7469 static void sd_free_ctl_entry(struct ctl_table **tablep) 7469 static void sd_free_ctl_entry(struct ctl_table **tablep)
7470 { 7470 {
7471 struct ctl_table *entry; 7471 struct ctl_table *entry;
7472 7472
7473 /* 7473 /*
7474 * In the intermediate directories, both the child directory and 7474 * In the intermediate directories, both the child directory and
7475 * procname are dynamically allocated and could fail but the mode 7475 * procname are dynamically allocated and could fail but the mode
7476 * will always be set. In the lowest directory the names are 7476 * will always be set. In the lowest directory the names are
7477 * static strings and all have proc handlers. 7477 * static strings and all have proc handlers.
7478 */ 7478 */
7479 for (entry = *tablep; entry->mode; entry++) { 7479 for (entry = *tablep; entry->mode; entry++) {
7480 if (entry->child) 7480 if (entry->child)
7481 sd_free_ctl_entry(&entry->child); 7481 sd_free_ctl_entry(&entry->child);
7482 if (entry->proc_handler == NULL) 7482 if (entry->proc_handler == NULL)
7483 kfree(entry->procname); 7483 kfree(entry->procname);
7484 } 7484 }
7485 7485
7486 kfree(*tablep); 7486 kfree(*tablep);
7487 *tablep = NULL; 7487 *tablep = NULL;
7488 } 7488 }
7489 7489
7490 static void 7490 static void
7491 set_table_entry(struct ctl_table *entry, 7491 set_table_entry(struct ctl_table *entry,
7492 const char *procname, void *data, int maxlen, 7492 const char *procname, void *data, int maxlen,
7493 mode_t mode, proc_handler *proc_handler) 7493 mode_t mode, proc_handler *proc_handler)
7494 { 7494 {
7495 entry->procname = procname; 7495 entry->procname = procname;
7496 entry->data = data; 7496 entry->data = data;
7497 entry->maxlen = maxlen; 7497 entry->maxlen = maxlen;
7498 entry->mode = mode; 7498 entry->mode = mode;
7499 entry->proc_handler = proc_handler; 7499 entry->proc_handler = proc_handler;
7500 } 7500 }
7501 7501
7502 static struct ctl_table * 7502 static struct ctl_table *
7503 sd_alloc_ctl_domain_table(struct sched_domain *sd) 7503 sd_alloc_ctl_domain_table(struct sched_domain *sd)
7504 { 7504 {
7505 struct ctl_table *table = sd_alloc_ctl_entry(13); 7505 struct ctl_table *table = sd_alloc_ctl_entry(13);
7506 7506
7507 if (table == NULL) 7507 if (table == NULL)
7508 return NULL; 7508 return NULL;
7509 7509
7510 set_table_entry(&table[0], "min_interval", &sd->min_interval, 7510 set_table_entry(&table[0], "min_interval", &sd->min_interval,
7511 sizeof(long), 0644, proc_doulongvec_minmax); 7511 sizeof(long), 0644, proc_doulongvec_minmax);
7512 set_table_entry(&table[1], "max_interval", &sd->max_interval, 7512 set_table_entry(&table[1], "max_interval", &sd->max_interval,
7513 sizeof(long), 0644, proc_doulongvec_minmax); 7513 sizeof(long), 0644, proc_doulongvec_minmax);
7514 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 7514 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
7515 sizeof(int), 0644, proc_dointvec_minmax); 7515 sizeof(int), 0644, proc_dointvec_minmax);
7516 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 7516 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
7517 sizeof(int), 0644, proc_dointvec_minmax); 7517 sizeof(int), 0644, proc_dointvec_minmax);
7518 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 7518 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
7519 sizeof(int), 0644, proc_dointvec_minmax); 7519 sizeof(int), 0644, proc_dointvec_minmax);
7520 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 7520 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
7521 sizeof(int), 0644, proc_dointvec_minmax); 7521 sizeof(int), 0644, proc_dointvec_minmax);
7522 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 7522 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
7523 sizeof(int), 0644, proc_dointvec_minmax); 7523 sizeof(int), 0644, proc_dointvec_minmax);
7524 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 7524 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
7525 sizeof(int), 0644, proc_dointvec_minmax); 7525 sizeof(int), 0644, proc_dointvec_minmax);
7526 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 7526 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
7527 sizeof(int), 0644, proc_dointvec_minmax); 7527 sizeof(int), 0644, proc_dointvec_minmax);
7528 set_table_entry(&table[9], "cache_nice_tries", 7528 set_table_entry(&table[9], "cache_nice_tries",
7529 &sd->cache_nice_tries, 7529 &sd->cache_nice_tries,
7530 sizeof(int), 0644, proc_dointvec_minmax); 7530 sizeof(int), 0644, proc_dointvec_minmax);
7531 set_table_entry(&table[10], "flags", &sd->flags, 7531 set_table_entry(&table[10], "flags", &sd->flags,
7532 sizeof(int), 0644, proc_dointvec_minmax); 7532 sizeof(int), 0644, proc_dointvec_minmax);
7533 set_table_entry(&table[11], "name", sd->name, 7533 set_table_entry(&table[11], "name", sd->name,
7534 CORENAME_MAX_SIZE, 0444, proc_dostring); 7534 CORENAME_MAX_SIZE, 0444, proc_dostring);
7535 /* &table[12] is terminator */ 7535 /* &table[12] is terminator */
7536 7536
7537 return table; 7537 return table;
7538 } 7538 }
7539 7539
7540 static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 7540 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
7541 { 7541 {
7542 struct ctl_table *entry, *table; 7542 struct ctl_table *entry, *table;
7543 struct sched_domain *sd; 7543 struct sched_domain *sd;
7544 int domain_num = 0, i; 7544 int domain_num = 0, i;
7545 char buf[32]; 7545 char buf[32];
7546 7546
7547 for_each_domain(cpu, sd) 7547 for_each_domain(cpu, sd)
7548 domain_num++; 7548 domain_num++;
7549 entry = table = sd_alloc_ctl_entry(domain_num + 1); 7549 entry = table = sd_alloc_ctl_entry(domain_num + 1);
7550 if (table == NULL) 7550 if (table == NULL)
7551 return NULL; 7551 return NULL;
7552 7552
7553 i = 0; 7553 i = 0;
7554 for_each_domain(cpu, sd) { 7554 for_each_domain(cpu, sd) {
7555 snprintf(buf, 32, "domain%d", i); 7555 snprintf(buf, 32, "domain%d", i);
7556 entry->procname = kstrdup(buf, GFP_KERNEL); 7556 entry->procname = kstrdup(buf, GFP_KERNEL);
7557 entry->mode = 0555; 7557 entry->mode = 0555;
7558 entry->child = sd_alloc_ctl_domain_table(sd); 7558 entry->child = sd_alloc_ctl_domain_table(sd);
7559 entry++; 7559 entry++;
7560 i++; 7560 i++;
7561 } 7561 }
7562 return table; 7562 return table;
7563 } 7563 }
7564 7564
7565 static struct ctl_table_header *sd_sysctl_header; 7565 static struct ctl_table_header *sd_sysctl_header;
7566 static void register_sched_domain_sysctl(void) 7566 static void register_sched_domain_sysctl(void)
7567 { 7567 {
7568 int i, cpu_num = num_possible_cpus(); 7568 int i, cpu_num = num_possible_cpus();
7569 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 7569 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
7570 char buf[32]; 7570 char buf[32];
7571 7571
7572 WARN_ON(sd_ctl_dir[0].child); 7572 WARN_ON(sd_ctl_dir[0].child);
7573 sd_ctl_dir[0].child = entry; 7573 sd_ctl_dir[0].child = entry;
7574 7574
7575 if (entry == NULL) 7575 if (entry == NULL)
7576 return; 7576 return;
7577 7577
7578 for_each_possible_cpu(i) { 7578 for_each_possible_cpu(i) {
7579 snprintf(buf, 32, "cpu%d", i); 7579 snprintf(buf, 32, "cpu%d", i);
7580 entry->procname = kstrdup(buf, GFP_KERNEL); 7580 entry->procname = kstrdup(buf, GFP_KERNEL);
7581 entry->mode = 0555; 7581 entry->mode = 0555;
7582 entry->child = sd_alloc_ctl_cpu_table(i); 7582 entry->child = sd_alloc_ctl_cpu_table(i);
7583 entry++; 7583 entry++;
7584 } 7584 }
7585 7585
7586 WARN_ON(sd_sysctl_header); 7586 WARN_ON(sd_sysctl_header);
7587 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 7587 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
7588 } 7588 }
7589 7589
7590 /* may be called multiple times per register */ 7590 /* may be called multiple times per register */
7591 static void unregister_sched_domain_sysctl(void) 7591 static void unregister_sched_domain_sysctl(void)
7592 { 7592 {
7593 if (sd_sysctl_header) 7593 if (sd_sysctl_header)
7594 unregister_sysctl_table(sd_sysctl_header); 7594 unregister_sysctl_table(sd_sysctl_header);
7595 sd_sysctl_header = NULL; 7595 sd_sysctl_header = NULL;
7596 if (sd_ctl_dir[0].child) 7596 if (sd_ctl_dir[0].child)
7597 sd_free_ctl_entry(&sd_ctl_dir[0].child); 7597 sd_free_ctl_entry(&sd_ctl_dir[0].child);
7598 } 7598 }
7599 #else 7599 #else
7600 static void register_sched_domain_sysctl(void) 7600 static void register_sched_domain_sysctl(void)
7601 { 7601 {
7602 } 7602 }
7603 static void unregister_sched_domain_sysctl(void) 7603 static void unregister_sched_domain_sysctl(void)
7604 { 7604 {
7605 } 7605 }
7606 #endif 7606 #endif
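
When CONFIG_SCHED_DEBUG and CONFIG_SYSCTL are both enabled, the tables registered above surface as files under /proc/sys/kernel/sched_domain/. A minimal user-space sketch (the exact path is illustrative and depends on the runtime topology):

#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/proc/sys/kernel/sched_domain/cpu0/domain0/min_interval", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("cpu0/domain0 min_interval: %s", buf);
	fclose(f);
	return 0;
}
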
7607 7607
7608 static void set_rq_online(struct rq *rq) 7608 static void set_rq_online(struct rq *rq)
7609 { 7609 {
7610 if (!rq->online) { 7610 if (!rq->online) {
7611 const struct sched_class *class; 7611 const struct sched_class *class;
7612 7612
7613 cpumask_set_cpu(rq->cpu, rq->rd->online); 7613 cpumask_set_cpu(rq->cpu, rq->rd->online);
7614 rq->online = 1; 7614 rq->online = 1;
7615 7615
7616 for_each_class(class) { 7616 for_each_class(class) {
7617 if (class->rq_online) 7617 if (class->rq_online)
7618 class->rq_online(rq); 7618 class->rq_online(rq);
7619 } 7619 }
7620 } 7620 }
7621 } 7621 }
7622 7622
7623 static void set_rq_offline(struct rq *rq) 7623 static void set_rq_offline(struct rq *rq)
7624 { 7624 {
7625 if (rq->online) { 7625 if (rq->online) {
7626 const struct sched_class *class; 7626 const struct sched_class *class;
7627 7627
7628 for_each_class(class) { 7628 for_each_class(class) {
7629 if (class->rq_offline) 7629 if (class->rq_offline)
7630 class->rq_offline(rq); 7630 class->rq_offline(rq);
7631 } 7631 }
7632 7632
7633 cpumask_clear_cpu(rq->cpu, rq->rd->online); 7633 cpumask_clear_cpu(rq->cpu, rq->rd->online);
7634 rq->online = 0; 7634 rq->online = 0;
7635 } 7635 }
7636 } 7636 }
7637 7637
7638 /* 7638 /*
7639 * migration_call - callback that gets triggered when a CPU is added. 7639 * migration_call - callback that gets triggered when a CPU is added.
7640 * Here we can start up the necessary migration thread for the new CPU. 7640 * Here we can start up the necessary migration thread for the new CPU.
7641 */ 7641 */
7642 static int __cpuinit 7642 static int __cpuinit
7643 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 7643 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7644 { 7644 {
7645 struct task_struct *p; 7645 struct task_struct *p;
7646 int cpu = (long)hcpu; 7646 int cpu = (long)hcpu;
7647 unsigned long flags; 7647 unsigned long flags;
7648 struct rq *rq; 7648 struct rq *rq;
7649 7649
7650 switch (action) { 7650 switch (action) {
7651 7651
7652 case CPU_UP_PREPARE: 7652 case CPU_UP_PREPARE:
7653 case CPU_UP_PREPARE_FROZEN: 7653 case CPU_UP_PREPARE_FROZEN:
7654 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); 7654 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
7655 if (IS_ERR(p)) 7655 if (IS_ERR(p))
7656 return NOTIFY_BAD; 7656 return NOTIFY_BAD;
7657 kthread_bind(p, cpu); 7657 kthread_bind(p, cpu);
7658 /* Must be high prio: stop_machine expects to yield to it. */ 7658 /* Must be high prio: stop_machine expects to yield to it. */
7659 rq = task_rq_lock(p, &flags); 7659 rq = task_rq_lock(p, &flags);
7660 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 7660 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7661 task_rq_unlock(rq, &flags); 7661 task_rq_unlock(rq, &flags);
7662 get_task_struct(p); 7662 get_task_struct(p);
7663 cpu_rq(cpu)->migration_thread = p; 7663 cpu_rq(cpu)->migration_thread = p;
7664 rq->calc_load_update = calc_load_update; 7664 rq->calc_load_update = calc_load_update;
7665 break; 7665 break;
7666 7666
7667 case CPU_ONLINE: 7667 case CPU_ONLINE:
7668 case CPU_ONLINE_FROZEN: 7668 case CPU_ONLINE_FROZEN:
7669 /* Strictly unnecessary, as first user will wake it. */ 7669 /* Strictly unnecessary, as first user will wake it. */
7670 wake_up_process(cpu_rq(cpu)->migration_thread); 7670 wake_up_process(cpu_rq(cpu)->migration_thread);
7671 7671
7672 /* Update our root-domain */ 7672 /* Update our root-domain */
7673 rq = cpu_rq(cpu); 7673 rq = cpu_rq(cpu);
7674 spin_lock_irqsave(&rq->lock, flags); 7674 spin_lock_irqsave(&rq->lock, flags);
7675 if (rq->rd) { 7675 if (rq->rd) {
7676 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7676 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7677 7677
7678 set_rq_online(rq); 7678 set_rq_online(rq);
7679 } 7679 }
7680 spin_unlock_irqrestore(&rq->lock, flags); 7680 spin_unlock_irqrestore(&rq->lock, flags);
7681 break; 7681 break;
7682 7682
7683 #ifdef CONFIG_HOTPLUG_CPU 7683 #ifdef CONFIG_HOTPLUG_CPU
7684 case CPU_UP_CANCELED: 7684 case CPU_UP_CANCELED:
7685 case CPU_UP_CANCELED_FROZEN: 7685 case CPU_UP_CANCELED_FROZEN:
7686 if (!cpu_rq(cpu)->migration_thread) 7686 if (!cpu_rq(cpu)->migration_thread)
7687 break; 7687 break;
7688 /* Unbind it from offline cpu so it can run. Fall thru. */ 7688 /* Unbind it from offline cpu so it can run. Fall thru. */
7689 kthread_bind(cpu_rq(cpu)->migration_thread, 7689 kthread_bind(cpu_rq(cpu)->migration_thread,
7690 cpumask_any(cpu_online_mask)); 7690 cpumask_any(cpu_online_mask));
7691 kthread_stop(cpu_rq(cpu)->migration_thread); 7691 kthread_stop(cpu_rq(cpu)->migration_thread);
7692 put_task_struct(cpu_rq(cpu)->migration_thread); 7692 put_task_struct(cpu_rq(cpu)->migration_thread);
7693 cpu_rq(cpu)->migration_thread = NULL; 7693 cpu_rq(cpu)->migration_thread = NULL;
7694 break; 7694 break;
7695 7695
7696 case CPU_DEAD: 7696 case CPU_DEAD:
7697 case CPU_DEAD_FROZEN: 7697 case CPU_DEAD_FROZEN:
7698 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ 7698 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
7699 migrate_live_tasks(cpu); 7699 migrate_live_tasks(cpu);
7700 rq = cpu_rq(cpu); 7700 rq = cpu_rq(cpu);
7701 kthread_stop(rq->migration_thread); 7701 kthread_stop(rq->migration_thread);
7702 put_task_struct(rq->migration_thread); 7702 put_task_struct(rq->migration_thread);
7703 rq->migration_thread = NULL; 7703 rq->migration_thread = NULL;
7704 /* Idle task back to normal (off runqueue, low prio) */ 7704 /* Idle task back to normal (off runqueue, low prio) */
7705 spin_lock_irq(&rq->lock); 7705 spin_lock_irq(&rq->lock);
7706 update_rq_clock(rq); 7706 update_rq_clock(rq);
7707 deactivate_task(rq, rq->idle, 0); 7707 deactivate_task(rq, rq->idle, 0);
7708 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 7708 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
7709 rq->idle->sched_class = &idle_sched_class; 7709 rq->idle->sched_class = &idle_sched_class;
7710 migrate_dead_tasks(cpu); 7710 migrate_dead_tasks(cpu);
7711 spin_unlock_irq(&rq->lock); 7711 spin_unlock_irq(&rq->lock);
7712 cpuset_unlock(); 7712 cpuset_unlock();
7713 migrate_nr_uninterruptible(rq); 7713 migrate_nr_uninterruptible(rq);
7714 BUG_ON(rq->nr_running != 0); 7714 BUG_ON(rq->nr_running != 0);
7715 calc_global_load_remove(rq); 7715 calc_global_load_remove(rq);
7716 /* 7716 /*
7717 * No need to migrate the tasks: it was best-effort if 7717 * No need to migrate the tasks: it was best-effort if
7718 * they didn't take sched_hotcpu_mutex. Just wake up 7718 * they didn't take sched_hotcpu_mutex. Just wake up
7719 * the requestors. 7719 * the requestors.
7720 */ 7720 */
7721 spin_lock_irq(&rq->lock); 7721 spin_lock_irq(&rq->lock);
7722 while (!list_empty(&rq->migration_queue)) { 7722 while (!list_empty(&rq->migration_queue)) {
7723 struct migration_req *req; 7723 struct migration_req *req;
7724 7724
7725 req = list_entry(rq->migration_queue.next, 7725 req = list_entry(rq->migration_queue.next,
7726 struct migration_req, list); 7726 struct migration_req, list);
7727 list_del_init(&req->list); 7727 list_del_init(&req->list);
7728 spin_unlock_irq(&rq->lock); 7728 spin_unlock_irq(&rq->lock);
7729 complete(&req->done); 7729 complete(&req->done);
7730 spin_lock_irq(&rq->lock); 7730 spin_lock_irq(&rq->lock);
7731 } 7731 }
7732 spin_unlock_irq(&rq->lock); 7732 spin_unlock_irq(&rq->lock);
7733 break; 7733 break;
7734 7734
7735 case CPU_DYING: 7735 case CPU_DYING:
7736 case CPU_DYING_FROZEN: 7736 case CPU_DYING_FROZEN:
7737 /* Update our root-domain */ 7737 /* Update our root-domain */
7738 rq = cpu_rq(cpu); 7738 rq = cpu_rq(cpu);
7739 spin_lock_irqsave(&rq->lock, flags); 7739 spin_lock_irqsave(&rq->lock, flags);
7740 if (rq->rd) { 7740 if (rq->rd) {
7741 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7741 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7742 set_rq_offline(rq); 7742 set_rq_offline(rq);
7743 } 7743 }
7744 spin_unlock_irqrestore(&rq->lock, flags); 7744 spin_unlock_irqrestore(&rq->lock, flags);
7745 break; 7745 break;
7746 #endif 7746 #endif
7747 } 7747 }
7748 return NOTIFY_OK; 7748 return NOTIFY_OK;
7749 } 7749 }
7750 7750
7751 /* 7751 /*
7752 * Register at high priority so that task migration (migrate_all_tasks) 7752 * Register at high priority so that task migration (migrate_all_tasks)
7753 * happens before everything else. This has to be lower priority than 7753 * happens before everything else. This has to be lower priority than
7754 * the notifier in the perf_event subsystem, though. 7754 * the notifier in the perf_event subsystem, though.
7755 */ 7755 */
7756 static struct notifier_block __cpuinitdata migration_notifier = { 7756 static struct notifier_block __cpuinitdata migration_notifier = {
7757 .notifier_call = migration_call, 7757 .notifier_call = migration_call,
7758 .priority = 10 7758 .priority = 10
7759 }; 7759 };
7760 7760
7761 static int __init migration_init(void) 7761 static int __init migration_init(void)
7762 { 7762 {
7763 void *cpu = (void *)(long)smp_processor_id(); 7763 void *cpu = (void *)(long)smp_processor_id();
7764 int err; 7764 int err;
7765 7765
7766 /* Start one for the boot CPU: */ 7766 /* Start one for the boot CPU: */
7767 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 7767 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
7768 BUG_ON(err == NOTIFY_BAD); 7768 BUG_ON(err == NOTIFY_BAD);
7769 migration_call(&migration_notifier, CPU_ONLINE, cpu); 7769 migration_call(&migration_notifier, CPU_ONLINE, cpu);
7770 register_cpu_notifier(&migration_notifier); 7770 register_cpu_notifier(&migration_notifier);
7771 7771
7772 return 0; 7772 return 0;
7773 } 7773 }
7774 early_initcall(migration_init); 7774 early_initcall(migration_init);
7775 #endif 7775 #endif
7776 7776
7777 #ifdef CONFIG_SMP 7777 #ifdef CONFIG_SMP
7778 7778
7779 #ifdef CONFIG_SCHED_DEBUG 7779 #ifdef CONFIG_SCHED_DEBUG
7780 7780
7781 static __read_mostly int sched_domain_debug_enabled; 7781 static __read_mostly int sched_domain_debug_enabled;
7782 7782
7783 static int __init sched_domain_debug_setup(char *str) 7783 static int __init sched_domain_debug_setup(char *str)
7784 { 7784 {
7785 sched_domain_debug_enabled = 1; 7785 sched_domain_debug_enabled = 1;
7786 7786
7787 return 0; 7787 return 0;
7788 } 7788 }
7789 early_param("sched_debug", sched_domain_debug_setup); 7789 early_param("sched_debug", sched_domain_debug_setup);
7790 7790
7791 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 7791 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7792 struct cpumask *groupmask) 7792 struct cpumask *groupmask)
7793 { 7793 {
7794 struct sched_group *group = sd->groups; 7794 struct sched_group *group = sd->groups;
7795 char str[256]; 7795 char str[256];
7796 7796
7797 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); 7797 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
7798 cpumask_clear(groupmask); 7798 cpumask_clear(groupmask);
7799 7799
7800 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 7800 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
7801 7801
7802 if (!(sd->flags & SD_LOAD_BALANCE)) { 7802 if (!(sd->flags & SD_LOAD_BALANCE)) {
7803 pr_cont("does not load-balance\n"); 7803 pr_cont("does not load-balance\n");
7804 if (sd->parent) 7804 if (sd->parent)
7805 pr_err("ERROR: !SD_LOAD_BALANCE domain has parent\n"); 7805 pr_err("ERROR: !SD_LOAD_BALANCE domain has parent\n");
7806 return -1; 7806 return -1;
7807 } 7807 }
7808 7808
7809 pr_cont("span %s level %s\n", str, sd->name); 7809 pr_cont("span %s level %s\n", str, sd->name);
7810 7810
7811 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 7811 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
7812 pr_err("ERROR: domain->span does not contain CPU%d\n", cpu); 7812 pr_err("ERROR: domain->span does not contain CPU%d\n", cpu);
7813 } 7813 }
7814 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 7814 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
7815 pr_err("ERROR: domain->groups does not contain CPU%d\n", cpu); 7815 pr_err("ERROR: domain->groups does not contain CPU%d\n", cpu);
7816 } 7816 }
7817 7817
7818 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 7818 printk(KERN_DEBUG "%*s groups:", level + 1, "");
7819 do { 7819 do {
7820 if (!group) { 7820 if (!group) {
7821 pr_cont("\n"); 7821 pr_cont("\n");
7822 pr_err("ERROR: group is NULL\n"); 7822 pr_err("ERROR: group is NULL\n");
7823 break; 7823 break;
7824 } 7824 }
7825 7825
7826 if (!group->cpu_power) { 7826 if (!group->cpu_power) {
7827 pr_cont("\n"); 7827 pr_cont("\n");
7828 pr_err("ERROR: domain->cpu_power not set\n"); 7828 pr_err("ERROR: domain->cpu_power not set\n");
7829 break; 7829 break;
7830 } 7830 }
7831 7831
7832 if (!cpumask_weight(sched_group_cpus(group))) { 7832 if (!cpumask_weight(sched_group_cpus(group))) {
7833 pr_cont("\n"); 7833 pr_cont("\n");
7834 pr_err("ERROR: empty group\n"); 7834 pr_err("ERROR: empty group\n");
7835 break; 7835 break;
7836 } 7836 }
7837 7837
7838 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 7838 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
7839 pr_cont("\n"); 7839 pr_cont("\n");
7840 pr_err("ERROR: repeated CPUs\n"); 7840 pr_err("ERROR: repeated CPUs\n");
7841 break; 7841 break;
7842 } 7842 }
7843 7843
7844 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 7844 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
7845 7845
7846 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 7846 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
7847 7847
7848 pr_cont(" %s", str); 7848 pr_cont(" %s", str);
7849 if (group->cpu_power != SCHED_LOAD_SCALE) { 7849 if (group->cpu_power != SCHED_LOAD_SCALE) {
7850 pr_cont(" (cpu_power = %d)", group->cpu_power); 7850 pr_cont(" (cpu_power = %d)", group->cpu_power);
7851 } 7851 }
7852 7852
7853 group = group->next; 7853 group = group->next;
7854 } while (group != sd->groups); 7854 } while (group != sd->groups);
7855 pr_cont("\n"); 7855 pr_cont("\n");
7856 7856
7857 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 7857 if (!cpumask_equal(sched_domain_span(sd), groupmask))
7858 pr_err("ERROR: groups don't span domain->span\n"); 7858 pr_err("ERROR: groups don't span domain->span\n");
7859 7859
7860 if (sd->parent && 7860 if (sd->parent &&
7861 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 7861 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
7862 pr_err("ERROR: parent span is not a superset of domain->span\n"); 7862 pr_err("ERROR: parent span is not a superset of domain->span\n");
7863 return 0; 7863 return 0;
7864 } 7864 }
7865 7865
7866 static void sched_domain_debug(struct sched_domain *sd, int cpu) 7866 static void sched_domain_debug(struct sched_domain *sd, int cpu)
7867 { 7867 {
7868 cpumask_var_t groupmask; 7868 cpumask_var_t groupmask;
7869 int level = 0; 7869 int level = 0;
7870 7870
7871 if (!sched_domain_debug_enabled) 7871 if (!sched_domain_debug_enabled)
7872 return; 7872 return;
7873 7873
7874 if (!sd) { 7874 if (!sd) {
7875 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 7875 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
7876 return; 7876 return;
7877 } 7877 }
7878 7878
7879 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 7879 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
7880 7880
7881 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { 7881 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
7882 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); 7882 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
7883 return; 7883 return;
7884 } 7884 }
7885 7885
7886 for (;;) { 7886 for (;;) {
7887 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 7887 if (sched_domain_debug_one(sd, cpu, level, groupmask))
7888 break; 7888 break;
7889 level++; 7889 level++;
7890 sd = sd->parent; 7890 sd = sd->parent;
7891 if (!sd) 7891 if (!sd)
7892 break; 7892 break;
7893 } 7893 }
7894 free_cpumask_var(groupmask); 7894 free_cpumask_var(groupmask);
7895 } 7895 }
7896 #else /* !CONFIG_SCHED_DEBUG */ 7896 #else /* !CONFIG_SCHED_DEBUG */
7897 # define sched_domain_debug(sd, cpu) do { } while (0) 7897 # define sched_domain_debug(sd, cpu) do { } while (0)
7898 #endif /* CONFIG_SCHED_DEBUG */ 7898 #endif /* CONFIG_SCHED_DEBUG */
7899 7899
7900 static int sd_degenerate(struct sched_domain *sd) 7900 static int sd_degenerate(struct sched_domain *sd)
7901 { 7901 {
7902 if (cpumask_weight(sched_domain_span(sd)) == 1) 7902 if (cpumask_weight(sched_domain_span(sd)) == 1)
7903 return 1; 7903 return 1;
7904 7904
7905 /* Following flags need at least 2 groups */ 7905 /* Following flags need at least 2 groups */
7906 if (sd->flags & (SD_LOAD_BALANCE | 7906 if (sd->flags & (SD_LOAD_BALANCE |
7907 SD_BALANCE_NEWIDLE | 7907 SD_BALANCE_NEWIDLE |
7908 SD_BALANCE_FORK | 7908 SD_BALANCE_FORK |
7909 SD_BALANCE_EXEC | 7909 SD_BALANCE_EXEC |
7910 SD_SHARE_CPUPOWER | 7910 SD_SHARE_CPUPOWER |
7911 SD_SHARE_PKG_RESOURCES)) { 7911 SD_SHARE_PKG_RESOURCES)) {
7912 if (sd->groups != sd->groups->next) 7912 if (sd->groups != sd->groups->next)
7913 return 0; 7913 return 0;
7914 } 7914 }
7915 7915
7916 /* Following flags don't use groups */ 7916 /* Following flags don't use groups */
7917 if (sd->flags & (SD_WAKE_AFFINE)) 7917 if (sd->flags & (SD_WAKE_AFFINE))
7918 return 0; 7918 return 0;
7919 7919
7920 return 1; 7920 return 1;
7921 } 7921 }
7922 7922
7923 static int 7923 static int
7924 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 7924 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7925 { 7925 {
7926 unsigned long cflags = sd->flags, pflags = parent->flags; 7926 unsigned long cflags = sd->flags, pflags = parent->flags;
7927 7927
7928 if (sd_degenerate(parent)) 7928 if (sd_degenerate(parent))
7929 return 1; 7929 return 1;
7930 7930
7931 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 7931 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
7932 return 0; 7932 return 0;
7933 7933
7934 /* Flags needing groups don't count if only 1 group in parent */ 7934 /* Flags needing groups don't count if only 1 group in parent */
7935 if (parent->groups == parent->groups->next) { 7935 if (parent->groups == parent->groups->next) {
7936 pflags &= ~(SD_LOAD_BALANCE | 7936 pflags &= ~(SD_LOAD_BALANCE |
7937 SD_BALANCE_NEWIDLE | 7937 SD_BALANCE_NEWIDLE |
7938 SD_BALANCE_FORK | 7938 SD_BALANCE_FORK |
7939 SD_BALANCE_EXEC | 7939 SD_BALANCE_EXEC |
7940 SD_SHARE_CPUPOWER | 7940 SD_SHARE_CPUPOWER |
7941 SD_SHARE_PKG_RESOURCES); 7941 SD_SHARE_PKG_RESOURCES);
7942 if (nr_node_ids == 1) 7942 if (nr_node_ids == 1)
7943 pflags &= ~SD_SERIALIZE; 7943 pflags &= ~SD_SERIALIZE;
7944 } 7944 }
7945 if (~cflags & pflags) 7945 if (~cflags & pflags)
7946 return 0; 7946 return 0;
7947 7947
7948 return 1; 7948 return 1;
7949 } 7949 }
7950 7950
7951 static void free_rootdomain(struct root_domain *rd) 7951 static void free_rootdomain(struct root_domain *rd)
7952 { 7952 {
7953 synchronize_sched(); 7953 synchronize_sched();
7954 7954
7955 cpupri_cleanup(&rd->cpupri); 7955 cpupri_cleanup(&rd->cpupri);
7956 7956
7957 free_cpumask_var(rd->rto_mask); 7957 free_cpumask_var(rd->rto_mask);
7958 free_cpumask_var(rd->online); 7958 free_cpumask_var(rd->online);
7959 free_cpumask_var(rd->span); 7959 free_cpumask_var(rd->span);
7960 kfree(rd); 7960 kfree(rd);
7961 } 7961 }
7962 7962
7963 static void rq_attach_root(struct rq *rq, struct root_domain *rd) 7963 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7964 { 7964 {
7965 struct root_domain *old_rd = NULL; 7965 struct root_domain *old_rd = NULL;
7966 unsigned long flags; 7966 unsigned long flags;
7967 7967
7968 spin_lock_irqsave(&rq->lock, flags); 7968 spin_lock_irqsave(&rq->lock, flags);
7969 7969
7970 if (rq->rd) { 7970 if (rq->rd) {
7971 old_rd = rq->rd; 7971 old_rd = rq->rd;
7972 7972
7973 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 7973 if (cpumask_test_cpu(rq->cpu, old_rd->online))
7974 set_rq_offline(rq); 7974 set_rq_offline(rq);
7975 7975
7976 cpumask_clear_cpu(rq->cpu, old_rd->span); 7976 cpumask_clear_cpu(rq->cpu, old_rd->span);
7977 7977
7978 /* 7978 /*
7979 * If we don't want to free the old_rd yet then 7979 * If we don't want to free the old_rd yet then
7980 * set old_rd to NULL to skip the freeing later 7980 * set old_rd to NULL to skip the freeing later
7981 * in this function: 7981 * in this function:
7982 */ 7982 */
7983 if (!atomic_dec_and_test(&old_rd->refcount)) 7983 if (!atomic_dec_and_test(&old_rd->refcount))
7984 old_rd = NULL; 7984 old_rd = NULL;
7985 } 7985 }
7986 7986
7987 atomic_inc(&rd->refcount); 7987 atomic_inc(&rd->refcount);
7988 rq->rd = rd; 7988 rq->rd = rd;
7989 7989
7990 cpumask_set_cpu(rq->cpu, rd->span); 7990 cpumask_set_cpu(rq->cpu, rd->span);
7991 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 7991 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
7992 set_rq_online(rq); 7992 set_rq_online(rq);
7993 7993
7994 spin_unlock_irqrestore(&rq->lock, flags); 7994 spin_unlock_irqrestore(&rq->lock, flags);
7995 7995
7996 if (old_rd) 7996 if (old_rd)
7997 free_rootdomain(old_rd); 7997 free_rootdomain(old_rd);
7998 } 7998 }
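
The old_rd handling in rq_attach_root() above is the usual deferred-free refcount pattern: the previous root domain is only freed once the last runqueue has dropped its reference, and free_rootdomain() additionally waits for a grace period. A minimal userspace sketch of the put side of that pattern, using C11 atomics rather than the kernel's atomic_t (illustrative names, not scheduler code):

#include <stdatomic.h>
#include <stdlib.h>

struct refcounted {
	atomic_int refcount;
	/* payload would live here */
};

static void obj_get(struct refcounted *o)
{
	atomic_fetch_add(&o->refcount, 1);
}

static void obj_put(struct refcounted *o)
{
	/*
	 * Mirrors atomic_dec_and_test() + free_rootdomain(): only the
	 * caller that drops the final reference performs the free.
	 */
	if (atomic_fetch_sub(&o->refcount, 1) == 1)
		free(o);
}
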
7999 7999
8000 static int init_rootdomain(struct root_domain *rd, bool bootmem) 8000 static int init_rootdomain(struct root_domain *rd, bool bootmem)
8001 { 8001 {
8002 gfp_t gfp = GFP_KERNEL; 8002 gfp_t gfp = GFP_KERNEL;
8003 8003
8004 memset(rd, 0, sizeof(*rd)); 8004 memset(rd, 0, sizeof(*rd));
8005 8005
8006 if (bootmem) 8006 if (bootmem)
8007 gfp = GFP_NOWAIT; 8007 gfp = GFP_NOWAIT;
8008 8008
8009 if (!alloc_cpumask_var(&rd->span, gfp)) 8009 if (!alloc_cpumask_var(&rd->span, gfp))
8010 goto out; 8010 goto out;
8011 if (!alloc_cpumask_var(&rd->online, gfp)) 8011 if (!alloc_cpumask_var(&rd->online, gfp))
8012 goto free_span; 8012 goto free_span;
8013 if (!alloc_cpumask_var(&rd->rto_mask, gfp)) 8013 if (!alloc_cpumask_var(&rd->rto_mask, gfp))
8014 goto free_online; 8014 goto free_online;
8015 8015
8016 if (cpupri_init(&rd->cpupri, bootmem) != 0) 8016 if (cpupri_init(&rd->cpupri, bootmem) != 0)
8017 goto free_rto_mask; 8017 goto free_rto_mask;
8018 return 0; 8018 return 0;
8019 8019
8020 free_rto_mask: 8020 free_rto_mask:
8021 free_cpumask_var(rd->rto_mask); 8021 free_cpumask_var(rd->rto_mask);
8022 free_online: 8022 free_online:
8023 free_cpumask_var(rd->online); 8023 free_cpumask_var(rd->online);
8024 free_span: 8024 free_span:
8025 free_cpumask_var(rd->span); 8025 free_cpumask_var(rd->span);
8026 out: 8026 out:
8027 return -ENOMEM; 8027 return -ENOMEM;
8028 } 8028 }
8029 8029
8030 static void init_defrootdomain(void) 8030 static void init_defrootdomain(void)
8031 { 8031 {
8032 init_rootdomain(&def_root_domain, true); 8032 init_rootdomain(&def_root_domain, true);
8033 8033
8034 atomic_set(&def_root_domain.refcount, 1); 8034 atomic_set(&def_root_domain.refcount, 1);
8035 } 8035 }
8036 8036
8037 static struct root_domain *alloc_rootdomain(void) 8037 static struct root_domain *alloc_rootdomain(void)
8038 { 8038 {
8039 struct root_domain *rd; 8039 struct root_domain *rd;
8040 8040
8041 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 8041 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
8042 if (!rd) 8042 if (!rd)
8043 return NULL; 8043 return NULL;
8044 8044
8045 if (init_rootdomain(rd, false) != 0) { 8045 if (init_rootdomain(rd, false) != 0) {
8046 kfree(rd); 8046 kfree(rd);
8047 return NULL; 8047 return NULL;
8048 } 8048 }
8049 8049
8050 return rd; 8050 return rd;
8051 } 8051 }
8052 8052
8053 /* 8053 /*
8054 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 8054 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
8055 * hold the hotplug lock. 8055 * hold the hotplug lock.
8056 */ 8056 */
8057 static void 8057 static void
8058 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 8058 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
8059 { 8059 {
8060 struct rq *rq = cpu_rq(cpu); 8060 struct rq *rq = cpu_rq(cpu);
8061 struct sched_domain *tmp; 8061 struct sched_domain *tmp;
8062 8062
8063 /* Remove the sched domains which do not contribute to scheduling. */ 8063 /* Remove the sched domains which do not contribute to scheduling. */
8064 for (tmp = sd; tmp; ) { 8064 for (tmp = sd; tmp; ) {
8065 struct sched_domain *parent = tmp->parent; 8065 struct sched_domain *parent = tmp->parent;
8066 if (!parent) 8066 if (!parent)
8067 break; 8067 break;
8068 8068
8069 if (sd_parent_degenerate(tmp, parent)) { 8069 if (sd_parent_degenerate(tmp, parent)) {
8070 tmp->parent = parent->parent; 8070 tmp->parent = parent->parent;
8071 if (parent->parent) 8071 if (parent->parent)
8072 parent->parent->child = tmp; 8072 parent->parent->child = tmp;
8073 } else 8073 } else
8074 tmp = tmp->parent; 8074 tmp = tmp->parent;
8075 } 8075 }
8076 8076
8077 if (sd && sd_degenerate(sd)) { 8077 if (sd && sd_degenerate(sd)) {
8078 sd = sd->parent; 8078 sd = sd->parent;
8079 if (sd) 8079 if (sd)
8080 sd->child = NULL; 8080 sd->child = NULL;
8081 } 8081 }
8082 8082
8083 sched_domain_debug(sd, cpu); 8083 sched_domain_debug(sd, cpu);
8084 8084
8085 rq_attach_root(rq, rd); 8085 rq_attach_root(rq, rd);
8086 rcu_assign_pointer(rq->sd, sd); 8086 rcu_assign_pointer(rq->sd, sd);
8087 } 8087 }
8088 8088
8089 /* cpus with isolated domains */ 8089 /* cpus with isolated domains */
8090 static cpumask_var_t cpu_isolated_map; 8090 static cpumask_var_t cpu_isolated_map;
8091 8091
8092 /* Setup the mask of cpus configured for isolated domains */ 8092 /* Setup the mask of cpus configured for isolated domains */
8093 static int __init isolated_cpu_setup(char *str) 8093 static int __init isolated_cpu_setup(char *str)
8094 { 8094 {
8095 alloc_bootmem_cpumask_var(&cpu_isolated_map); 8095 alloc_bootmem_cpumask_var(&cpu_isolated_map);
8096 cpulist_parse(str, cpu_isolated_map); 8096 cpulist_parse(str, cpu_isolated_map);
8097 return 1; 8097 return 1;
8098 } 8098 }
8099 8099
8100 __setup("isolcpus=", isolated_cpu_setup); 8100 __setup("isolcpus=", isolated_cpu_setup);
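
For reference, the boot parameter takes the usual comma-separated CPU list with ranges, e.g. isolcpus=1,3-5. A rough userspace model of the list parsing done here (toy code, capped at 64 CPUs and without the validation the real cpulist_parse() performs):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static unsigned long parse_cpulist(const char *str)
{
	unsigned long mask = 0;
	char buf[128], *tok, *save = NULL;

	strncpy(buf, str, sizeof(buf) - 1);
	buf[sizeof(buf) - 1] = '\0';

	for (tok = strtok_r(buf, ",", &save); tok;
	     tok = strtok_r(NULL, ",", &save)) {
		unsigned int lo, hi;

		if (sscanf(tok, "%u-%u", &lo, &hi) != 2)
			lo = hi = (unsigned int)strtoul(tok, NULL, 10);
		for (unsigned int c = lo; c <= hi && c < 64; c++)
			mask |= 1UL << c;	/* set each listed CPU */
	}
	return mask;
}
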
8101 8101
8102 /* 8102 /*
8103 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 8103 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
8104 * to a function which identifies what group (along with sched group) a CPU 8104 * to a function which identifies what group (along with sched group) a CPU
8105 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids 8105 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
8106 * (due to the fact that we keep track of groups covered with a struct cpumask). 8106 * (due to the fact that we keep track of groups covered with a struct cpumask).
8107 * 8107 *
8108 * init_sched_build_groups will build a circular linked list of the groups 8108 * init_sched_build_groups will build a circular linked list of the groups
8109 * covered by the given span, and will set each group's ->cpumask correctly, 8109 * covered by the given span, and will set each group's ->cpumask correctly,
8110 * and ->cpu_power to 0. 8110 * and ->cpu_power to 0.
8111 */ 8111 */
8112 static void 8112 static void
8113 init_sched_build_groups(const struct cpumask *span, 8113 init_sched_build_groups(const struct cpumask *span,
8114 const struct cpumask *cpu_map, 8114 const struct cpumask *cpu_map,
8115 int (*group_fn)(int cpu, const struct cpumask *cpu_map, 8115 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
8116 struct sched_group **sg, 8116 struct sched_group **sg,
8117 struct cpumask *tmpmask), 8117 struct cpumask *tmpmask),
8118 struct cpumask *covered, struct cpumask *tmpmask) 8118 struct cpumask *covered, struct cpumask *tmpmask)
8119 { 8119 {
8120 struct sched_group *first = NULL, *last = NULL; 8120 struct sched_group *first = NULL, *last = NULL;
8121 int i; 8121 int i;
8122 8122
8123 cpumask_clear(covered); 8123 cpumask_clear(covered);
8124 8124
8125 for_each_cpu(i, span) { 8125 for_each_cpu(i, span) {
8126 struct sched_group *sg; 8126 struct sched_group *sg;
8127 int group = group_fn(i, cpu_map, &sg, tmpmask); 8127 int group = group_fn(i, cpu_map, &sg, tmpmask);
8128 int j; 8128 int j;
8129 8129
8130 if (cpumask_test_cpu(i, covered)) 8130 if (cpumask_test_cpu(i, covered))
8131 continue; 8131 continue;
8132 8132
8133 cpumask_clear(sched_group_cpus(sg)); 8133 cpumask_clear(sched_group_cpus(sg));
8134 sg->cpu_power = 0; 8134 sg->cpu_power = 0;
8135 8135
8136 for_each_cpu(j, span) { 8136 for_each_cpu(j, span) {
8137 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 8137 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
8138 continue; 8138 continue;
8139 8139
8140 cpumask_set_cpu(j, covered); 8140 cpumask_set_cpu(j, covered);
8141 cpumask_set_cpu(j, sched_group_cpus(sg)); 8141 cpumask_set_cpu(j, sched_group_cpus(sg));
8142 } 8142 }
8143 if (!first) 8143 if (!first)
8144 first = sg; 8144 first = sg;
8145 if (last) 8145 if (last)
8146 last->next = sg; 8146 last->next = sg;
8147 last = sg; 8147 last = sg;
8148 } 8148 }
8149 last->next = first; 8149 last->next = first;
8150 } 8150 }
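
The first/last bookkeeping above is the standard single-pass construction of a circular singly linked list: remember the first element, chain each new one behind the previous, then close the ring at the end. The same idiom in isolation (toy struct, assumes at least one element):

struct ring_node {
	struct ring_node *next;
	int id;
};

/* Link n pre-allocated nodes into a ring, as the loop above does. */
static void link_ring(struct ring_node *nodes, int n)
{
	struct ring_node *first = NULL, *last = NULL;
	int i;

	for (i = 0; i < n; i++) {	/* n must be > 0 */
		if (!first)
			first = &nodes[i];
		if (last)
			last->next = &nodes[i];
		last = &nodes[i];
	}
	last->next = first;		/* close the circle */
}
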
8151 8151
8152 #define SD_NODES_PER_DOMAIN 16 8152 #define SD_NODES_PER_DOMAIN 16
8153 8153
8154 #ifdef CONFIG_NUMA 8154 #ifdef CONFIG_NUMA
8155 8155
8156 /** 8156 /**
8157 * find_next_best_node - find the next node to include in a sched_domain 8157 * find_next_best_node - find the next node to include in a sched_domain
8158 * @node: node whose sched_domain we're building 8158 * @node: node whose sched_domain we're building
8159 * @used_nodes: nodes already in the sched_domain 8159 * @used_nodes: nodes already in the sched_domain
8160 * 8160 *
8161 * Find the next node to include in a given scheduling domain. Simply 8161 * Find the next node to include in a given scheduling domain. Simply
8162 * finds the closest node not already in the @used_nodes map. 8162 * finds the closest node not already in the @used_nodes map.
8163 * 8163 *
8164 * Should use nodemask_t. 8164 * Should use nodemask_t.
8165 */ 8165 */
8166 static int find_next_best_node(int node, nodemask_t *used_nodes) 8166 static int find_next_best_node(int node, nodemask_t *used_nodes)
8167 { 8167 {
8168 int i, n, val, min_val, best_node = 0; 8168 int i, n, val, min_val, best_node = 0;
8169 8169
8170 min_val = INT_MAX; 8170 min_val = INT_MAX;
8171 8171
8172 for (i = 0; i < nr_node_ids; i++) { 8172 for (i = 0; i < nr_node_ids; i++) {
8173 /* Start at @node */ 8173 /* Start at @node */
8174 n = (node + i) % nr_node_ids; 8174 n = (node + i) % nr_node_ids;
8175 8175
8176 if (!nr_cpus_node(n)) 8176 if (!nr_cpus_node(n))
8177 continue; 8177 continue;
8178 8178
8179 /* Skip already used nodes */ 8179 /* Skip already used nodes */
8180 if (node_isset(n, *used_nodes)) 8180 if (node_isset(n, *used_nodes))
8181 continue; 8181 continue;
8182 8182
8183 /* Simple min distance search */ 8183 /* Simple min distance search */
8184 val = node_distance(node, n); 8184 val = node_distance(node, n);
8185 8185
8186 if (val < min_val) { 8186 if (val < min_val) {
8187 min_val = val; 8187 min_val = val;
8188 best_node = n; 8188 best_node = n;
8189 } 8189 }
8190 } 8190 }
8191 8191
8192 node_set(best_node, *used_nodes); 8192 node_set(best_node, *used_nodes);
8193 return best_node; 8193 return best_node;
8194 } 8194 }
8195 8195
8196 /** 8196 /**
8197 * sched_domain_node_span - get a cpumask for a node's sched_domain 8197 * sched_domain_node_span - get a cpumask for a node's sched_domain
8198 * @node: node whose cpumask we're constructing 8198 * @node: node whose cpumask we're constructing
8199 * @span: resulting cpumask 8199 * @span: resulting cpumask
8200 * 8200 *
8201 * Given a node, construct a good cpumask for its sched_domain to span. It 8201 * Given a node, construct a good cpumask for its sched_domain to span. It
8202 * should be one that prevents unnecessary balancing, but also spreads tasks 8202 * should be one that prevents unnecessary balancing, but also spreads tasks
8203 * out optimally. 8203 * out optimally.
8204 */ 8204 */
8205 static void sched_domain_node_span(int node, struct cpumask *span) 8205 static void sched_domain_node_span(int node, struct cpumask *span)
8206 { 8206 {
8207 nodemask_t used_nodes; 8207 nodemask_t used_nodes;
8208 int i; 8208 int i;
8209 8209
8210 cpumask_clear(span); 8210 cpumask_clear(span);
8211 nodes_clear(used_nodes); 8211 nodes_clear(used_nodes);
8212 8212
8213 cpumask_or(span, span, cpumask_of_node(node)); 8213 cpumask_or(span, span, cpumask_of_node(node));
8214 node_set(node, used_nodes); 8214 node_set(node, used_nodes);
8215 8215
8216 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 8216 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
8217 int next_node = find_next_best_node(node, &used_nodes); 8217 int next_node = find_next_best_node(node, &used_nodes);
8218 8218
8219 cpumask_or(span, span, cpumask_of_node(next_node)); 8219 cpumask_or(span, span, cpumask_of_node(next_node));
8220 } 8220 }
8221 } 8221 }
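
The span is grown greedily: starting from @node, find_next_best_node() repeatedly returns the closest node that is not yet in used_nodes, and that node's CPUs are OR-ed into the span until SD_NODES_PER_DOMAIN nodes are covered. A compact userspace model of the nearest-unused-node selection (distance table supplied by the caller; names are illustrative):

#include <limits.h>
#include <stdbool.h>

#define MAX_NODES 16

static int next_best_node(int node, bool used[], int nr_nodes,
			  const int dist[][MAX_NODES])
{
	int best = 0, best_val = INT_MAX;

	for (int n = 0; n < nr_nodes; n++) {
		if (used[n])
			continue;		/* already in the span */
		if (dist[node][n] < best_val) {
			best_val = dist[node][n];
			best = n;
		}
	}
	used[best] = true;
	return best;
}
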
8222 #endif /* CONFIG_NUMA */ 8222 #endif /* CONFIG_NUMA */
8223 8223
8224 int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 8224 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
8225 8225
8226 /* 8226 /*
8227 * The cpus mask in sched_group and sched_domain hangs off the end. 8227 * The cpus mask in sched_group and sched_domain hangs off the end.
8228 * 8228 *
8229 * ( See the comments in include/linux/sched.h:struct sched_group 8229 * ( See the comments in include/linux/sched.h:struct sched_group
8230 * and struct sched_domain. ) 8230 * and struct sched_domain. )
8231 */ 8231 */
8232 struct static_sched_group { 8232 struct static_sched_group {
8233 struct sched_group sg; 8233 struct sched_group sg;
8234 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); 8234 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
8235 }; 8235 };
8236 8236
8237 struct static_sched_domain { 8237 struct static_sched_domain {
8238 struct sched_domain sd; 8238 struct sched_domain sd;
8239 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 8239 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
8240 }; 8240 };
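
"Hangs off the end" means the variable-sized CPU mask lives in the same allocation as the structure, directly behind it; the static variants above simply reserve CONFIG_NR_CPUS bits, while the dynamically allocated node groups later in this file kmalloc sizeof(struct sched_group) + cpumask_size(). The underlying idiom, shown in plain C with a flexible array member (toy struct):

#include <stdlib.h>
#include <string.h>

struct group {
	struct group *next;
	unsigned int power;
	unsigned long cpus[];	/* CPU mask stored right behind the struct */
};

static struct group *alloc_group(size_t mask_bytes)
{
	/* one allocation covers the struct and its trailing mask */
	struct group *g = malloc(sizeof(*g) + mask_bytes);

	if (g)
		memset(g, 0, sizeof(*g) + mask_bytes);
	return g;
}
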
8241 8241
8242 struct s_data { 8242 struct s_data {
8243 #ifdef CONFIG_NUMA 8243 #ifdef CONFIG_NUMA
8244 int sd_allnodes; 8244 int sd_allnodes;
8245 cpumask_var_t domainspan; 8245 cpumask_var_t domainspan;
8246 cpumask_var_t covered; 8246 cpumask_var_t covered;
8247 cpumask_var_t notcovered; 8247 cpumask_var_t notcovered;
8248 #endif 8248 #endif
8249 cpumask_var_t nodemask; 8249 cpumask_var_t nodemask;
8250 cpumask_var_t this_sibling_map; 8250 cpumask_var_t this_sibling_map;
8251 cpumask_var_t this_core_map; 8251 cpumask_var_t this_core_map;
8252 cpumask_var_t send_covered; 8252 cpumask_var_t send_covered;
8253 cpumask_var_t tmpmask; 8253 cpumask_var_t tmpmask;
8254 struct sched_group **sched_group_nodes; 8254 struct sched_group **sched_group_nodes;
8255 struct root_domain *rd; 8255 struct root_domain *rd;
8256 }; 8256 };
8257 8257
8258 enum s_alloc { 8258 enum s_alloc {
8259 sa_sched_groups = 0, 8259 sa_sched_groups = 0,
8260 sa_rootdomain, 8260 sa_rootdomain,
8261 sa_tmpmask, 8261 sa_tmpmask,
8262 sa_send_covered, 8262 sa_send_covered,
8263 sa_this_core_map, 8263 sa_this_core_map,
8264 sa_this_sibling_map, 8264 sa_this_sibling_map,
8265 sa_nodemask, 8265 sa_nodemask,
8266 sa_sched_group_nodes, 8266 sa_sched_group_nodes,
8267 #ifdef CONFIG_NUMA 8267 #ifdef CONFIG_NUMA
8268 sa_notcovered, 8268 sa_notcovered,
8269 sa_covered, 8269 sa_covered,
8270 sa_domainspan, 8270 sa_domainspan,
8271 #endif 8271 #endif
8272 sa_none, 8272 sa_none,
8273 }; 8273 };
8274 8274
8275 /* 8275 /*
8276 * SMT sched-domains: 8276 * SMT sched-domains:
8277 */ 8277 */
8278 #ifdef CONFIG_SCHED_SMT 8278 #ifdef CONFIG_SCHED_SMT
8279 static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); 8279 static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
8280 static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); 8280 static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
8281 8281
8282 static int 8282 static int
8283 cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 8283 cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
8284 struct sched_group **sg, struct cpumask *unused) 8284 struct sched_group **sg, struct cpumask *unused)
8285 { 8285 {
8286 if (sg) 8286 if (sg)
8287 *sg = &per_cpu(sched_group_cpus, cpu).sg; 8287 *sg = &per_cpu(sched_group_cpus, cpu).sg;
8288 return cpu; 8288 return cpu;
8289 } 8289 }
8290 #endif /* CONFIG_SCHED_SMT */ 8290 #endif /* CONFIG_SCHED_SMT */
8291 8291
8292 /* 8292 /*
8293 * multi-core sched-domains: 8293 * multi-core sched-domains:
8294 */ 8294 */
8295 #ifdef CONFIG_SCHED_MC 8295 #ifdef CONFIG_SCHED_MC
8296 static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 8296 static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
8297 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 8297 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
8298 #endif /* CONFIG_SCHED_MC */ 8298 #endif /* CONFIG_SCHED_MC */
8299 8299
8300 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 8300 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
8301 static int 8301 static int
8302 cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 8302 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
8303 struct sched_group **sg, struct cpumask *mask) 8303 struct sched_group **sg, struct cpumask *mask)
8304 { 8304 {
8305 int group; 8305 int group;
8306 8306
8307 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 8307 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
8308 group = cpumask_first(mask); 8308 group = cpumask_first(mask);
8309 if (sg) 8309 if (sg)
8310 *sg = &per_cpu(sched_group_core, group).sg; 8310 *sg = &per_cpu(sched_group_core, group).sg;
8311 return group; 8311 return group;
8312 } 8312 }
8313 #elif defined(CONFIG_SCHED_MC) 8313 #elif defined(CONFIG_SCHED_MC)
8314 static int 8314 static int
8315 cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 8315 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
8316 struct sched_group **sg, struct cpumask *unused) 8316 struct sched_group **sg, struct cpumask *unused)
8317 { 8317 {
8318 if (sg) 8318 if (sg)
8319 *sg = &per_cpu(sched_group_core, cpu).sg; 8319 *sg = &per_cpu(sched_group_core, cpu).sg;
8320 return cpu; 8320 return cpu;
8321 } 8321 }
8322 #endif 8322 #endif
8323 8323
8324 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 8324 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
8325 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 8325 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
8326 8326
8327 static int 8327 static int
8328 cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, 8328 cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
8329 struct sched_group **sg, struct cpumask *mask) 8329 struct sched_group **sg, struct cpumask *mask)
8330 { 8330 {
8331 int group; 8331 int group;
8332 #ifdef CONFIG_SCHED_MC 8332 #ifdef CONFIG_SCHED_MC
8333 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 8333 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
8334 group = cpumask_first(mask); 8334 group = cpumask_first(mask);
8335 #elif defined(CONFIG_SCHED_SMT) 8335 #elif defined(CONFIG_SCHED_SMT)
8336 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 8336 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
8337 group = cpumask_first(mask); 8337 group = cpumask_first(mask);
8338 #else 8338 #else
8339 group = cpu; 8339 group = cpu;
8340 #endif 8340 #endif
8341 if (sg) 8341 if (sg)
8342 *sg = &per_cpu(sched_group_phys, group).sg; 8342 *sg = &per_cpu(sched_group_phys, group).sg;
8343 return group; 8343 return group;
8344 } 8344 }
8345 8345
8346 #ifdef CONFIG_NUMA 8346 #ifdef CONFIG_NUMA
8347 /* 8347 /*
8348 * init_sched_build_groups() can't handle what we want to do with node 8348 * init_sched_build_groups() can't handle what we want to do with node
8349 * groups, so roll our own. Now each node has its own list of groups which 8349 * groups, so roll our own. Now each node has its own list of groups which
8350 * gets dynamically allocated. 8350 * gets dynamically allocated.
8351 */ 8351 */
8352 static DEFINE_PER_CPU(struct static_sched_domain, node_domains); 8352 static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
8353 static struct sched_group ***sched_group_nodes_bycpu; 8353 static struct sched_group ***sched_group_nodes_bycpu;
8354 8354
8355 static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); 8355 static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
8356 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); 8356 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
8357 8357
8358 static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, 8358 static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
8359 struct sched_group **sg, 8359 struct sched_group **sg,
8360 struct cpumask *nodemask) 8360 struct cpumask *nodemask)
8361 { 8361 {
8362 int group; 8362 int group;
8363 8363
8364 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); 8364 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
8365 group = cpumask_first(nodemask); 8365 group = cpumask_first(nodemask);
8366 8366
8367 if (sg) 8367 if (sg)
8368 *sg = &per_cpu(sched_group_allnodes, group).sg; 8368 *sg = &per_cpu(sched_group_allnodes, group).sg;
8369 return group; 8369 return group;
8370 } 8370 }
8371 8371
8372 static void init_numa_sched_groups_power(struct sched_group *group_head) 8372 static void init_numa_sched_groups_power(struct sched_group *group_head)
8373 { 8373 {
8374 struct sched_group *sg = group_head; 8374 struct sched_group *sg = group_head;
8375 int j; 8375 int j;
8376 8376
8377 if (!sg) 8377 if (!sg)
8378 return; 8378 return;
8379 do { 8379 do {
8380 for_each_cpu(j, sched_group_cpus(sg)) { 8380 for_each_cpu(j, sched_group_cpus(sg)) {
8381 struct sched_domain *sd; 8381 struct sched_domain *sd;
8382 8382
8383 sd = &per_cpu(phys_domains, j).sd; 8383 sd = &per_cpu(phys_domains, j).sd;
8384 if (j != group_first_cpu(sd->groups)) { 8384 if (j != group_first_cpu(sd->groups)) {
8385 /* 8385 /*
8386 * Only add "power" once for each 8386 * Only add "power" once for each
8387 * physical package. 8387 * physical package.
8388 */ 8388 */
8389 continue; 8389 continue;
8390 } 8390 }
8391 8391
8392 sg->cpu_power += sd->groups->cpu_power; 8392 sg->cpu_power += sd->groups->cpu_power;
8393 } 8393 }
8394 sg = sg->next; 8394 sg = sg->next;
8395 } while (sg != group_head); 8395 } while (sg != group_head);
8396 } 8396 }
8397 8397
8398 static int build_numa_sched_groups(struct s_data *d, 8398 static int build_numa_sched_groups(struct s_data *d,
8399 const struct cpumask *cpu_map, int num) 8399 const struct cpumask *cpu_map, int num)
8400 { 8400 {
8401 struct sched_domain *sd; 8401 struct sched_domain *sd;
8402 struct sched_group *sg, *prev; 8402 struct sched_group *sg, *prev;
8403 int n, j; 8403 int n, j;
8404 8404
8405 cpumask_clear(d->covered); 8405 cpumask_clear(d->covered);
8406 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); 8406 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
8407 if (cpumask_empty(d->nodemask)) { 8407 if (cpumask_empty(d->nodemask)) {
8408 d->sched_group_nodes[num] = NULL; 8408 d->sched_group_nodes[num] = NULL;
8409 goto out; 8409 goto out;
8410 } 8410 }
8411 8411
8412 sched_domain_node_span(num, d->domainspan); 8412 sched_domain_node_span(num, d->domainspan);
8413 cpumask_and(d->domainspan, d->domainspan, cpu_map); 8413 cpumask_and(d->domainspan, d->domainspan, cpu_map);
8414 8414
8415 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 8415 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8416 GFP_KERNEL, num); 8416 GFP_KERNEL, num);
8417 if (!sg) { 8417 if (!sg) {
8418 pr_warning("Can not alloc domain group for node %d\n", num); 8418 pr_warning("Can not alloc domain group for node %d\n", num);
8419 return -ENOMEM; 8419 return -ENOMEM;
8420 } 8420 }
8421 d->sched_group_nodes[num] = sg; 8421 d->sched_group_nodes[num] = sg;
8422 8422
8423 for_each_cpu(j, d->nodemask) { 8423 for_each_cpu(j, d->nodemask) {
8424 sd = &per_cpu(node_domains, j).sd; 8424 sd = &per_cpu(node_domains, j).sd;
8425 sd->groups = sg; 8425 sd->groups = sg;
8426 } 8426 }
8427 8427
8428 sg->cpu_power = 0; 8428 sg->cpu_power = 0;
8429 cpumask_copy(sched_group_cpus(sg), d->nodemask); 8429 cpumask_copy(sched_group_cpus(sg), d->nodemask);
8430 sg->next = sg; 8430 sg->next = sg;
8431 cpumask_or(d->covered, d->covered, d->nodemask); 8431 cpumask_or(d->covered, d->covered, d->nodemask);
8432 8432
8433 prev = sg; 8433 prev = sg;
8434 for (j = 0; j < nr_node_ids; j++) { 8434 for (j = 0; j < nr_node_ids; j++) {
8435 n = (num + j) % nr_node_ids; 8435 n = (num + j) % nr_node_ids;
8436 cpumask_complement(d->notcovered, d->covered); 8436 cpumask_complement(d->notcovered, d->covered);
8437 cpumask_and(d->tmpmask, d->notcovered, cpu_map); 8437 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
8438 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); 8438 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
8439 if (cpumask_empty(d->tmpmask)) 8439 if (cpumask_empty(d->tmpmask))
8440 break; 8440 break;
8441 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); 8441 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
8442 if (cpumask_empty(d->tmpmask)) 8442 if (cpumask_empty(d->tmpmask))
8443 continue; 8443 continue;
8444 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 8444 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8445 GFP_KERNEL, num); 8445 GFP_KERNEL, num);
8446 if (!sg) { 8446 if (!sg) {
8447 pr_warning("Can not alloc domain group for node %d\n", 8447 pr_warning("Can not alloc domain group for node %d\n",
8448 j); 8448 j);
8449 return -ENOMEM; 8449 return -ENOMEM;
8450 } 8450 }
8451 sg->cpu_power = 0; 8451 sg->cpu_power = 0;
8452 cpumask_copy(sched_group_cpus(sg), d->tmpmask); 8452 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
8453 sg->next = prev->next; 8453 sg->next = prev->next;
8454 cpumask_or(d->covered, d->covered, d->tmpmask); 8454 cpumask_or(d->covered, d->covered, d->tmpmask);
8455 prev->next = sg; 8455 prev->next = sg;
8456 prev = sg; 8456 prev = sg;
8457 } 8457 }
8458 out: 8458 out:
8459 return 0; 8459 return 0;
8460 } 8460 }
8461 #endif /* CONFIG_NUMA */ 8461 #endif /* CONFIG_NUMA */
8462 8462
8463 #ifdef CONFIG_NUMA 8463 #ifdef CONFIG_NUMA
8464 /* Free memory allocated for various sched_group structures */ 8464 /* Free memory allocated for various sched_group structures */
8465 static void free_sched_groups(const struct cpumask *cpu_map, 8465 static void free_sched_groups(const struct cpumask *cpu_map,
8466 struct cpumask *nodemask) 8466 struct cpumask *nodemask)
8467 { 8467 {
8468 int cpu, i; 8468 int cpu, i;
8469 8469
8470 for_each_cpu(cpu, cpu_map) { 8470 for_each_cpu(cpu, cpu_map) {
8471 struct sched_group **sched_group_nodes 8471 struct sched_group **sched_group_nodes
8472 = sched_group_nodes_bycpu[cpu]; 8472 = sched_group_nodes_bycpu[cpu];
8473 8473
8474 if (!sched_group_nodes) 8474 if (!sched_group_nodes)
8475 continue; 8475 continue;
8476 8476
8477 for (i = 0; i < nr_node_ids; i++) { 8477 for (i = 0; i < nr_node_ids; i++) {
8478 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 8478 struct sched_group *oldsg, *sg = sched_group_nodes[i];
8479 8479
8480 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8480 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
8481 if (cpumask_empty(nodemask)) 8481 if (cpumask_empty(nodemask))
8482 continue; 8482 continue;
8483 8483
8484 if (sg == NULL) 8484 if (sg == NULL)
8485 continue; 8485 continue;
8486 sg = sg->next; 8486 sg = sg->next;
8487 next_sg: 8487 next_sg:
8488 oldsg = sg; 8488 oldsg = sg;
8489 sg = sg->next; 8489 sg = sg->next;
8490 kfree(oldsg); 8490 kfree(oldsg);
8491 if (oldsg != sched_group_nodes[i]) 8491 if (oldsg != sched_group_nodes[i])
8492 goto next_sg; 8492 goto next_sg;
8493 } 8493 }
8494 kfree(sched_group_nodes); 8494 kfree(sched_group_nodes);
8495 sched_group_nodes_bycpu[cpu] = NULL; 8495 sched_group_nodes_bycpu[cpu] = NULL;
8496 } 8496 }
8497 } 8497 }
8498 #else /* !CONFIG_NUMA */ 8498 #else /* !CONFIG_NUMA */
8499 static void free_sched_groups(const struct cpumask *cpu_map, 8499 static void free_sched_groups(const struct cpumask *cpu_map,
8500 struct cpumask *nodemask) 8500 struct cpumask *nodemask)
8501 { 8501 {
8502 } 8502 }
8503 #endif /* CONFIG_NUMA */ 8503 #endif /* CONFIG_NUMA */
8504 8504
8505 /* 8505 /*
8506 * Initialize sched groups cpu_power. 8506 * Initialize sched groups cpu_power.
8507 * 8507 *
8508 * cpu_power indicates the capacity of a sched group, which is used while 8508 * cpu_power indicates the capacity of a sched group, which is used while
8509 * distributing the load between different sched groups in a sched domain. 8509 * distributing the load between different sched groups in a sched domain.
8510 * Typically cpu_power for all the groups in a sched domain will be the same unless 8510 * Typically cpu_power for all the groups in a sched domain will be the same unless
8511 * there are asymmetries in the topology. If there are asymmetries, a group 8511 * there are asymmetries in the topology. If there are asymmetries, a group
8512 * with more cpu_power will pick up more load than a group with 8512 * with more cpu_power will pick up more load than a group with
8513 * less cpu_power. 8513 * less cpu_power.
8514 */ 8514 */
8515 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 8515 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8516 { 8516 {
8517 struct sched_domain *child; 8517 struct sched_domain *child;
8518 struct sched_group *group; 8518 struct sched_group *group;
8519 long power; 8519 long power;
8520 int weight; 8520 int weight;
8521 8521
8522 WARN_ON(!sd || !sd->groups); 8522 WARN_ON(!sd || !sd->groups);
8523 8523
8524 if (cpu != group_first_cpu(sd->groups)) 8524 if (cpu != group_first_cpu(sd->groups))
8525 return; 8525 return;
8526 8526
8527 child = sd->child; 8527 child = sd->child;
8528 8528
8529 sd->groups->cpu_power = 0; 8529 sd->groups->cpu_power = 0;
8530 8530
8531 if (!child) { 8531 if (!child) {
8532 power = SCHED_LOAD_SCALE; 8532 power = SCHED_LOAD_SCALE;
8533 weight = cpumask_weight(sched_domain_span(sd)); 8533 weight = cpumask_weight(sched_domain_span(sd));
8534 /* 8534 /*
8535 * SMT siblings share the power of a single core. 8535 * SMT siblings share the power of a single core.
8536 * Usually multiple threads get a better yield out of 8536 * Usually multiple threads get a better yield out of
8537 * that one core than a single thread would have; 8537 * that one core than a single thread would have;
8538 * reflect that in sd->smt_gain. 8538 * reflect that in sd->smt_gain.
8539 */ 8539 */
8540 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 8540 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
8541 power *= sd->smt_gain; 8541 power *= sd->smt_gain;
8542 power /= weight; 8542 power /= weight;
8543 power >>= SCHED_LOAD_SHIFT; 8543 power >>= SCHED_LOAD_SHIFT;
8544 } 8544 }
8545 sd->groups->cpu_power += power; 8545 sd->groups->cpu_power += power;
8546 return; 8546 return;
8547 } 8547 }
8548 8548
8549 /* 8549 /*
8550 * Add cpu_power of each child group to this group's cpu_power. 8550 * Add cpu_power of each child group to this group's cpu_power.
8551 */ 8551 */
8552 group = child->groups; 8552 group = child->groups;
8553 do { 8553 do {
8554 sd->groups->cpu_power += group->cpu_power; 8554 sd->groups->cpu_power += group->cpu_power;
8555 group = group->next; 8555 group = group->next;
8556 } while (group != child->groups); 8556 } while (group != child->groups);
8557 } 8557 }
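
Two cases are handled above: at the bottom level the capacity of one core is split among its SMT siblings (after being scaled up by smt_gain, since two threads usually extract more than one thread's worth of work from a core), and at every higher level a group's power is simply the sum over its child groups. A rough model of both computations (the SCALE and gain values are illustrative, not the kernel's tuning):

#define SCALE 1024	/* stand-in for SCHED_LOAD_SCALE */

/* Capacity of one hardware thread when 'weight' siblings share a core. */
static unsigned long smt_thread_power(unsigned long smt_gain,
				      unsigned long weight)
{
	unsigned long power = SCALE;

	if (weight > 1) {
		power *= smt_gain;	/* e.g. smt_gain ~ 1.15 * SCALE */
		power /= weight;
		power /= SCALE;		/* mirrors the >> SCHED_LOAD_SHIFT step */
	}
	return power;
}

/* Capacity of a parent group: the sum of its child groups. */
static unsigned long sum_child_power(const unsigned long *child, int n)
{
	unsigned long sum = 0;

	for (int i = 0; i < n; i++)
		sum += child[i];
	return sum;
}
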
8558 8558
8559 /* 8559 /*
8560 * Initializers for schedule domains 8560 * Initializers for schedule domains
8561 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 8561 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
8562 */ 8562 */
8563 8563
8564 #ifdef CONFIG_SCHED_DEBUG 8564 #ifdef CONFIG_SCHED_DEBUG
8565 # define SD_INIT_NAME(sd, type) sd->name = #type 8565 # define SD_INIT_NAME(sd, type) sd->name = #type
8566 #else 8566 #else
8567 # define SD_INIT_NAME(sd, type) do { } while (0) 8567 # define SD_INIT_NAME(sd, type) do { } while (0)
8568 #endif 8568 #endif
8569 8569
8570 #define SD_INIT(sd, type) sd_init_##type(sd) 8570 #define SD_INIT(sd, type) sd_init_##type(sd)
8571 8571
8572 #define SD_INIT_FUNC(type) \ 8572 #define SD_INIT_FUNC(type) \
8573 static noinline void sd_init_##type(struct sched_domain *sd) \ 8573 static noinline void sd_init_##type(struct sched_domain *sd) \
8574 { \ 8574 { \
8575 memset(sd, 0, sizeof(*sd)); \ 8575 memset(sd, 0, sizeof(*sd)); \
8576 *sd = SD_##type##_INIT; \ 8576 *sd = SD_##type##_INIT; \
8577 sd->level = SD_LV_##type; \ 8577 sd->level = SD_LV_##type; \
8578 SD_INIT_NAME(sd, type); \ 8578 SD_INIT_NAME(sd, type); \
8579 } 8579 }
8580 8580
8581 SD_INIT_FUNC(CPU) 8581 SD_INIT_FUNC(CPU)
8582 #ifdef CONFIG_NUMA 8582 #ifdef CONFIG_NUMA
8583 SD_INIT_FUNC(ALLNODES) 8583 SD_INIT_FUNC(ALLNODES)
8584 SD_INIT_FUNC(NODE) 8584 SD_INIT_FUNC(NODE)
8585 #endif 8585 #endif
8586 #ifdef CONFIG_SCHED_SMT 8586 #ifdef CONFIG_SCHED_SMT
8587 SD_INIT_FUNC(SIBLING) 8587 SD_INIT_FUNC(SIBLING)
8588 #endif 8588 #endif
8589 #ifdef CONFIG_SCHED_MC 8589 #ifdef CONFIG_SCHED_MC
8590 SD_INIT_FUNC(MC) 8590 SD_INIT_FUNC(MC)
8591 #endif 8591 #endif
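
SD_INIT_FUNC() is a token-pasting generator: SD_INIT_FUNC(CPU), for instance, expands to a function sd_init_CPU() that copies the SD_CPU_INIT template into the domain and stamps its level and, under CONFIG_SCHED_DEBUG, its name. The same generator idiom in a self-contained form, with toy types in place of the scheduler's:

struct widget {
	int level;
	const char *name;
};

#define WIDGET_BASIC_INIT	((struct widget){ .level = 0 })
#define WIDGET_FANCY_INIT	((struct widget){ .level = 1 })

#define DEFINE_WIDGET_INIT(type)			\
static void widget_init_##type(struct widget *w)	\
{							\
	*w = WIDGET_##type##_INIT;			\
	w->name = #type;				\
}

DEFINE_WIDGET_INIT(BASIC)	/* defines widget_init_BASIC() */
DEFINE_WIDGET_INIT(FANCY)	/* defines widget_init_FANCY() */
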
8592 8592
8593 static int default_relax_domain_level = -1; 8593 static int default_relax_domain_level = -1;
8594 8594
8595 static int __init setup_relax_domain_level(char *str) 8595 static int __init setup_relax_domain_level(char *str)
8596 { 8596 {
8597 unsigned long val; 8597 unsigned long val;
8598 8598
8599 val = simple_strtoul(str, NULL, 0); 8599 val = simple_strtoul(str, NULL, 0);
8600 if (val < SD_LV_MAX) 8600 if (val < SD_LV_MAX)
8601 default_relax_domain_level = val; 8601 default_relax_domain_level = val;
8602 8602
8603 return 1; 8603 return 1;
8604 } 8604 }
8605 __setup("relax_domain_level=", setup_relax_domain_level); 8605 __setup("relax_domain_level=", setup_relax_domain_level);
8606 8606
8607 static void set_domain_attribute(struct sched_domain *sd, 8607 static void set_domain_attribute(struct sched_domain *sd,
8608 struct sched_domain_attr *attr) 8608 struct sched_domain_attr *attr)
8609 { 8609 {
8610 int request; 8610 int request;
8611 8611
8612 if (!attr || attr->relax_domain_level < 0) { 8612 if (!attr || attr->relax_domain_level < 0) {
8613 if (default_relax_domain_level < 0) 8613 if (default_relax_domain_level < 0)
8614 return; 8614 return;
8615 else 8615 else
8616 request = default_relax_domain_level; 8616 request = default_relax_domain_level;
8617 } else 8617 } else
8618 request = attr->relax_domain_level; 8618 request = attr->relax_domain_level;
8619 if (request < sd->level) { 8619 if (request < sd->level) {
8620 /* turn off idle balance on this domain */ 8620 /* turn off idle balance on this domain */
8621 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 8621 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8622 } else { 8622 } else {
8623 /* turn on idle balance on this domain */ 8623 /* turn on idle balance on this domain */
8624 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 8624 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8625 } 8625 }
8626 } 8626 }
8627 8627
8628 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 8628 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
8629 const struct cpumask *cpu_map) 8629 const struct cpumask *cpu_map)
8630 { 8630 {
8631 switch (what) { 8631 switch (what) {
8632 case sa_sched_groups: 8632 case sa_sched_groups:
8633 free_sched_groups(cpu_map, d->tmpmask); /* fall through */ 8633 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
8634 d->sched_group_nodes = NULL; 8634 d->sched_group_nodes = NULL;
8635 case sa_rootdomain: 8635 case sa_rootdomain:
8636 free_rootdomain(d->rd); /* fall through */ 8636 free_rootdomain(d->rd); /* fall through */
8637 case sa_tmpmask: 8637 case sa_tmpmask:
8638 free_cpumask_var(d->tmpmask); /* fall through */ 8638 free_cpumask_var(d->tmpmask); /* fall through */
8639 case sa_send_covered: 8639 case sa_send_covered:
8640 free_cpumask_var(d->send_covered); /* fall through */ 8640 free_cpumask_var(d->send_covered); /* fall through */
8641 case sa_this_core_map: 8641 case sa_this_core_map:
8642 free_cpumask_var(d->this_core_map); /* fall through */ 8642 free_cpumask_var(d->this_core_map); /* fall through */
8643 case sa_this_sibling_map: 8643 case sa_this_sibling_map:
8644 free_cpumask_var(d->this_sibling_map); /* fall through */ 8644 free_cpumask_var(d->this_sibling_map); /* fall through */
8645 case sa_nodemask: 8645 case sa_nodemask:
8646 free_cpumask_var(d->nodemask); /* fall through */ 8646 free_cpumask_var(d->nodemask); /* fall through */
8647 case sa_sched_group_nodes: 8647 case sa_sched_group_nodes:
8648 #ifdef CONFIG_NUMA 8648 #ifdef CONFIG_NUMA
8649 kfree(d->sched_group_nodes); /* fall through */ 8649 kfree(d->sched_group_nodes); /* fall through */
8650 case sa_notcovered: 8650 case sa_notcovered:
8651 free_cpumask_var(d->notcovered); /* fall through */ 8651 free_cpumask_var(d->notcovered); /* fall through */
8652 case sa_covered: 8652 case sa_covered:
8653 free_cpumask_var(d->covered); /* fall through */ 8653 free_cpumask_var(d->covered); /* fall through */
8654 case sa_domainspan: 8654 case sa_domainspan:
8655 free_cpumask_var(d->domainspan); /* fall through */ 8655 free_cpumask_var(d->domainspan); /* fall through */
8656 #endif 8656 #endif
8657 case sa_none: 8657 case sa_none:
8658 break; 8658 break;
8659 } 8659 }
8660 } 8660 }
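
The enum s_alloc values and this switch together implement staged teardown: the enum records how far the allocation sequence got, and entering the switch at that stage falls through to release everything acquired before it, in reverse order. The pattern, condensed with toy resources:

#include <stdlib.h>

enum stage { got_all, got_b, got_a, got_none };

struct res {
	void *a, *b, *c;
};

static void unwind(struct res *r, enum stage s)
{
	switch (s) {
	case got_all:
		free(r->c);	/* fall through */
	case got_b:
		free(r->b);	/* fall through */
	case got_a:
		free(r->a);	/* fall through */
	case got_none:
		break;
	}
}
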
8661 8661
8662 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 8662 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
8663 const struct cpumask *cpu_map) 8663 const struct cpumask *cpu_map)
8664 { 8664 {
8665 #ifdef CONFIG_NUMA 8665 #ifdef CONFIG_NUMA
8666 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 8666 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
8667 return sa_none; 8667 return sa_none;
8668 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 8668 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
8669 return sa_domainspan; 8669 return sa_domainspan;
8670 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 8670 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
8671 return sa_covered; 8671 return sa_covered;
8672 /* Allocate the per-node list of sched groups */ 8672 /* Allocate the per-node list of sched groups */
8673 d->sched_group_nodes = kcalloc(nr_node_ids, 8673 d->sched_group_nodes = kcalloc(nr_node_ids,
8674 sizeof(struct sched_group *), GFP_KERNEL); 8674 sizeof(struct sched_group *), GFP_KERNEL);
8675 if (!d->sched_group_nodes) { 8675 if (!d->sched_group_nodes) {
8676 pr_warning("Can not alloc sched group node list\n"); 8676 pr_warning("Can not alloc sched group node list\n");
8677 return sa_notcovered; 8677 return sa_notcovered;
8678 } 8678 }
8679 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; 8679 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
8680 #endif 8680 #endif
8681 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) 8681 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
8682 return sa_sched_group_nodes; 8682 return sa_sched_group_nodes;
8683 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) 8683 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
8684 return sa_nodemask; 8684 return sa_nodemask;
8685 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) 8685 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
8686 return sa_this_sibling_map; 8686 return sa_this_sibling_map;
8687 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) 8687 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
8688 return sa_this_core_map; 8688 return sa_this_core_map;
8689 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) 8689 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
8690 return sa_send_covered; 8690 return sa_send_covered;
8691 d->rd = alloc_rootdomain(); 8691 d->rd = alloc_rootdomain();
8692 if (!d->rd) { 8692 if (!d->rd) {
8693 pr_warning("Cannot alloc root domain\n"); 8693 pr_warning("Cannot alloc root domain\n");
8694 return sa_tmpmask; 8694 return sa_tmpmask;
8695 } 8695 }
8696 return sa_rootdomain; 8696 return sa_rootdomain;
8697 } 8697 }
8698 8698
8699 static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 8699 static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
8700 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 8700 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
8701 { 8701 {
8702 struct sched_domain *sd = NULL; 8702 struct sched_domain *sd = NULL;
8703 #ifdef CONFIG_NUMA 8703 #ifdef CONFIG_NUMA
8704 struct sched_domain *parent; 8704 struct sched_domain *parent;
8705 8705
8706 d->sd_allnodes = 0; 8706 d->sd_allnodes = 0;
8707 if (cpumask_weight(cpu_map) > 8707 if (cpumask_weight(cpu_map) >
8708 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { 8708 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
8709 sd = &per_cpu(allnodes_domains, i).sd; 8709 sd = &per_cpu(allnodes_domains, i).sd;
8710 SD_INIT(sd, ALLNODES); 8710 SD_INIT(sd, ALLNODES);
8711 set_domain_attribute(sd, attr); 8711 set_domain_attribute(sd, attr);
8712 cpumask_copy(sched_domain_span(sd), cpu_map); 8712 cpumask_copy(sched_domain_span(sd), cpu_map);
8713 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); 8713 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
8714 d->sd_allnodes = 1; 8714 d->sd_allnodes = 1;
8715 } 8715 }
8716 parent = sd; 8716 parent = sd;
8717 8717
8718 sd = &per_cpu(node_domains, i).sd; 8718 sd = &per_cpu(node_domains, i).sd;
8719 SD_INIT(sd, NODE); 8719 SD_INIT(sd, NODE);
8720 set_domain_attribute(sd, attr); 8720 set_domain_attribute(sd, attr);
8721 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); 8721 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
8722 sd->parent = parent; 8722 sd->parent = parent;
8723 if (parent) 8723 if (parent)
8724 parent->child = sd; 8724 parent->child = sd;
8725 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); 8725 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
8726 #endif 8726 #endif
8727 return sd; 8727 return sd;
8728 } 8728 }
8729 8729
8730 static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 8730 static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
8731 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 8731 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8732 struct sched_domain *parent, int i) 8732 struct sched_domain *parent, int i)
8733 { 8733 {
8734 struct sched_domain *sd; 8734 struct sched_domain *sd;
8735 sd = &per_cpu(phys_domains, i).sd; 8735 sd = &per_cpu(phys_domains, i).sd;
8736 SD_INIT(sd, CPU); 8736 SD_INIT(sd, CPU);
8737 set_domain_attribute(sd, attr); 8737 set_domain_attribute(sd, attr);
8738 cpumask_copy(sched_domain_span(sd), d->nodemask); 8738 cpumask_copy(sched_domain_span(sd), d->nodemask);
8739 sd->parent = parent; 8739 sd->parent = parent;
8740 if (parent) 8740 if (parent)
8741 parent->child = sd; 8741 parent->child = sd;
8742 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); 8742 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
8743 return sd; 8743 return sd;
8744 } 8744 }
8745 8745
8746 static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 8746 static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
8747 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 8747 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8748 struct sched_domain *parent, int i) 8748 struct sched_domain *parent, int i)
8749 { 8749 {
8750 struct sched_domain *sd = parent; 8750 struct sched_domain *sd = parent;
8751 #ifdef CONFIG_SCHED_MC 8751 #ifdef CONFIG_SCHED_MC
8752 sd = &per_cpu(core_domains, i).sd; 8752 sd = &per_cpu(core_domains, i).sd;
8753 SD_INIT(sd, MC); 8753 SD_INIT(sd, MC);
8754 set_domain_attribute(sd, attr); 8754 set_domain_attribute(sd, attr);
8755 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); 8755 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
8756 sd->parent = parent; 8756 sd->parent = parent;
8757 parent->child = sd; 8757 parent->child = sd;
8758 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); 8758 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
8759 #endif 8759 #endif
8760 return sd; 8760 return sd;
8761 } 8761 }
8762 8762
8763 static struct sched_domain *__build_smt_sched_domain(struct s_data *d, 8763 static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
8764 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 8764 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8765 struct sched_domain *parent, int i) 8765 struct sched_domain *parent, int i)
8766 { 8766 {
8767 struct sched_domain *sd = parent; 8767 struct sched_domain *sd = parent;
8768 #ifdef CONFIG_SCHED_SMT 8768 #ifdef CONFIG_SCHED_SMT
8769 sd = &per_cpu(cpu_domains, i).sd; 8769 sd = &per_cpu(cpu_domains, i).sd;
8770 SD_INIT(sd, SIBLING); 8770 SD_INIT(sd, SIBLING);
8771 set_domain_attribute(sd, attr); 8771 set_domain_attribute(sd, attr);
8772 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); 8772 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
8773 sd->parent = parent; 8773 sd->parent = parent;
8774 parent->child = sd; 8774 parent->child = sd;
8775 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); 8775 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
8776 #endif 8776 #endif
8777 return sd; 8777 return sd;
8778 } 8778 }
8779 8779
8780 static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 8780 static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
8781 const struct cpumask *cpu_map, int cpu) 8781 const struct cpumask *cpu_map, int cpu)
8782 { 8782 {
8783 switch (l) { 8783 switch (l) {
8784 #ifdef CONFIG_SCHED_SMT 8784 #ifdef CONFIG_SCHED_SMT
8785 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 8785 case SD_LV_SIBLING: /* set up CPU (sibling) groups */
8786 cpumask_and(d->this_sibling_map, cpu_map, 8786 cpumask_and(d->this_sibling_map, cpu_map,
8787 topology_thread_cpumask(cpu)); 8787 topology_thread_cpumask(cpu));
8788 if (cpu == cpumask_first(d->this_sibling_map)) 8788 if (cpu == cpumask_first(d->this_sibling_map))
8789 init_sched_build_groups(d->this_sibling_map, cpu_map, 8789 init_sched_build_groups(d->this_sibling_map, cpu_map,
8790 &cpu_to_cpu_group, 8790 &cpu_to_cpu_group,
8791 d->send_covered, d->tmpmask); 8791 d->send_covered, d->tmpmask);
8792 break; 8792 break;
8793 #endif 8793 #endif
8794 #ifdef CONFIG_SCHED_MC 8794 #ifdef CONFIG_SCHED_MC
8795 case SD_LV_MC: /* set up multi-core groups */ 8795 case SD_LV_MC: /* set up multi-core groups */
8796 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); 8796 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
8797 if (cpu == cpumask_first(d->this_core_map)) 8797 if (cpu == cpumask_first(d->this_core_map))
8798 init_sched_build_groups(d->this_core_map, cpu_map, 8798 init_sched_build_groups(d->this_core_map, cpu_map,
                                        &cpu_to_core_group,
                                        d->send_covered, d->tmpmask);
                break;
#endif
        case SD_LV_CPU: /* set up physical groups */
                cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
                if (!cpumask_empty(d->nodemask))
                        init_sched_build_groups(d->nodemask, cpu_map,
                                                &cpu_to_phys_group,
                                                d->send_covered, d->tmpmask);
                break;
#ifdef CONFIG_NUMA
        case SD_LV_ALLNODES:
                init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
                                        d->send_covered, d->tmpmask);
                break;
#endif
        default:
                break;
        }
}

/*
 * Build sched domains for a given set of cpus and attach the sched domains
 * to the individual cpus
 */
static int __build_sched_domains(const struct cpumask *cpu_map,
                                 struct sched_domain_attr *attr)
{
        enum s_alloc alloc_state = sa_none;
        struct s_data d;
        struct sched_domain *sd;
        int i;
#ifdef CONFIG_NUMA
        d.sd_allnodes = 0;
#endif

        alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
        if (alloc_state != sa_rootdomain)
                goto error;
        alloc_state = sa_sched_groups;

        /*
         * Set up domains for cpus specified by the cpu_map.
         */
        for_each_cpu(i, cpu_map) {
                cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
                            cpu_map);

                sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
                sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
                sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
                sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
        }

        for_each_cpu(i, cpu_map) {
                build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
                build_sched_groups(&d, SD_LV_MC, cpu_map, i);
        }

        /* Set up physical groups */
        for (i = 0; i < nr_node_ids; i++)
                build_sched_groups(&d, SD_LV_CPU, cpu_map, i);

#ifdef CONFIG_NUMA
        /* Set up node groups */
        if (d.sd_allnodes)
                build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);

        for (i = 0; i < nr_node_ids; i++)
                if (build_numa_sched_groups(&d, cpu_map, i))
                        goto error;
#endif

        /* Calculate CPU power for physical packages and nodes */
#ifdef CONFIG_SCHED_SMT
        for_each_cpu(i, cpu_map) {
                sd = &per_cpu(cpu_domains, i).sd;
                init_sched_groups_power(i, sd);
        }
#endif
#ifdef CONFIG_SCHED_MC
        for_each_cpu(i, cpu_map) {
                sd = &per_cpu(core_domains, i).sd;
                init_sched_groups_power(i, sd);
        }
#endif

        for_each_cpu(i, cpu_map) {
                sd = &per_cpu(phys_domains, i).sd;
                init_sched_groups_power(i, sd);
        }

#ifdef CONFIG_NUMA
        for (i = 0; i < nr_node_ids; i++)
                init_numa_sched_groups_power(d.sched_group_nodes[i]);

        if (d.sd_allnodes) {
                struct sched_group *sg;

                cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
                                      d.tmpmask);
                init_numa_sched_groups_power(sg);
        }
#endif

        /* Attach the domains */
        for_each_cpu(i, cpu_map) {
#ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i).sd;
#elif defined(CONFIG_SCHED_MC)
                sd = &per_cpu(core_domains, i).sd;
#else
                sd = &per_cpu(phys_domains, i).sd;
#endif
                cpu_attach_domain(sd, d.rd, i);
        }

        d.sched_group_nodes = NULL; /* don't free this; we still need it */
        __free_domain_allocs(&d, sa_tmpmask, cpu_map);
        return 0;

error:
        __free_domain_allocs(&d, alloc_state, cpu_map);
        return -ENOMEM;
}

static int build_sched_domains(const struct cpumask *cpu_map)
{
        return __build_sched_domains(cpu_map, NULL);
}

static cpumask_var_t *doms_cur; /* current sched domains */
static int ndoms_cur;           /* number of sched domains in 'doms_cur' */
static struct sched_domain_attr *dattr_cur;
                                /* attributes of custom domains in 'doms_cur' */

/*
 * Special case: If a kmalloc of a doms_cur partition (array of
 * cpumask) fails, then fallback to a single sched domain,
 * as determined by the single cpumask fallback_doms.
 */
static cpumask_var_t fallback_doms;

/*
 * arch_update_cpu_topology lets virtualized architectures update the
 * cpu core maps. It is supposed to return 1 if the topology changed
 * or 0 if it stayed the same.
 */
int __attribute__((weak)) arch_update_cpu_topology(void)
{
        return 0;
}

cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
{
        int i;
        cpumask_var_t *doms;

        doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
        if (!doms)
                return NULL;
        for (i = 0; i < ndoms; i++) {
                if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
                        free_sched_domains(doms, i);
                        return NULL;
                }
        }
        return doms;
}

void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
{
        unsigned int i;
        for (i = 0; i < ndoms; i++)
                free_cpumask_var(doms[i]);
        kfree(doms);
}

/*
 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
 * For now this just excludes isolated cpus, but could be used to
 * exclude other special cases in the future.
 */
static int arch_init_sched_domains(const struct cpumask *cpu_map)
{
        int err;

        arch_update_cpu_topology();
        ndoms_cur = 1;
        doms_cur = alloc_sched_domains(ndoms_cur);
        if (!doms_cur)
                doms_cur = &fallback_doms;
        cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
        dattr_cur = NULL;
        err = build_sched_domains(doms_cur[0]);
        register_sched_domain_sysctl();

        return err;
}

static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
                                       struct cpumask *tmpmask)
{
        free_sched_groups(cpu_map, tmpmask);
}

/*
 * Detach sched domains from a group of cpus specified in cpu_map
 * These cpus will now be attached to the NULL domain
 */
static void detach_destroy_domains(const struct cpumask *cpu_map)
{
        /* Safe because the hotplug lock is held. */
        static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
        int i;

        for_each_cpu(i, cpu_map)
                cpu_attach_domain(NULL, &def_root_domain, i);
        synchronize_sched();
        arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
}

/* handle null as "default" */
static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
                        struct sched_domain_attr *new, int idx_new)
{
        struct sched_domain_attr tmp;

        /* fast path */
        if (!new && !cur)
                return 1;

        tmp = SD_ATTR_INIT;
        return !memcmp(cur ? (cur + idx_cur) : &tmp,
                       new ? (new + idx_new) : &tmp,
                       sizeof(struct sched_domain_attr));
}

/*
 * Partition sched domains as specified by the 'ndoms_new'
 * cpumasks in the array doms_new[] of cpumasks. This compares
 * doms_new[] to the current sched domain partitioning, doms_cur[].
 * It destroys each deleted domain and builds each new domain.
 *
 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
 * The masks don't intersect (i.e. they don't overlap). We should set up
 * one sched domain for each mask. CPUs not in any of the cpumasks will
 * not be load balanced. If the same cpumask appears both in the
 * current 'doms_cur' domains and in the new 'doms_new', we can leave
 * it as it is.
 *
 * The passed in 'doms_new' should be allocated using
 * alloc_sched_domains. This routine takes ownership of it and will
 * free_sched_domains it when done with it. If the caller failed the
 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
 * and partition_sched_domains() will fall back to the single partition
 * 'fallback_doms'; this also forces the domains to be rebuilt.
 *
 * If doms_new == NULL it will be replaced with cpu_online_mask.
 * ndoms_new == 0 is a special case for destroying existing domains,
 * and it will not create the default domain.
 *
 * Call with hotplug lock held
 */
void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
                             struct sched_domain_attr *dattr_new)
{
        int i, j, n;
        int new_topology;

        mutex_lock(&sched_domains_mutex);

        /* always unregister in case we don't destroy any domains */
        unregister_sched_domain_sysctl();

        /* Let architecture update cpu core mappings. */
        new_topology = arch_update_cpu_topology();

        n = doms_new ? ndoms_new : 0;

        /* Destroy deleted domains */
        for (i = 0; i < ndoms_cur; i++) {
                for (j = 0; j < n && !new_topology; j++) {
                        if (cpumask_equal(doms_cur[i], doms_new[j])
                            && dattrs_equal(dattr_cur, i, dattr_new, j))
                                goto match1;
                }
                /* no match - a current sched domain not in new doms_new[] */
                detach_destroy_domains(doms_cur[i]);
match1:
                ;
        }

        if (doms_new == NULL) {
                ndoms_cur = 0;
                doms_new = &fallback_doms;
                cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
                WARN_ON_ONCE(dattr_new);
        }

        /* Build new domains */
        for (i = 0; i < ndoms_new; i++) {
                for (j = 0; j < ndoms_cur && !new_topology; j++) {
                        if (cpumask_equal(doms_new[i], doms_cur[j])
                            && dattrs_equal(dattr_new, i, dattr_cur, j))
                                goto match2;
                }
                /* no match - add a new doms_new */
                __build_sched_domains(doms_new[i],
                                      dattr_new ? dattr_new + i : NULL);
match2:
                ;
        }

        /* Remember the new sched domains */
        if (doms_cur != &fallback_doms)
                free_sched_domains(doms_cur, ndoms_cur);
        kfree(dattr_cur);       /* kfree(NULL) is safe */
        doms_cur = doms_new;
        dattr_cur = dattr_new;
        ndoms_cur = ndoms_new;

        register_sched_domain_sysctl();

        mutex_unlock(&sched_domains_mutex);
}
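
/*
 * Illustrative usage sketch, not part of kernel/sched.c: a hypothetical
 * caller reacting to a topology change could allocate the partition
 * array with alloc_sched_domains(), fill in non-overlapping masks and
 * hand ownership to partition_sched_domains() as below. The helper name
 * and the two-node split are invented for the example; kept under
 * "#if 0" so it is never compiled.
 */
#if 0
static void example_repartition(void)
{
        cpumask_var_t *doms = alloc_sched_domains(2);

        if (!doms) {
                /* allocation failed: force a rebuild of the fallback partition */
                partition_sched_domains(1, NULL, NULL);
                return;
        }

        /* two disjoint partitions, one per NUMA node in this example */
        cpumask_copy(doms[0], cpumask_of_node(0));
        cpumask_copy(doms[1], cpumask_of_node(1));

        get_online_cpus();
        /* takes ownership of 'doms'; the caller must not free it */
        partition_sched_domains(2, doms, NULL);
        put_online_cpus();
}
#endif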

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
static void arch_reinit_sched_domains(void)
{
        get_online_cpus();

        /* Destroy domains first to force the rebuild */
        partition_sched_domains(0, NULL, NULL);

        rebuild_sched_domains();
        put_online_cpus();
}

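/*
 * Backs the sched_mc_power_savings / sched_smt_power_savings sysfs
 * attributes defined below (typically visible under
 * /sys/devices/system/cpu/ - path given for orientation only).
 * Writing a new level tears the sched domains down and rebuilds them
 * so the updated policy takes effect.
 */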
static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
{
        unsigned int level = 0;

        if (sscanf(buf, "%u", &level) != 1)
                return -EINVAL;

        /*
         * level is always positive, so don't check for
         * level < POWERSAVINGS_BALANCE_NONE (which is 0).
         * XXX: what happens on a 0 or 1 byte write - do we
         * need to check count as well?
         */

        if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
                return -EINVAL;

        if (smt)
                sched_smt_power_savings = level;
        else
                sched_mc_power_savings = level;

        arch_reinit_sched_domains();

        return count;
}

#ifdef CONFIG_SCHED_MC
static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
                                           char *page)
{
        return sprintf(page, "%u\n", sched_mc_power_savings);
}
static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
                                            const char *buf, size_t count)
{
        return sched_power_savings_store(buf, count, 0);
}
static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
                         sched_mc_power_savings_show,
                         sched_mc_power_savings_store);
#endif

#ifdef CONFIG_SCHED_SMT
static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
                                            char *page)
{
        return sprintf(page, "%u\n", sched_smt_power_savings);
}
static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
                                             const char *buf, size_t count)
{
        return sched_power_savings_store(buf, count, 1);
}
static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
                         sched_smt_power_savings_show,
                         sched_smt_power_savings_store);
#endif

int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
{
        int err = 0;

#ifdef CONFIG_SCHED_SMT
        if (smt_capable())
                err = sysfs_create_file(&cls->kset.kobj,
                                        &attr_sched_smt_power_savings.attr);
#endif
#ifdef CONFIG_SCHED_MC
        if (!err && mc_capable())
                err = sysfs_create_file(&cls->kset.kobj,
                                        &attr_sched_mc_power_savings.attr);
#endif
        return err;
}
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */

#ifndef CONFIG_CPUSETS
/*
 * Add online and remove offline CPUs from the scheduler domains.
 * When cpusets are enabled they take over this function.
 */
static int update_sched_domains(struct notifier_block *nfb,
                                unsigned long action, void *hcpu)
{
        switch (action) {
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
                partition_sched_domains(1, NULL, NULL);
                return NOTIFY_OK;

        default:
                return NOTIFY_DONE;
        }
}
#endif

static int update_runtime(struct notifier_block *nfb,
                          unsigned long action, void *hcpu)
{
        int cpu = (int)(long)hcpu;

        switch (action) {
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
                disable_runtime(cpu_rq(cpu));
                return NOTIFY_OK;

        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
                enable_runtime(cpu_rq(cpu));
                return NOTIFY_OK;

        default:
                return NOTIFY_DONE;
        }
}

void __init sched_init_smp(void)
{
        cpumask_var_t non_isolated_cpus;

        alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
        alloc_cpumask_var(&fallback_doms, GFP_KERNEL);

#if defined(CONFIG_NUMA)
        sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
                                          GFP_KERNEL);
        BUG_ON(sched_group_nodes_bycpu == NULL);
#endif
        get_online_cpus();
        mutex_lock(&sched_domains_mutex);
        arch_init_sched_domains(cpu_active_mask);
        cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
        if (cpumask_empty(non_isolated_cpus))
                cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
        mutex_unlock(&sched_domains_mutex);
        put_online_cpus();

#ifndef CONFIG_CPUSETS
        /* XXX: Theoretical race here - CPU may be hotplugged now */
        hotcpu_notifier(update_sched_domains, 0);
#endif

        /* RT runtime code needs to handle some hotplug events */
        hotcpu_notifier(update_runtime, 0);

        init_hrtick();

        /* Move init over to a non-isolated CPU */
        if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
                BUG();
        sched_init_granularity();
        free_cpumask_var(non_isolated_cpus);

        init_sched_rt_class();
}
#else
void __init sched_init_smp(void)
{
        sched_init_granularity();
}
#endif /* CONFIG_SMP */

const_debug unsigned int sysctl_timer_migration = 1;

int in_sched_functions(unsigned long addr)
{
        return in_lock_functions(addr) ||
                (addr >= (unsigned long)__sched_text_start
                && addr < (unsigned long)__sched_text_end);
}

static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
{
        cfs_rq->tasks_timeline = RB_ROOT;
        INIT_LIST_HEAD(&cfs_rq->tasks);
#ifdef CONFIG_FAIR_GROUP_SCHED
        cfs_rq->rq = rq;
#endif
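        /*
         * Note: min_vruntime deliberately starts just below the u64 wrap
         * point, presumably so that vruntime wrap-around handling gets
         * exercised early rather than only after very long uptimes.
         */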
        cfs_rq->min_vruntime = (u64)(-(1LL << 20));
}

static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
{
        struct rt_prio_array *array;
        int i;

        array = &rt_rq->active;
        for (i = 0; i < MAX_RT_PRIO; i++) {
                INIT_LIST_HEAD(array->queue + i);
                __clear_bit(i, array->bitmap);
        }
        /* delimiter for bitsearch: */
        __set_bit(MAX_RT_PRIO, array->bitmap);

#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
        rt_rq->highest_prio.curr = MAX_RT_PRIO;
#ifdef CONFIG_SMP
        rt_rq->highest_prio.next = MAX_RT_PRIO;
#endif
#endif
#ifdef CONFIG_SMP
        rt_rq->rt_nr_migratory = 0;
        rt_rq->overloaded = 0;
        plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
#endif

        rt_rq->rt_time = 0;
        rt_rq->rt_throttled = 0;
        rt_rq->rt_runtime = 0;
        spin_lock_init(&rt_rq->rt_runtime_lock);

#ifdef CONFIG_RT_GROUP_SCHED
        rt_rq->rt_nr_boosted = 0;
        rt_rq->rq = rq;
#endif
}

#ifdef CONFIG_FAIR_GROUP_SCHED
static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
                              struct sched_entity *se, int cpu, int add,
                              struct sched_entity *parent)
{
        struct rq *rq = cpu_rq(cpu);
        tg->cfs_rq[cpu] = cfs_rq;
        init_cfs_rq(cfs_rq, rq);
        cfs_rq->tg = tg;
        if (add)
                list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);

        tg->se[cpu] = se;
        /* se could be NULL for init_task_group */
        if (!se)
                return;

        if (!parent)
                se->cfs_rq = &rq->cfs;
        else
                se->cfs_rq = parent->my_q;

        se->my_q = cfs_rq;
        se->load.weight = tg->shares;
        se->load.inv_weight = 0;
        se->parent = parent;
}
#endif

#ifdef CONFIG_RT_GROUP_SCHED
static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
                             struct sched_rt_entity *rt_se, int cpu, int add,
                             struct sched_rt_entity *parent)
{
        struct rq *rq = cpu_rq(cpu);

        tg->rt_rq[cpu] = rt_rq;
        init_rt_rq(rt_rq, rq);
        rt_rq->tg = tg;
        rt_rq->rt_se = rt_se;
        rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
        if (add)
                list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);

        tg->rt_se[cpu] = rt_se;
        if (!rt_se)
                return;

        if (!parent)
                rt_se->rt_rq = &rq->rt;
        else
                rt_se->rt_rq = parent->my_q;

        rt_se->my_q = rt_rq;
        rt_se->parent = parent;
        INIT_LIST_HEAD(&rt_se->run_list);
}
#endif

void __init sched_init(void)
{
        int i, j;
        unsigned long alloc_size = 0, ptr;

#ifdef CONFIG_FAIR_GROUP_SCHED
        alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
        alloc_size += 2 * nr_cpu_ids * sizeof(void **);
#endif
#ifdef CONFIG_USER_SCHED
        alloc_size *= 2;
#endif
#ifdef CONFIG_CPUMASK_OFFSTACK
        alloc_size += num_possible_cpus() * cpumask_size();
#endif
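        /*
         * All of the per-cpu pointer arrays for the group-scheduling
         * structures are carved out of the single early kzalloc() below;
         * 'ptr' is simply advanced through that one allocation.
         */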
        if (alloc_size) {
                ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);

#ifdef CONFIG_FAIR_GROUP_SCHED
                init_task_group.se = (struct sched_entity **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);

                init_task_group.cfs_rq = (struct cfs_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);

#ifdef CONFIG_USER_SCHED
                root_task_group.se = (struct sched_entity **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);

                root_task_group.cfs_rq = (struct cfs_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
                init_task_group.rt_se = (struct sched_rt_entity **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);

                init_task_group.rt_rq = (struct rt_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);

#ifdef CONFIG_USER_SCHED
                root_task_group.rt_se = (struct sched_rt_entity **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);

                root_task_group.rt_rq = (struct rt_rq **)ptr;
                ptr += nr_cpu_ids * sizeof(void **);
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CPUMASK_OFFSTACK
                for_each_possible_cpu(i) {
                        per_cpu(load_balance_tmpmask, i) = (void *)ptr;
                        ptr += cpumask_size();
                }
#endif /* CONFIG_CPUMASK_OFFSTACK */
        }

#ifdef CONFIG_SMP
        init_defrootdomain();
#endif

        init_rt_bandwidth(&def_rt_bandwidth,
                          global_rt_period(), global_rt_runtime());

#ifdef CONFIG_RT_GROUP_SCHED
        init_rt_bandwidth(&init_task_group.rt_bandwidth,
                          global_rt_period(), global_rt_runtime());
#ifdef CONFIG_USER_SCHED
        init_rt_bandwidth(&root_task_group.rt_bandwidth,
                          global_rt_period(), RUNTIME_INF);
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_GROUP_SCHED
        list_add(&init_task_group.list, &task_groups);
        INIT_LIST_HEAD(&init_task_group.children);

#ifdef CONFIG_USER_SCHED
        INIT_LIST_HEAD(&root_task_group.children);
        init_task_group.parent = &root_task_group;
        list_add(&init_task_group.siblings, &root_task_group.children);
#endif /* CONFIG_USER_SCHED */
#endif /* CONFIG_GROUP_SCHED */

#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
        update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
                                            __alignof__(unsigned long));
#endif
        for_each_possible_cpu(i) {
                struct rq *rq;

                rq = cpu_rq(i);
                spin_lock_init(&rq->lock);
                rq->nr_running = 0;
                rq->calc_load_active = 0;
                rq->calc_load_update = jiffies + LOAD_FREQ;
                init_cfs_rq(&rq->cfs, rq);
                init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
                init_task_group.shares = init_task_group_load;
                INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
#ifdef CONFIG_CGROUP_SCHED
                /*
                 * How much cpu bandwidth does init_task_group get?
                 *
                 * In case of task-groups formed through the cgroup filesystem,
                 * it gets 100% of the cpu resources in the system. This overall
                 * system cpu resource is divided among the tasks of
                 * init_task_group and its child task-groups in a fair manner,
                 * based on each entity's (task or task-group's) weight
                 * (se->load.weight).
                 *
                 * In other words, if init_task_group has 10 tasks (each of
                 * weight 1024) and two child groups A0 and A1 (of weight 1024
                 * each), then A0's share of the cpu resource is:
                 *
                 *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
                 *
                 * We achieve this by letting init_task_group's tasks sit
                 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
                 */
                init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
#elif defined CONFIG_USER_SCHED
                root_task_group.shares = NICE_0_LOAD;
                init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
                /*
                 * In case of task-groups formed through the user id of tasks,
                 * init_task_group represents tasks belonging to root user.
                 * Hence it forms a sibling of all subsequent groups formed.
                 * In this case, init_task_group gets only a fraction of the
                 * overall system cpu resource, based on the weight assigned to
                 * root user's cpu share (INIT_TASK_GROUP_LOAD). This is
                 * accomplished by letting tasks of init_task_group sit in a
                 * separate cfs_rq (init_tg_cfs_rq) and having one entity
                 * represent this group of tasks in rq->cfs
                 * (i.e init_task_group->se[] != NULL).
                 */
                init_tg_cfs_entry(&init_task_group,
                                  &per_cpu(init_tg_cfs_rq, i),
                                  &per_cpu(init_sched_entity, i), i, 1,
                                  root_task_group.se[i]);

#endif
#endif /* CONFIG_FAIR_GROUP_SCHED */

                rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
                INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
#ifdef CONFIG_CGROUP_SCHED
                init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
#elif defined CONFIG_USER_SCHED
                init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
                init_tg_rt_entry(&init_task_group,
                                 &per_cpu(init_rt_rq, i),
                                 &per_cpu(init_sched_rt_entity, i), i, 1,
                                 root_task_group.rt_se[i]);
#endif
#endif

                for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
                        rq->cpu_load[j] = 0;
#ifdef CONFIG_SMP
                rq->sd = NULL;
                rq->rd = NULL;
                rq->post_schedule = 0;
                rq->active_balance = 0;
                rq->next_balance = jiffies;
                rq->push_cpu = 0;
                rq->cpu = i;
                rq->online = 0;
                rq->migration_thread = NULL;
                rq->idle_stamp = 0;
                rq->avg_idle = 2*sysctl_sched_migration_cost;
                INIT_LIST_HEAD(&rq->migration_queue);
                rq_attach_root(rq, &def_root_domain);
#endif
                init_rq_hrtick(rq);
                atomic_set(&rq->nr_iowait, 0);
        }

        set_load_weight(&init_task);

#ifdef CONFIG_PREEMPT_NOTIFIERS
        INIT_HLIST_HEAD(&init_task.preempt_notifiers);
#endif

#ifdef CONFIG_SMP
        open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#endif

#ifdef CONFIG_RT_MUTEXES
        plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
#endif

        /*
         * The boot idle thread does lazy MMU switching as well:
         */
        atomic_inc(&init_mm.mm_count);
        enter_lazy_tlb(&init_mm, current);

        /*
         * Make us the idle thread. Technically, schedule() should not be
         * called from this thread, however somewhere below it might be,
         * but because we are the idle thread, we just pick up running again
         * when this runqueue becomes "idle".
         */
        init_idle(current, smp_processor_id());

        calc_load_update = jiffies + LOAD_FREQ;

        /*
         * During early bootup we pretend to be a normal task:
         */
        current->sched_class = &fair_sched_class;

        /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
        zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ
        zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
        alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
#endif
        /* May be allocated at isolcpus cmdline parse time */
        if (cpu_isolated_map == NULL)
                zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
#endif /* SMP */

        perf_event_init();

        scheduler_running = 1;
}

#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
static inline int preempt_count_equals(int preempt_offset)
{
        int nested = preempt_count() & ~PREEMPT_ACTIVE;

        return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
}

void __might_sleep(char *file, int line, int preempt_offset)
{
#ifdef in_atomic
        static unsigned long prev_jiffy;        /* ratelimiting */

        if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
            system_state != SYSTEM_RUNNING || oops_in_progress)
                return;
        if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
                return;
        prev_jiffy = jiffies;

        pr_err("BUG: sleeping function called from invalid context at %s:%d\n",
               file, line);
        pr_err("in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
               in_atomic(), irqs_disabled(),
               current->pid, current->comm);

        debug_show_held_locks(current);
        if (irqs_disabled())
                print_irqtrace_events(current);
        dump_stack();
#endif
}
EXPORT_SYMBOL(__might_sleep);
#endif

#ifdef CONFIG_MAGIC_SYSRQ
static void normalize_task(struct rq *rq, struct task_struct *p)
{
        int on_rq;

        update_rq_clock(rq);
        on_rq = p->se.on_rq;
        if (on_rq)
                deactivate_task(rq, p, 0);
        __setscheduler(rq, p, SCHED_NORMAL, 0);
        if (on_rq) {
                activate_task(rq, p, 0);
                resched_task(rq->curr);
        }
}

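/*
 * Reset every user-space RT task back to SCHED_NORMAL. This is reached
 * from the magic SysRq handler (the 'n' / "nice-all-RT-tasks" key) and
 * is meant as an emergency brake when runaway RT tasks starve the box.
 */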
9706 void normalize_rt_tasks(void) 9706 void normalize_rt_tasks(void)
9707 { 9707 {
9708 struct task_struct *g, *p; 9708 struct task_struct *g, *p;
9709 unsigned long flags; 9709 unsigned long flags;
9710 struct rq *rq; 9710 struct rq *rq;
9711 9711
9712 read_lock_irqsave(&tasklist_lock, flags); 9712 read_lock_irqsave(&tasklist_lock, flags);
9713 do_each_thread(g, p) { 9713 do_each_thread(g, p) {
9714 /* 9714 /*
9715 * Only normalize user tasks: 9715 * Only normalize user tasks:
9716 */ 9716 */
9717 if (!p->mm) 9717 if (!p->mm)
9718 continue; 9718 continue;
9719 9719
9720 p->se.exec_start = 0; 9720 p->se.exec_start = 0;
9721 #ifdef CONFIG_SCHEDSTATS 9721 #ifdef CONFIG_SCHEDSTATS
9722 p->se.wait_start = 0; 9722 p->se.wait_start = 0;
9723 p->se.sleep_start = 0; 9723 p->se.sleep_start = 0;
9724 p->se.block_start = 0; 9724 p->se.block_start = 0;
9725 #endif 9725 #endif
9726 9726
9727 if (!rt_task(p)) { 9727 if (!rt_task(p)) {
9728 /* 9728 /*
9729 * Renice negative nice level userspace 9729 * Renice negative nice level userspace
9730 * tasks back to 0: 9730 * tasks back to 0:
9731 */ 9731 */
9732 if (TASK_NICE(p) < 0 && p->mm) 9732 if (TASK_NICE(p) < 0 && p->mm)
9733 set_user_nice(p, 0); 9733 set_user_nice(p, 0);
9734 continue; 9734 continue;
9735 } 9735 }
9736 9736
9737 spin_lock(&p->pi_lock); 9737 spin_lock(&p->pi_lock);
9738 rq = __task_rq_lock(p); 9738 rq = __task_rq_lock(p);
9739 9739
9740 normalize_task(rq, p); 9740 normalize_task(rq, p);
9741 9741
9742 __task_rq_unlock(rq); 9742 __task_rq_unlock(rq);
9743 spin_unlock(&p->pi_lock); 9743 spin_unlock(&p->pi_lock);
9744 } while_each_thread(g, p); 9744 } while_each_thread(g, p);
9745 9745
9746 read_unlock_irqrestore(&tasklist_lock, flags); 9746 read_unlock_irqrestore(&tasklist_lock, flags);
9747 } 9747 }
9748 9748
9749 #endif /* CONFIG_MAGIC_SYSRQ */ 9749 #endif /* CONFIG_MAGIC_SYSRQ */
9750 9750
9751 #ifdef CONFIG_IA64 9751 #ifdef CONFIG_IA64
9752 /* 9752 /*
9753 * These functions are only useful for the IA64 MCA handling. 9753 * These functions are only useful for the IA64 MCA handling.
9754 * 9754 *
9755 * They can only be called when the whole system has been 9755 * They can only be called when the whole system has been
9756 * stopped - every CPU needs to be quiescent, and no scheduling 9756 * stopped - every CPU needs to be quiescent, and no scheduling
9757 * activity can take place. Using them for anything else would 9757 * activity can take place. Using them for anything else would
9758 * be a serious bug, and as a result, they aren't even visible 9758 * be a serious bug, and as a result, they aren't even visible
9759 * under any other configuration. 9759 * under any other configuration.
9760 */ 9760 */
9761 9761
9762 /** 9762 /**
9763 * curr_task - return the current task for a given cpu. 9763 * curr_task - return the current task for a given cpu.
9764 * @cpu: the processor in question. 9764 * @cpu: the processor in question.
9765 * 9765 *
9766 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 9766 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
9767 */ 9767 */
9768 struct task_struct *curr_task(int cpu) 9768 struct task_struct *curr_task(int cpu)
9769 { 9769 {
9770 return cpu_curr(cpu); 9770 return cpu_curr(cpu);
9771 } 9771 }
9772 9772
9773 /** 9773 /**
9774 * set_curr_task - set the current task for a given cpu. 9774 * set_curr_task - set the current task for a given cpu.
9775 * @cpu: the processor in question. 9775 * @cpu: the processor in question.
9776 * @p: the task pointer to set. 9776 * @p: the task pointer to set.
9777 * 9777 *
9778 * Description: This function must only be used when non-maskable interrupts 9778 * Description: This function must only be used when non-maskable interrupts
9779 * are serviced on a separate stack. It allows the architecture to switch the 9779 * are serviced on a separate stack. It allows the architecture to switch the
9780 * notion of the current task on a cpu in a non-blocking manner. This function 9780 * notion of the current task on a cpu in a non-blocking manner. This function
9781 * must be called with all CPU's synchronized, and interrupts disabled, the 9781 * must be called with all CPU's synchronized, and interrupts disabled, the
9782 * and caller must save the original value of the current task (see 9782 * and caller must save the original value of the current task (see
9783 * curr_task() above) and restore that value before reenabling interrupts and 9783 * curr_task() above) and restore that value before reenabling interrupts and
9784 * re-starting the system. 9784 * re-starting the system.
9785 * 9785 *
9786 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 9786 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
9787 */ 9787 */
9788 void set_curr_task(int cpu, struct task_struct *p) 9788 void set_curr_task(int cpu, struct task_struct *p)
9789 { 9789 {
9790 cpu_curr(cpu) = p; 9790 cpu_curr(cpu) = p;
9791 } 9791 }
9792 9792
9793 #endif 9793 #endif
9794 9794
9795 #ifdef CONFIG_FAIR_GROUP_SCHED 9795 #ifdef CONFIG_FAIR_GROUP_SCHED
9796 static void free_fair_sched_group(struct task_group *tg) 9796 static void free_fair_sched_group(struct task_group *tg)
9797 { 9797 {
9798 int i; 9798 int i;
9799 9799
9800 for_each_possible_cpu(i) { 9800 for_each_possible_cpu(i) {
9801 if (tg->cfs_rq) 9801 if (tg->cfs_rq)
9802 kfree(tg->cfs_rq[i]); 9802 kfree(tg->cfs_rq[i]);
9803 if (tg->se) 9803 if (tg->se)
9804 kfree(tg->se[i]); 9804 kfree(tg->se[i]);
9805 } 9805 }
9806 9806
9807 kfree(tg->cfs_rq); 9807 kfree(tg->cfs_rq);
9808 kfree(tg->se); 9808 kfree(tg->se);
9809 } 9809 }
9810 9810
9811 static 9811 static
9812 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 9812 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9813 { 9813 {
9814 struct cfs_rq *cfs_rq; 9814 struct cfs_rq *cfs_rq;
9815 struct sched_entity *se; 9815 struct sched_entity *se;
9816 struct rq *rq; 9816 struct rq *rq;
9817 int i; 9817 int i;
9818 9818
9819 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 9819 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
9820 if (!tg->cfs_rq) 9820 if (!tg->cfs_rq)
9821 goto err; 9821 goto err;
9822 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); 9822 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
9823 if (!tg->se) 9823 if (!tg->se)
9824 goto err; 9824 goto err;
9825 9825
9826 tg->shares = NICE_0_LOAD; 9826 tg->shares = NICE_0_LOAD;
9827 9827
9828 for_each_possible_cpu(i) { 9828 for_each_possible_cpu(i) {
9829 rq = cpu_rq(i); 9829 rq = cpu_rq(i);
9830 9830
9831 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 9831 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
9832 GFP_KERNEL, cpu_to_node(i)); 9832 GFP_KERNEL, cpu_to_node(i));
9833 if (!cfs_rq) 9833 if (!cfs_rq)
9834 goto err; 9834 goto err;
9835 9835
9836 se = kzalloc_node(sizeof(struct sched_entity), 9836 se = kzalloc_node(sizeof(struct sched_entity),
9837 GFP_KERNEL, cpu_to_node(i)); 9837 GFP_KERNEL, cpu_to_node(i));
9838 if (!se) 9838 if (!se)
9839 goto err_free_rq; 9839 goto err_free_rq;
9840 9840
9841 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 9841 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
9842 } 9842 }
9843 9843
9844 return 1; 9844 return 1;
9845 9845
9846 err_free_rq: 9846 err_free_rq:
9847 kfree(cfs_rq); 9847 kfree(cfs_rq);
9848 err: 9848 err:
9849 return 0; 9849 return 0;
9850 } 9850 }
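
The per-cpu allocation loop above (and its twin in alloc_rt_sched_group() further down) uses the usual kernel idiom of allocating piece by piece and unwinding with goto labels on failure. A loose userspace sketch of that idiom follows; it is not a line-for-line mirror of the kernel code, and all names in it are made up.

#include <stdlib.h>

struct item { int payload; };

/* Allocate n items; on failure free what was already allocated and
 * report 0, mirroring the return-1-on-success convention used above. */
static int alloc_items(struct item **slots, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		slots[i] = calloc(1, sizeof(struct item));
		if (!slots[i])
			goto err_unwind;
	}
	return 1;

err_unwind:
	while (--i >= 0)
		free(slots[i]);
	return 0;
}

int main(void)
{
	struct item *slots[4];

	if (!alloc_items(slots, 4))
		return 1;
	for (int i = 0; i < 4; i++)
		free(slots[i]);
	return 0;
}
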
9851 9851
9852 static inline void register_fair_sched_group(struct task_group *tg, int cpu) 9852 static inline void register_fair_sched_group(struct task_group *tg, int cpu)
9853 { 9853 {
9854 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, 9854 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
9855 &cpu_rq(cpu)->leaf_cfs_rq_list); 9855 &cpu_rq(cpu)->leaf_cfs_rq_list);
9856 } 9856 }
9857 9857
9858 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 9858 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
9859 { 9859 {
9860 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 9860 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
9861 } 9861 }
9862 #else /* !CONFIG_FAIR_GROUP_SCHED */ 9862 #else /* !CONFIG_FAIR_GROUP_SCHED */
9863 static inline void free_fair_sched_group(struct task_group *tg) 9863 static inline void free_fair_sched_group(struct task_group *tg)
9864 { 9864 {
9865 } 9865 }
9866 9866
9867 static inline 9867 static inline
9868 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 9868 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9869 { 9869 {
9870 return 1; 9870 return 1;
9871 } 9871 }
9872 9872
9873 static inline void register_fair_sched_group(struct task_group *tg, int cpu) 9873 static inline void register_fair_sched_group(struct task_group *tg, int cpu)
9874 { 9874 {
9875 } 9875 }
9876 9876
9877 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 9877 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
9878 { 9878 {
9879 } 9879 }
9880 #endif /* CONFIG_FAIR_GROUP_SCHED */ 9880 #endif /* CONFIG_FAIR_GROUP_SCHED */
9881 9881
9882 #ifdef CONFIG_RT_GROUP_SCHED 9882 #ifdef CONFIG_RT_GROUP_SCHED
9883 static void free_rt_sched_group(struct task_group *tg) 9883 static void free_rt_sched_group(struct task_group *tg)
9884 { 9884 {
9885 int i; 9885 int i;
9886 9886
9887 destroy_rt_bandwidth(&tg->rt_bandwidth); 9887 destroy_rt_bandwidth(&tg->rt_bandwidth);
9888 9888
9889 for_each_possible_cpu(i) { 9889 for_each_possible_cpu(i) {
9890 if (tg->rt_rq) 9890 if (tg->rt_rq)
9891 kfree(tg->rt_rq[i]); 9891 kfree(tg->rt_rq[i]);
9892 if (tg->rt_se) 9892 if (tg->rt_se)
9893 kfree(tg->rt_se[i]); 9893 kfree(tg->rt_se[i]);
9894 } 9894 }
9895 9895
9896 kfree(tg->rt_rq); 9896 kfree(tg->rt_rq);
9897 kfree(tg->rt_se); 9897 kfree(tg->rt_se);
9898 } 9898 }
9899 9899
9900 static 9900 static
9901 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 9901 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9902 { 9902 {
9903 struct rt_rq *rt_rq; 9903 struct rt_rq *rt_rq;
9904 struct sched_rt_entity *rt_se; 9904 struct sched_rt_entity *rt_se;
9905 struct rq *rq; 9905 struct rq *rq;
9906 int i; 9906 int i;
9907 9907
9908 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 9908 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
9909 if (!tg->rt_rq) 9909 if (!tg->rt_rq)
9910 goto err; 9910 goto err;
9911 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); 9911 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
9912 if (!tg->rt_se) 9912 if (!tg->rt_se)
9913 goto err; 9913 goto err;
9914 9914
9915 init_rt_bandwidth(&tg->rt_bandwidth, 9915 init_rt_bandwidth(&tg->rt_bandwidth,
9916 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 9916 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
9917 9917
9918 for_each_possible_cpu(i) { 9918 for_each_possible_cpu(i) {
9919 rq = cpu_rq(i); 9919 rq = cpu_rq(i);
9920 9920
9921 rt_rq = kzalloc_node(sizeof(struct rt_rq), 9921 rt_rq = kzalloc_node(sizeof(struct rt_rq),
9922 GFP_KERNEL, cpu_to_node(i)); 9922 GFP_KERNEL, cpu_to_node(i));
9923 if (!rt_rq) 9923 if (!rt_rq)
9924 goto err; 9924 goto err;
9925 9925
9926 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 9926 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
9927 GFP_KERNEL, cpu_to_node(i)); 9927 GFP_KERNEL, cpu_to_node(i));
9928 if (!rt_se) 9928 if (!rt_se)
9929 goto err_free_rq; 9929 goto err_free_rq;
9930 9930
9931 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 9931 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
9932 } 9932 }
9933 9933
9934 return 1; 9934 return 1;
9935 9935
9936 err_free_rq: 9936 err_free_rq:
9937 kfree(rt_rq); 9937 kfree(rt_rq);
9938 err: 9938 err:
9939 return 0; 9939 return 0;
9940 } 9940 }
9941 9941
9942 static inline void register_rt_sched_group(struct task_group *tg, int cpu) 9942 static inline void register_rt_sched_group(struct task_group *tg, int cpu)
9943 { 9943 {
9944 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, 9944 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
9945 &cpu_rq(cpu)->leaf_rt_rq_list); 9945 &cpu_rq(cpu)->leaf_rt_rq_list);
9946 } 9946 }
9947 9947
9948 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 9948 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
9949 { 9949 {
9950 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); 9950 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
9951 } 9951 }
9952 #else /* !CONFIG_RT_GROUP_SCHED */ 9952 #else /* !CONFIG_RT_GROUP_SCHED */
9953 static inline void free_rt_sched_group(struct task_group *tg) 9953 static inline void free_rt_sched_group(struct task_group *tg)
9954 { 9954 {
9955 } 9955 }
9956 9956
9957 static inline 9957 static inline
9958 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 9958 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9959 { 9959 {
9960 return 1; 9960 return 1;
9961 } 9961 }
9962 9962
9963 static inline void register_rt_sched_group(struct task_group *tg, int cpu) 9963 static inline void register_rt_sched_group(struct task_group *tg, int cpu)
9964 { 9964 {
9965 } 9965 }
9966 9966
9967 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 9967 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
9968 { 9968 {
9969 } 9969 }
9970 #endif /* CONFIG_RT_GROUP_SCHED */ 9970 #endif /* CONFIG_RT_GROUP_SCHED */
9971 9971
9972 #ifdef CONFIG_GROUP_SCHED 9972 #ifdef CONFIG_GROUP_SCHED
9973 static void free_sched_group(struct task_group *tg) 9973 static void free_sched_group(struct task_group *tg)
9974 { 9974 {
9975 free_fair_sched_group(tg); 9975 free_fair_sched_group(tg);
9976 free_rt_sched_group(tg); 9976 free_rt_sched_group(tg);
9977 kfree(tg); 9977 kfree(tg);
9978 } 9978 }
9979 9979
9980 /* allocate runqueue etc for a new task group */ 9980 /* allocate runqueue etc for a new task group */
9981 struct task_group *sched_create_group(struct task_group *parent) 9981 struct task_group *sched_create_group(struct task_group *parent)
9982 { 9982 {
9983 struct task_group *tg; 9983 struct task_group *tg;
9984 unsigned long flags; 9984 unsigned long flags;
9985 int i; 9985 int i;
9986 9986
9987 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 9987 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
9988 if (!tg) 9988 if (!tg)
9989 return ERR_PTR(-ENOMEM); 9989 return ERR_PTR(-ENOMEM);
9990 9990
9991 if (!alloc_fair_sched_group(tg, parent)) 9991 if (!alloc_fair_sched_group(tg, parent))
9992 goto err; 9992 goto err;
9993 9993
9994 if (!alloc_rt_sched_group(tg, parent)) 9994 if (!alloc_rt_sched_group(tg, parent))
9995 goto err; 9995 goto err;
9996 9996
9997 spin_lock_irqsave(&task_group_lock, flags); 9997 spin_lock_irqsave(&task_group_lock, flags);
9998 for_each_possible_cpu(i) { 9998 for_each_possible_cpu(i) {
9999 register_fair_sched_group(tg, i); 9999 register_fair_sched_group(tg, i);
10000 register_rt_sched_group(tg, i); 10000 register_rt_sched_group(tg, i);
10001 } 10001 }
10002 list_add_rcu(&tg->list, &task_groups); 10002 list_add_rcu(&tg->list, &task_groups);
10003 10003
10004 WARN_ON(!parent); /* root should already exist */ 10004 WARN_ON(!parent); /* root should already exist */
10005 10005
10006 tg->parent = parent; 10006 tg->parent = parent;
10007 INIT_LIST_HEAD(&tg->children); 10007 INIT_LIST_HEAD(&tg->children);
10008 list_add_rcu(&tg->siblings, &parent->children); 10008 list_add_rcu(&tg->siblings, &parent->children);
10009 spin_unlock_irqrestore(&task_group_lock, flags); 10009 spin_unlock_irqrestore(&task_group_lock, flags);
10010 10010
10011 return tg; 10011 return tg;
10012 10012
10013 err: 10013 err:
10014 free_sched_group(tg); 10014 free_sched_group(tg);
10015 return ERR_PTR(-ENOMEM); 10015 return ERR_PTR(-ENOMEM);
10016 } 10016 }
10017 10017
10018 /* rcu callback to free various structures associated with a task group */ 10018 /* rcu callback to free various structures associated with a task group */
10019 static void free_sched_group_rcu(struct rcu_head *rhp) 10019 static void free_sched_group_rcu(struct rcu_head *rhp)
10020 { 10020 {
10021 /* now it should be safe to free those cfs_rqs */ 10021 /* now it should be safe to free those cfs_rqs */
10022 free_sched_group(container_of(rhp, struct task_group, rcu)); 10022 free_sched_group(container_of(rhp, struct task_group, rcu));
10023 } 10023 }
10024 10024
10025 /* Destroy runqueue etc associated with a task group */ 10025 /* Destroy runqueue etc associated with a task group */
10026 void sched_destroy_group(struct task_group *tg) 10026 void sched_destroy_group(struct task_group *tg)
10027 { 10027 {
10028 unsigned long flags; 10028 unsigned long flags;
10029 int i; 10029 int i;
10030 10030
10031 spin_lock_irqsave(&task_group_lock, flags); 10031 spin_lock_irqsave(&task_group_lock, flags);
10032 for_each_possible_cpu(i) { 10032 for_each_possible_cpu(i) {
10033 unregister_fair_sched_group(tg, i); 10033 unregister_fair_sched_group(tg, i);
10034 unregister_rt_sched_group(tg, i); 10034 unregister_rt_sched_group(tg, i);
10035 } 10035 }
10036 list_del_rcu(&tg->list); 10036 list_del_rcu(&tg->list);
10037 list_del_rcu(&tg->siblings); 10037 list_del_rcu(&tg->siblings);
10038 spin_unlock_irqrestore(&task_group_lock, flags); 10038 spin_unlock_irqrestore(&task_group_lock, flags);
10039 10039
10040 /* wait for possible concurrent references to cfs_rqs to complete */ 10040 /* wait for possible concurrent references to cfs_rqs to complete */
10041 call_rcu(&tg->rcu, free_sched_group_rcu); 10041 call_rcu(&tg->rcu, free_sched_group_rcu);
10042 } 10042 }
10043 10043
10044 /* Change a task's runqueue when it moves between groups. 10044 /* Change a task's runqueue when it moves between groups.
10045 * The caller of this function should have put the task in its new group 10045 * The caller of this function should have put the task in its new group
10046 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 10046 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
10047 * reflect its new group. 10047 * reflect its new group.
10048 */ 10048 */
10049 void sched_move_task(struct task_struct *tsk) 10049 void sched_move_task(struct task_struct *tsk)
10050 { 10050 {
10051 int on_rq, running; 10051 int on_rq, running;
10052 unsigned long flags; 10052 unsigned long flags;
10053 struct rq *rq; 10053 struct rq *rq;
10054 10054
10055 rq = task_rq_lock(tsk, &flags); 10055 rq = task_rq_lock(tsk, &flags);
10056 10056
10057 update_rq_clock(rq); 10057 update_rq_clock(rq);
10058 10058
10059 running = task_current(rq, tsk); 10059 running = task_current(rq, tsk);
10060 on_rq = tsk->se.on_rq; 10060 on_rq = tsk->se.on_rq;
10061 10061
10062 if (on_rq) 10062 if (on_rq)
10063 dequeue_task(rq, tsk, 0); 10063 dequeue_task(rq, tsk, 0);
10064 if (unlikely(running)) 10064 if (unlikely(running))
10065 tsk->sched_class->put_prev_task(rq, tsk); 10065 tsk->sched_class->put_prev_task(rq, tsk);
10066 10066
10067 set_task_rq(tsk, task_cpu(tsk)); 10067 set_task_rq(tsk, task_cpu(tsk));
10068 10068
10069 #ifdef CONFIG_FAIR_GROUP_SCHED 10069 #ifdef CONFIG_FAIR_GROUP_SCHED
10070 if (tsk->sched_class->moved_group) 10070 if (tsk->sched_class->moved_group)
10071 tsk->sched_class->moved_group(tsk); 10071 tsk->sched_class->moved_group(tsk);
10072 #endif 10072 #endif
10073 10073
10074 if (unlikely(running)) 10074 if (unlikely(running))
10075 tsk->sched_class->set_curr_task(rq); 10075 tsk->sched_class->set_curr_task(rq);
10076 if (on_rq) 10076 if (on_rq)
10077 enqueue_task(rq, tsk, 0); 10077 enqueue_task(rq, tsk, 0);
10078 10078
10079 task_rq_unlock(rq, &flags); 10079 task_rq_unlock(rq, &flags);
10080 } 10080 }
10081 #endif /* CONFIG_GROUP_SCHED */ 10081 #endif /* CONFIG_GROUP_SCHED */
10082 10082
10083 #ifdef CONFIG_FAIR_GROUP_SCHED 10083 #ifdef CONFIG_FAIR_GROUP_SCHED
10084 static void __set_se_shares(struct sched_entity *se, unsigned long shares) 10084 static void __set_se_shares(struct sched_entity *se, unsigned long shares)
10085 { 10085 {
10086 struct cfs_rq *cfs_rq = se->cfs_rq; 10086 struct cfs_rq *cfs_rq = se->cfs_rq;
10087 int on_rq; 10087 int on_rq;
10088 10088
10089 on_rq = se->on_rq; 10089 on_rq = se->on_rq;
10090 if (on_rq) 10090 if (on_rq)
10091 dequeue_entity(cfs_rq, se, 0); 10091 dequeue_entity(cfs_rq, se, 0);
10092 10092
10093 se->load.weight = shares; 10093 se->load.weight = shares;
10094 se->load.inv_weight = 0; 10094 se->load.inv_weight = 0;
10095 10095
10096 if (on_rq) 10096 if (on_rq)
10097 enqueue_entity(cfs_rq, se, 0); 10097 enqueue_entity(cfs_rq, se, 0);
10098 } 10098 }
10099 10099
10100 static void set_se_shares(struct sched_entity *se, unsigned long shares) 10100 static void set_se_shares(struct sched_entity *se, unsigned long shares)
10101 { 10101 {
10102 struct cfs_rq *cfs_rq = se->cfs_rq; 10102 struct cfs_rq *cfs_rq = se->cfs_rq;
10103 struct rq *rq = cfs_rq->rq; 10103 struct rq *rq = cfs_rq->rq;
10104 unsigned long flags; 10104 unsigned long flags;
10105 10105
10106 spin_lock_irqsave(&rq->lock, flags); 10106 spin_lock_irqsave(&rq->lock, flags);
10107 __set_se_shares(se, shares); 10107 __set_se_shares(se, shares);
10108 spin_unlock_irqrestore(&rq->lock, flags); 10108 spin_unlock_irqrestore(&rq->lock, flags);
10109 } 10109 }
10110 10110
10111 static DEFINE_MUTEX(shares_mutex); 10111 static DEFINE_MUTEX(shares_mutex);
10112 10112
10113 int sched_group_set_shares(struct task_group *tg, unsigned long shares) 10113 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
10114 { 10114 {
10115 int i; 10115 int i;
10116 unsigned long flags; 10116 unsigned long flags;
10117 10117
10118 /* 10118 /*
10119 * We can't change the weight of the root cgroup. 10119 * We can't change the weight of the root cgroup.
10120 */ 10120 */
10121 if (!tg->se[0]) 10121 if (!tg->se[0])
10122 return -EINVAL; 10122 return -EINVAL;
10123 10123
10124 if (shares < MIN_SHARES) 10124 if (shares < MIN_SHARES)
10125 shares = MIN_SHARES; 10125 shares = MIN_SHARES;
10126 else if (shares > MAX_SHARES) 10126 else if (shares > MAX_SHARES)
10127 shares = MAX_SHARES; 10127 shares = MAX_SHARES;
10128 10128
10129 mutex_lock(&shares_mutex); 10129 mutex_lock(&shares_mutex);
10130 if (tg->shares == shares) 10130 if (tg->shares == shares)
10131 goto done; 10131 goto done;
10132 10132
10133 spin_lock_irqsave(&task_group_lock, flags); 10133 spin_lock_irqsave(&task_group_lock, flags);
10134 for_each_possible_cpu(i) 10134 for_each_possible_cpu(i)
10135 unregister_fair_sched_group(tg, i); 10135 unregister_fair_sched_group(tg, i);
10136 list_del_rcu(&tg->siblings); 10136 list_del_rcu(&tg->siblings);
10137 spin_unlock_irqrestore(&task_group_lock, flags); 10137 spin_unlock_irqrestore(&task_group_lock, flags);
10138 10138
10139 /* wait for any ongoing reference to this group to finish */ 10139 /* wait for any ongoing reference to this group to finish */
10140 synchronize_sched(); 10140 synchronize_sched();
10141 10141
10142 /* 10142 /*
10143 * Now we are free to modify the group's share on each cpu 10143 * Now we are free to modify the group's share on each cpu
10144 * w/o tripping rebalance_share or load_balance_fair. 10144 * w/o tripping rebalance_share or load_balance_fair.
10145 */ 10145 */
10146 tg->shares = shares; 10146 tg->shares = shares;
10147 for_each_possible_cpu(i) { 10147 for_each_possible_cpu(i) {
10148 /* 10148 /*
10149 * force a rebalance 10149 * force a rebalance
10150 */ 10150 */
10151 cfs_rq_set_shares(tg->cfs_rq[i], 0); 10151 cfs_rq_set_shares(tg->cfs_rq[i], 0);
10152 set_se_shares(tg->se[i], shares); 10152 set_se_shares(tg->se[i], shares);
10153 } 10153 }
10154 10154
10155 /* 10155 /*
10156 * Enable load balance activity on this group, by inserting it back on 10156 * Enable load balance activity on this group, by inserting it back on
10157 * each cpu's rq->leaf_cfs_rq_list. 10157 * each cpu's rq->leaf_cfs_rq_list.
10158 */ 10158 */
10159 spin_lock_irqsave(&task_group_lock, flags); 10159 spin_lock_irqsave(&task_group_lock, flags);
10160 for_each_possible_cpu(i) 10160 for_each_possible_cpu(i)
10161 register_fair_sched_group(tg, i); 10161 register_fair_sched_group(tg, i);
10162 list_add_rcu(&tg->siblings, &tg->parent->children); 10162 list_add_rcu(&tg->siblings, &tg->parent->children);
10163 spin_unlock_irqrestore(&task_group_lock, flags); 10163 spin_unlock_irqrestore(&task_group_lock, flags);
10164 done: 10164 done:
10165 mutex_unlock(&shares_mutex); 10165 mutex_unlock(&shares_mutex);
10166 return 0; 10166 return 0;
10167 } 10167 }
10168 10168
10169 unsigned long sched_group_shares(struct task_group *tg) 10169 unsigned long sched_group_shares(struct task_group *tg)
10170 { 10170 {
10171 return tg->shares; 10171 return tg->shares;
10172 } 10172 }
10173 #endif 10173 #endif
10174 10174
10175 #ifdef CONFIG_RT_GROUP_SCHED 10175 #ifdef CONFIG_RT_GROUP_SCHED
10176 /* 10176 /*
10177 * Ensure that the real time constraints are schedulable. 10177 * Ensure that the real time constraints are schedulable.
10178 */ 10178 */
10179 static DEFINE_MUTEX(rt_constraints_mutex); 10179 static DEFINE_MUTEX(rt_constraints_mutex);
10180 10180
10181 static unsigned long to_ratio(u64 period, u64 runtime) 10181 static unsigned long to_ratio(u64 period, u64 runtime)
10182 { 10182 {
10183 if (runtime == RUNTIME_INF) 10183 if (runtime == RUNTIME_INF)
10184 return 1ULL << 20; 10184 return 1ULL << 20;
10185 10185
10186 return div64_u64(runtime << 20, period); 10186 return div64_u64(runtime << 20, period);
10187 } 10187 }
10188 10188
10189 /* Must be called with tasklist_lock held */ 10189 /* Must be called with tasklist_lock held */
10190 static inline int tg_has_rt_tasks(struct task_group *tg) 10190 static inline int tg_has_rt_tasks(struct task_group *tg)
10191 { 10191 {
10192 struct task_struct *g, *p; 10192 struct task_struct *g, *p;
10193 10193
10194 do_each_thread(g, p) { 10194 do_each_thread(g, p) {
10195 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 10195 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
10196 return 1; 10196 return 1;
10197 } while_each_thread(g, p); 10197 } while_each_thread(g, p);
10198 10198
10199 return 0; 10199 return 0;
10200 } 10200 }
10201 10201
10202 struct rt_schedulable_data { 10202 struct rt_schedulable_data {
10203 struct task_group *tg; 10203 struct task_group *tg;
10204 u64 rt_period; 10204 u64 rt_period;
10205 u64 rt_runtime; 10205 u64 rt_runtime;
10206 }; 10206 };
10207 10207
10208 static int tg_schedulable(struct task_group *tg, void *data) 10208 static int tg_schedulable(struct task_group *tg, void *data)
10209 { 10209 {
10210 struct rt_schedulable_data *d = data; 10210 struct rt_schedulable_data *d = data;
10211 struct task_group *child; 10211 struct task_group *child;
10212 unsigned long total, sum = 0; 10212 unsigned long total, sum = 0;
10213 u64 period, runtime; 10213 u64 period, runtime;
10214 10214
10215 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 10215 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
10216 runtime = tg->rt_bandwidth.rt_runtime; 10216 runtime = tg->rt_bandwidth.rt_runtime;
10217 10217
10218 if (tg == d->tg) { 10218 if (tg == d->tg) {
10219 period = d->rt_period; 10219 period = d->rt_period;
10220 runtime = d->rt_runtime; 10220 runtime = d->rt_runtime;
10221 } 10221 }
10222 10222
10223 #ifdef CONFIG_USER_SCHED 10223 #ifdef CONFIG_USER_SCHED
10224 if (tg == &root_task_group) { 10224 if (tg == &root_task_group) {
10225 period = global_rt_period(); 10225 period = global_rt_period();
10226 runtime = global_rt_runtime(); 10226 runtime = global_rt_runtime();
10227 } 10227 }
10228 #endif 10228 #endif
10229 10229
10230 /* 10230 /*
10231 * Cannot have more runtime than the period. 10231 * Cannot have more runtime than the period.
10232 */ 10232 */
10233 if (runtime > period && runtime != RUNTIME_INF) 10233 if (runtime > period && runtime != RUNTIME_INF)
10234 return -EINVAL; 10234 return -EINVAL;
10235 10235
10236 /* 10236 /*
10237 * Ensure we don't starve existing RT tasks. 10237 * Ensure we don't starve existing RT tasks.
10238 */ 10238 */
10239 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 10239 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
10240 return -EBUSY; 10240 return -EBUSY;
10241 10241
10242 total = to_ratio(period, runtime); 10242 total = to_ratio(period, runtime);
10243 10243
10244 /* 10244 /*
10245 * Nobody can have more than the global setting allows. 10245 * Nobody can have more than the global setting allows.
10246 */ 10246 */
10247 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 10247 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
10248 return -EINVAL; 10248 return -EINVAL;
10249 10249
10250 /* 10250 /*
10251 * The sum of our children's runtime should not exceed our own. 10251 * The sum of our children's runtime should not exceed our own.
10252 */ 10252 */
10253 list_for_each_entry_rcu(child, &tg->children, siblings) { 10253 list_for_each_entry_rcu(child, &tg->children, siblings) {
10254 period = ktime_to_ns(child->rt_bandwidth.rt_period); 10254 period = ktime_to_ns(child->rt_bandwidth.rt_period);
10255 runtime = child->rt_bandwidth.rt_runtime; 10255 runtime = child->rt_bandwidth.rt_runtime;
10256 10256
10257 if (child == d->tg) { 10257 if (child == d->tg) {
10258 period = d->rt_period; 10258 period = d->rt_period;
10259 runtime = d->rt_runtime; 10259 runtime = d->rt_runtime;
10260 } 10260 }
10261 10261
10262 sum += to_ratio(period, runtime); 10262 sum += to_ratio(period, runtime);
10263 } 10263 }
10264 10264
10265 if (sum > total) 10265 if (sum > total)
10266 return -EINVAL; 10266 return -EINVAL;
10267 10267
10268 return 0; 10268 return 0;
10269 } 10269 }
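
to_ratio() above turns a (period, runtime) pair into a 20-bit fixed-point fraction of the CPU, and tg_schedulable() then insists that the children's fractions sum to no more than the parent's own. A small userspace check of that arithmetic, with invented example values (only the ratio matters, so units are irrelevant here):

#include <stdio.h>
#include <stdint.h>

/* Same math as the kernel's to_ratio(): runtime/period scaled by 1 << 20. */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return (runtime << 20) / period;
}

int main(void)
{
	/* Hypothetical parent group: 950000 runtime per 1000000 period. */
	uint64_t parent = to_ratio(1000000, 950000);
	/* Two hypothetical children, each asking for 400000 per 1000000. */
	uint64_t sum = to_ratio(1000000, 400000) + to_ratio(1000000, 400000);

	printf("parent=%llu sum(children)=%llu -> %s\n",
	       (unsigned long long)parent, (unsigned long long)sum,
	       sum > parent ? "-EINVAL" : "schedulable");
	return 0;
}
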
10270 10270
10271 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 10271 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
10272 { 10272 {
10273 struct rt_schedulable_data data = { 10273 struct rt_schedulable_data data = {
10274 .tg = tg, 10274 .tg = tg,
10275 .rt_period = period, 10275 .rt_period = period,
10276 .rt_runtime = runtime, 10276 .rt_runtime = runtime,
10277 }; 10277 };
10278 10278
10279 return walk_tg_tree(tg_schedulable, tg_nop, &data); 10279 return walk_tg_tree(tg_schedulable, tg_nop, &data);
10280 } 10280 }
10281 10281
10282 static int tg_set_bandwidth(struct task_group *tg, 10282 static int tg_set_bandwidth(struct task_group *tg,
10283 u64 rt_period, u64 rt_runtime) 10283 u64 rt_period, u64 rt_runtime)
10284 { 10284 {
10285 int i, err = 0; 10285 int i, err = 0;
10286 10286
10287 mutex_lock(&rt_constraints_mutex); 10287 mutex_lock(&rt_constraints_mutex);
10288 read_lock(&tasklist_lock); 10288 read_lock(&tasklist_lock);
10289 err = __rt_schedulable(tg, rt_period, rt_runtime); 10289 err = __rt_schedulable(tg, rt_period, rt_runtime);
10290 if (err) 10290 if (err)
10291 goto unlock; 10291 goto unlock;
10292 10292
10293 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 10293 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10294 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 10294 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
10295 tg->rt_bandwidth.rt_runtime = rt_runtime; 10295 tg->rt_bandwidth.rt_runtime = rt_runtime;
10296 10296
10297 for_each_possible_cpu(i) { 10297 for_each_possible_cpu(i) {
10298 struct rt_rq *rt_rq = tg->rt_rq[i]; 10298 struct rt_rq *rt_rq = tg->rt_rq[i];
10299 10299
10300 spin_lock(&rt_rq->rt_runtime_lock); 10300 spin_lock(&rt_rq->rt_runtime_lock);
10301 rt_rq->rt_runtime = rt_runtime; 10301 rt_rq->rt_runtime = rt_runtime;
10302 spin_unlock(&rt_rq->rt_runtime_lock); 10302 spin_unlock(&rt_rq->rt_runtime_lock);
10303 } 10303 }
10304 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 10304 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10305 unlock: 10305 unlock:
10306 read_unlock(&tasklist_lock); 10306 read_unlock(&tasklist_lock);
10307 mutex_unlock(&rt_constraints_mutex); 10307 mutex_unlock(&rt_constraints_mutex);
10308 10308
10309 return err; 10309 return err;
10310 } 10310 }
10311 10311
10312 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 10312 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
10313 { 10313 {
10314 u64 rt_runtime, rt_period; 10314 u64 rt_runtime, rt_period;
10315 10315
10316 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 10316 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
10317 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 10317 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
10318 if (rt_runtime_us < 0) 10318 if (rt_runtime_us < 0)
10319 rt_runtime = RUNTIME_INF; 10319 rt_runtime = RUNTIME_INF;
10320 10320
10321 return tg_set_bandwidth(tg, rt_period, rt_runtime); 10321 return tg_set_bandwidth(tg, rt_period, rt_runtime);
10322 } 10322 }
10323 10323
10324 long sched_group_rt_runtime(struct task_group *tg) 10324 long sched_group_rt_runtime(struct task_group *tg)
10325 { 10325 {
10326 u64 rt_runtime_us; 10326 u64 rt_runtime_us;
10327 10327
10328 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 10328 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
10329 return -1; 10329 return -1;
10330 10330
10331 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 10331 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
10332 do_div(rt_runtime_us, NSEC_PER_USEC); 10332 do_div(rt_runtime_us, NSEC_PER_USEC);
10333 return rt_runtime_us; 10333 return rt_runtime_us;
10334 } 10334 }
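
From the cgroup side, rt_runtime_us uses -1 to mean "unlimited": sched_group_set_rt_runtime() maps a negative value to RUNTIME_INF, and sched_group_rt_runtime() maps RUNTIME_INF back to -1, otherwise converting nanoseconds back to microseconds. A hedged userspace restatement of just that conversion logic (RUNTIME_INF below is a stand-in for the kernel constant):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC	1000ULL
#define RUNTIME_INF	((uint64_t)~0ULL)	/* stand-in for the kernel constant */

static uint64_t runtime_us_to_ns(long us)
{
	return us < 0 ? RUNTIME_INF : (uint64_t)us * NSEC_PER_USEC;
}

static long runtime_ns_to_us(uint64_t ns)
{
	return ns == RUNTIME_INF ? -1 : (long)(ns / NSEC_PER_USEC);
}

int main(void)
{
	printf("%ld\n", runtime_ns_to_us(runtime_us_to_ns(950000)));	/* 950000 */
	printf("%ld\n", runtime_ns_to_us(runtime_us_to_ns(-1)));	/* -1 */
	return 0;
}
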
10335 10335
10336 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 10336 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
10337 { 10337 {
10338 u64 rt_runtime, rt_period; 10338 u64 rt_runtime, rt_period;
10339 10339
10340 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 10340 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
10341 rt_runtime = tg->rt_bandwidth.rt_runtime; 10341 rt_runtime = tg->rt_bandwidth.rt_runtime;
10342 10342
10343 if (rt_period == 0) 10343 if (rt_period == 0)
10344 return -EINVAL; 10344 return -EINVAL;
10345 10345
10346 return tg_set_bandwidth(tg, rt_period, rt_runtime); 10346 return tg_set_bandwidth(tg, rt_period, rt_runtime);
10347 } 10347 }
10348 10348
10349 long sched_group_rt_period(struct task_group *tg) 10349 long sched_group_rt_period(struct task_group *tg)
10350 { 10350 {
10351 u64 rt_period_us; 10351 u64 rt_period_us;
10352 10352
10353 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 10353 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
10354 do_div(rt_period_us, NSEC_PER_USEC); 10354 do_div(rt_period_us, NSEC_PER_USEC);
10355 return rt_period_us; 10355 return rt_period_us;
10356 } 10356 }
10357 10357
10358 static int sched_rt_global_constraints(void) 10358 static int sched_rt_global_constraints(void)
10359 { 10359 {
10360 u64 runtime, period; 10360 u64 runtime, period;
10361 int ret = 0; 10361 int ret = 0;
10362 10362
10363 if (sysctl_sched_rt_period <= 0) 10363 if (sysctl_sched_rt_period <= 0)
10364 return -EINVAL; 10364 return -EINVAL;
10365 10365
10366 runtime = global_rt_runtime(); 10366 runtime = global_rt_runtime();
10367 period = global_rt_period(); 10367 period = global_rt_period();
10368 10368
10369 /* 10369 /*
10370 * Sanity check on the sysctl variables. 10370 * Sanity check on the sysctl variables.
10371 */ 10371 */
10372 if (runtime > period && runtime != RUNTIME_INF) 10372 if (runtime > period && runtime != RUNTIME_INF)
10373 return -EINVAL; 10373 return -EINVAL;
10374 10374
10375 mutex_lock(&rt_constraints_mutex); 10375 mutex_lock(&rt_constraints_mutex);
10376 read_lock(&tasklist_lock); 10376 read_lock(&tasklist_lock);
10377 ret = __rt_schedulable(NULL, 0, 0); 10377 ret = __rt_schedulable(NULL, 0, 0);
10378 read_unlock(&tasklist_lock); 10378 read_unlock(&tasklist_lock);
10379 mutex_unlock(&rt_constraints_mutex); 10379 mutex_unlock(&rt_constraints_mutex);
10380 10380
10381 return ret; 10381 return ret;
10382 } 10382 }
10383 10383
10384 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 10384 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
10385 { 10385 {
10386 /* Don't accept realtime tasks when there is no way for them to run */ 10386 /* Don't accept realtime tasks when there is no way for them to run */
10387 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 10387 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
10388 return 0; 10388 return 0;
10389 10389
10390 return 1; 10390 return 1;
10391 } 10391 }
10392 10392
10393 #else /* !CONFIG_RT_GROUP_SCHED */ 10393 #else /* !CONFIG_RT_GROUP_SCHED */
10394 static int sched_rt_global_constraints(void) 10394 static int sched_rt_global_constraints(void)
10395 { 10395 {
10396 unsigned long flags; 10396 unsigned long flags;
10397 int i; 10397 int i;
10398 10398
10399 if (sysctl_sched_rt_period <= 0) 10399 if (sysctl_sched_rt_period <= 0)
10400 return -EINVAL; 10400 return -EINVAL;
10401 10401
10402 /* 10402 /*
10403 * There are always some RT tasks in the root group 10403 * There are always some RT tasks in the root group
10404 * -- migration, kstopmachine, etc. 10404 * -- migration, kstopmachine, etc.
10405 */ 10405 */
10406 if (sysctl_sched_rt_runtime == 0) 10406 if (sysctl_sched_rt_runtime == 0)
10407 return -EBUSY; 10407 return -EBUSY;
10408 10408
10409 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 10409 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
10410 for_each_possible_cpu(i) { 10410 for_each_possible_cpu(i) {
10411 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 10411 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
10412 10412
10413 spin_lock(&rt_rq->rt_runtime_lock); 10413 spin_lock(&rt_rq->rt_runtime_lock);
10414 rt_rq->rt_runtime = global_rt_runtime(); 10414 rt_rq->rt_runtime = global_rt_runtime();
10415 spin_unlock(&rt_rq->rt_runtime_lock); 10415 spin_unlock(&rt_rq->rt_runtime_lock);
10416 } 10416 }
10417 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 10417 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
10418 10418
10419 return 0; 10419 return 0;
10420 } 10420 }
10421 #endif /* CONFIG_RT_GROUP_SCHED */ 10421 #endif /* CONFIG_RT_GROUP_SCHED */
10422 10422
10423 int sched_rt_handler(struct ctl_table *table, int write, 10423 int sched_rt_handler(struct ctl_table *table, int write,
10424 void __user *buffer, size_t *lenp, 10424 void __user *buffer, size_t *lenp,
10425 loff_t *ppos) 10425 loff_t *ppos)
10426 { 10426 {
10427 int ret; 10427 int ret;
10428 int old_period, old_runtime; 10428 int old_period, old_runtime;
10429 static DEFINE_MUTEX(mutex); 10429 static DEFINE_MUTEX(mutex);
10430 10430
10431 mutex_lock(&mutex); 10431 mutex_lock(&mutex);
10432 old_period = sysctl_sched_rt_period; 10432 old_period = sysctl_sched_rt_period;
10433 old_runtime = sysctl_sched_rt_runtime; 10433 old_runtime = sysctl_sched_rt_runtime;
10434 10434
10435 ret = proc_dointvec(table, write, buffer, lenp, ppos); 10435 ret = proc_dointvec(table, write, buffer, lenp, ppos);
10436 10436
10437 if (!ret && write) { 10437 if (!ret && write) {
10438 ret = sched_rt_global_constraints(); 10438 ret = sched_rt_global_constraints();
10439 if (ret) { 10439 if (ret) {
10440 sysctl_sched_rt_period = old_period; 10440 sysctl_sched_rt_period = old_period;
10441 sysctl_sched_rt_runtime = old_runtime; 10441 sysctl_sched_rt_runtime = old_runtime;
10442 } else { 10442 } else {
10443 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 10443 def_rt_bandwidth.rt_runtime = global_rt_runtime();
10444 def_rt_bandwidth.rt_period = 10444 def_rt_bandwidth.rt_period =
10445 ns_to_ktime(global_rt_period()); 10445 ns_to_ktime(global_rt_period());
10446 } 10446 }
10447 } 10447 }
10448 mutex_unlock(&mutex); 10448 mutex_unlock(&mutex);
10449 10449
10450 return ret; 10450 return ret;
10451 } 10451 }
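
sched_rt_handler() follows a snapshot-then-rollback pattern: remember the old sysctl values, let proc_dointvec() write the new ones, and restore the snapshot if the global constraint check fails. A toy userspace version of the same control flow (the names and the validity rule here are invented for illustration):

#include <stdio.h>

static int period_us = 1000000;
static int runtime_us = 950000;

/* Toy stand-in for sched_rt_global_constraints(): runtime may not exceed
 * the period unless it is "unlimited" (negative). Returns 0 if OK. */
static int constraints_ok(void)
{
	return (runtime_us < 0 || runtime_us <= period_us) ? 0 : -1;
}

static int set_runtime(int new_runtime)
{
	int old = runtime_us;

	runtime_us = new_runtime;		/* tentative write */
	if (constraints_ok()) {
		runtime_us = old;		/* roll back on failure */
		return -1;
	}
	return 0;
}

int main(void)
{
	if (set_runtime(2000000))
		printf("rejected, runtime_us still %d\n", runtime_us);
	if (!set_runtime(500000))
		printf("accepted, runtime_us now %d\n", runtime_us);
	return 0;
}
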
10452 10452
10453 #ifdef CONFIG_CGROUP_SCHED 10453 #ifdef CONFIG_CGROUP_SCHED
10454 10454
10455 /* return corresponding task_group object of a cgroup */ 10455 /* return corresponding task_group object of a cgroup */
10456 static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 10456 static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
10457 { 10457 {
10458 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 10458 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
10459 struct task_group, css); 10459 struct task_group, css);
10460 } 10460 }
10461 10461
10462 static struct cgroup_subsys_state * 10462 static struct cgroup_subsys_state *
10463 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) 10463 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
10464 { 10464 {
10465 struct task_group *tg, *parent; 10465 struct task_group *tg, *parent;
10466 10466
10467 if (!cgrp->parent) { 10467 if (!cgrp->parent) {
10468 /* This is early initialization for the top cgroup */ 10468 /* This is early initialization for the top cgroup */
10469 return &init_task_group.css; 10469 return &init_task_group.css;
10470 } 10470 }
10471 10471
10472 parent = cgroup_tg(cgrp->parent); 10472 parent = cgroup_tg(cgrp->parent);
10473 tg = sched_create_group(parent); 10473 tg = sched_create_group(parent);
10474 if (IS_ERR(tg)) 10474 if (IS_ERR(tg))
10475 return ERR_PTR(-ENOMEM); 10475 return ERR_PTR(-ENOMEM);
10476 10476
10477 return &tg->css; 10477 return &tg->css;
10478 } 10478 }
10479 10479
10480 static void 10480 static void
10481 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 10481 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
10482 { 10482 {
10483 struct task_group *tg = cgroup_tg(cgrp); 10483 struct task_group *tg = cgroup_tg(cgrp);
10484 10484
10485 sched_destroy_group(tg); 10485 sched_destroy_group(tg);
10486 } 10486 }
10487 10487
10488 static int 10488 static int
10489 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 10489 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
10490 { 10490 {
10491 #ifdef CONFIG_RT_GROUP_SCHED 10491 #ifdef CONFIG_RT_GROUP_SCHED
10492 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 10492 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
10493 return -EINVAL; 10493 return -EINVAL;
10494 #else 10494 #else
10495 /* We don't support RT-tasks being in separate groups */ 10495 /* We don't support RT-tasks being in separate groups */
10496 if (tsk->sched_class != &fair_sched_class) 10496 if (tsk->sched_class != &fair_sched_class)
10497 return -EINVAL; 10497 return -EINVAL;
10498 #endif 10498 #endif
10499 return 0; 10499 return 0;
10500 } 10500 }
10501 10501
10502 static int 10502 static int
10503 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10503 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10504 struct task_struct *tsk, bool threadgroup) 10504 struct task_struct *tsk, bool threadgroup)
10505 { 10505 {
10506 int retval = cpu_cgroup_can_attach_task(cgrp, tsk); 10506 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
10507 if (retval) 10507 if (retval)
10508 return retval; 10508 return retval;
10509 if (threadgroup) { 10509 if (threadgroup) {
10510 struct task_struct *c; 10510 struct task_struct *c;
10511 rcu_read_lock(); 10511 rcu_read_lock();
10512 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 10512 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10513 retval = cpu_cgroup_can_attach_task(cgrp, c); 10513 retval = cpu_cgroup_can_attach_task(cgrp, c);
10514 if (retval) { 10514 if (retval) {
10515 rcu_read_unlock(); 10515 rcu_read_unlock();
10516 return retval; 10516 return retval;
10517 } 10517 }
10518 } 10518 }
10519 rcu_read_unlock(); 10519 rcu_read_unlock();
10520 } 10520 }
10521 return 0; 10521 return 0;
10522 } 10522 }
10523 10523
10524 static void 10524 static void
10525 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10525 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10526 struct cgroup *old_cont, struct task_struct *tsk, 10526 struct cgroup *old_cont, struct task_struct *tsk,
10527 bool threadgroup) 10527 bool threadgroup)
10528 { 10528 {
10529 sched_move_task(tsk); 10529 sched_move_task(tsk);
10530 if (threadgroup) { 10530 if (threadgroup) {
10531 struct task_struct *c; 10531 struct task_struct *c;
10532 rcu_read_lock(); 10532 rcu_read_lock();
10533 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 10533 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10534 sched_move_task(c); 10534 sched_move_task(c);
10535 } 10535 }
10536 rcu_read_unlock(); 10536 rcu_read_unlock();
10537 } 10537 }
10538 } 10538 }
10539 10539
10540 #ifdef CONFIG_FAIR_GROUP_SCHED 10540 #ifdef CONFIG_FAIR_GROUP_SCHED
10541 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 10541 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
10542 u64 shareval) 10542 u64 shareval)
10543 { 10543 {
10544 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 10544 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
10545 } 10545 }
10546 10546
10547 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 10547 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
10548 { 10548 {
10549 struct task_group *tg = cgroup_tg(cgrp); 10549 struct task_group *tg = cgroup_tg(cgrp);
10550 10550
10551 return (u64) tg->shares; 10551 return (u64) tg->shares;
10552 } 10552 }
10553 #endif /* CONFIG_FAIR_GROUP_SCHED */ 10553 #endif /* CONFIG_FAIR_GROUP_SCHED */
10554 10554
10555 #ifdef CONFIG_RT_GROUP_SCHED 10555 #ifdef CONFIG_RT_GROUP_SCHED
10556 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 10556 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
10557 s64 val) 10557 s64 val)
10558 { 10558 {
10559 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 10559 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
10560 } 10560 }
10561 10561
10562 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 10562 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
10563 { 10563 {
10564 return sched_group_rt_runtime(cgroup_tg(cgrp)); 10564 return sched_group_rt_runtime(cgroup_tg(cgrp));
10565 } 10565 }
10566 10566
10567 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 10567 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
10568 u64 rt_period_us) 10568 u64 rt_period_us)
10569 { 10569 {
10570 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 10570 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
10571 } 10571 }
10572 10572
10573 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 10573 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
10574 { 10574 {
10575 return sched_group_rt_period(cgroup_tg(cgrp)); 10575 return sched_group_rt_period(cgroup_tg(cgrp));
10576 } 10576 }
10577 #endif /* CONFIG_RT_GROUP_SCHED */ 10577 #endif /* CONFIG_RT_GROUP_SCHED */
10578 10578
10579 static struct cftype cpu_files[] = { 10579 static struct cftype cpu_files[] = {
10580 #ifdef CONFIG_FAIR_GROUP_SCHED 10580 #ifdef CONFIG_FAIR_GROUP_SCHED
10581 { 10581 {
10582 .name = "shares", 10582 .name = "shares",
10583 .read_u64 = cpu_shares_read_u64, 10583 .read_u64 = cpu_shares_read_u64,
10584 .write_u64 = cpu_shares_write_u64, 10584 .write_u64 = cpu_shares_write_u64,
10585 }, 10585 },
10586 #endif 10586 #endif
10587 #ifdef CONFIG_RT_GROUP_SCHED 10587 #ifdef CONFIG_RT_GROUP_SCHED
10588 { 10588 {
10589 .name = "rt_runtime_us", 10589 .name = "rt_runtime_us",
10590 .read_s64 = cpu_rt_runtime_read, 10590 .read_s64 = cpu_rt_runtime_read,
10591 .write_s64 = cpu_rt_runtime_write, 10591 .write_s64 = cpu_rt_runtime_write,
10592 }, 10592 },
10593 { 10593 {
10594 .name = "rt_period_us", 10594 .name = "rt_period_us",
10595 .read_u64 = cpu_rt_period_read_uint, 10595 .read_u64 = cpu_rt_period_read_uint,
10596 .write_u64 = cpu_rt_period_write_uint, 10596 .write_u64 = cpu_rt_period_write_uint,
10597 }, 10597 },
10598 #endif 10598 #endif
10599 }; 10599 };
10600 10600
10601 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 10601 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
10602 { 10602 {
10603 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); 10603 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
10604 } 10604 }
10605 10605
10606 struct cgroup_subsys cpu_cgroup_subsys = { 10606 struct cgroup_subsys cpu_cgroup_subsys = {
10607 .name = "cpu", 10607 .name = "cpu",
10608 .create = cpu_cgroup_create, 10608 .create = cpu_cgroup_create,
10609 .destroy = cpu_cgroup_destroy, 10609 .destroy = cpu_cgroup_destroy,
10610 .can_attach = cpu_cgroup_can_attach, 10610 .can_attach = cpu_cgroup_can_attach,
10611 .attach = cpu_cgroup_attach, 10611 .attach = cpu_cgroup_attach,
10612 .populate = cpu_cgroup_populate, 10612 .populate = cpu_cgroup_populate,
10613 .subsys_id = cpu_cgroup_subsys_id, 10613 .subsys_id = cpu_cgroup_subsys_id,
10614 .early_init = 1, 10614 .early_init = 1,
10615 }; 10615 };
10616 10616
10617 #endif /* CONFIG_CGROUP_SCHED */ 10617 #endif /* CONFIG_CGROUP_SCHED */
10618 10618
10619 #ifdef CONFIG_CGROUP_CPUACCT 10619 #ifdef CONFIG_CGROUP_CPUACCT
10620 10620
10621 /* 10621 /*
10622 * CPU accounting code for task groups. 10622 * CPU accounting code for task groups.
10623 * 10623 *
10624 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 10624 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
10625 * (balbir@in.ibm.com). 10625 * (balbir@in.ibm.com).
10626 */ 10626 */
10627 10627
10628 /* track cpu usage of a group of tasks and its child groups */ 10628 /* track cpu usage of a group of tasks and its child groups */
10629 struct cpuacct { 10629 struct cpuacct {
10630 struct cgroup_subsys_state css; 10630 struct cgroup_subsys_state css;
10631 /* cpuusage holds pointer to a u64-type object on every cpu */ 10631 /* cpuusage holds pointer to a u64-type object on every cpu */
10632 u64 *cpuusage; 10632 u64 *cpuusage;
10633 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 10633 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
10634 struct cpuacct *parent; 10634 struct cpuacct *parent;
10635 }; 10635 };
10636 10636
10637 struct cgroup_subsys cpuacct_subsys; 10637 struct cgroup_subsys cpuacct_subsys;
10638 10638
10639 /* return cpu accounting group corresponding to this container */ 10639 /* return cpu accounting group corresponding to this container */
10640 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) 10640 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
10641 { 10641 {
10642 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), 10642 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
10643 struct cpuacct, css); 10643 struct cpuacct, css);
10644 } 10644 }
10645 10645
10646 /* return cpu accounting group to which this task belongs */ 10646 /* return cpu accounting group to which this task belongs */
10647 static inline struct cpuacct *task_ca(struct task_struct *tsk) 10647 static inline struct cpuacct *task_ca(struct task_struct *tsk)
10648 { 10648 {
10649 return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 10649 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
10650 struct cpuacct, css); 10650 struct cpuacct, css);
10651 } 10651 }
10652 10652
10653 /* create a new cpu accounting group */ 10653 /* create a new cpu accounting group */
10654 static struct cgroup_subsys_state *cpuacct_create( 10654 static struct cgroup_subsys_state *cpuacct_create(
10655 struct cgroup_subsys *ss, struct cgroup *cgrp) 10655 struct cgroup_subsys *ss, struct cgroup *cgrp)
10656 { 10656 {
10657 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 10657 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
10658 int i; 10658 int i;
10659 10659
10660 if (!ca) 10660 if (!ca)
10661 goto out; 10661 goto out;
10662 10662
10663 ca->cpuusage = alloc_percpu(u64); 10663 ca->cpuusage = alloc_percpu(u64);
10664 if (!ca->cpuusage) 10664 if (!ca->cpuusage)
10665 goto out_free_ca; 10665 goto out_free_ca;
10666 10666
10667 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 10667 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
10668 if (percpu_counter_init(&ca->cpustat[i], 0)) 10668 if (percpu_counter_init(&ca->cpustat[i], 0))
10669 goto out_free_counters; 10669 goto out_free_counters;
10670 10670
10671 if (cgrp->parent) 10671 if (cgrp->parent)
10672 ca->parent = cgroup_ca(cgrp->parent); 10672 ca->parent = cgroup_ca(cgrp->parent);
10673 10673
10674 return &ca->css; 10674 return &ca->css;
10675 10675
10676 out_free_counters: 10676 out_free_counters:
10677 while (--i >= 0) 10677 while (--i >= 0)
10678 percpu_counter_destroy(&ca->cpustat[i]); 10678 percpu_counter_destroy(&ca->cpustat[i]);
10679 free_percpu(ca->cpuusage); 10679 free_percpu(ca->cpuusage);
10680 out_free_ca: 10680 out_free_ca:
10681 kfree(ca); 10681 kfree(ca);
10682 out: 10682 out:
10683 return ERR_PTR(-ENOMEM); 10683 return ERR_PTR(-ENOMEM);
10684 } 10684 }
10685 10685
10686 /* destroy an existing cpu accounting group */ 10686 /* destroy an existing cpu accounting group */
10687 static void 10687 static void
10688 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 10688 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
10689 { 10689 {
10690 struct cpuacct *ca = cgroup_ca(cgrp); 10690 struct cpuacct *ca = cgroup_ca(cgrp);
10691 int i; 10691 int i;
10692 10692
10693 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 10693 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
10694 percpu_counter_destroy(&ca->cpustat[i]); 10694 percpu_counter_destroy(&ca->cpustat[i]);
10695 free_percpu(ca->cpuusage); 10695 free_percpu(ca->cpuusage);
10696 kfree(ca); 10696 kfree(ca);
10697 } 10697 }
10698 10698
10699 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 10699 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
10700 { 10700 {
10701 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 10701 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
10702 u64 data; 10702 u64 data;
10703 10703
10704 #ifndef CONFIG_64BIT 10704 #ifndef CONFIG_64BIT
10705 /* 10705 /*
10706 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 10706 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
10707 */ 10707 */
10708 spin_lock_irq(&cpu_rq(cpu)->lock); 10708 spin_lock_irq(&cpu_rq(cpu)->lock);
10709 data = *cpuusage; 10709 data = *cpuusage;
10710 spin_unlock_irq(&cpu_rq(cpu)->lock); 10710 spin_unlock_irq(&cpu_rq(cpu)->lock);
10711 #else 10711 #else
10712 data = *cpuusage; 10712 data = *cpuusage;
10713 #endif 10713 #endif
10714 10714
10715 return data; 10715 return data;
10716 } 10716 }
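
The #ifndef CONFIG_64BIT branches above and below exist because a 64-bit load or store is not a single instruction on 32-bit machines, so an unlocked reader could observe half of a concurrent update. A userspace analogy of the same guard, using a pthread mutex where the kernel uses rq->lock; this is purely an illustration, not the kernel mechanism:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t usage_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t usage;			/* analogue of one per-cpu cpuusage slot */

static uint64_t usage_read(void)
{
	uint64_t v;

	pthread_mutex_lock(&usage_lock);	/* keep the two 32-bit halves consistent */
	v = usage;
	pthread_mutex_unlock(&usage_lock);
	return v;
}

static void usage_write(uint64_t val)
{
	pthread_mutex_lock(&usage_lock);
	usage = val;
	pthread_mutex_unlock(&usage_lock);
}

int main(void)
{
	usage_write(1ULL << 40);
	printf("%llu\n", (unsigned long long)usage_read());
	return 0;
}
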
10717 10717
10718 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 10718 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
10719 { 10719 {
10720 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 10720 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
10721 10721
10722 #ifndef CONFIG_64BIT 10722 #ifndef CONFIG_64BIT
10723 /* 10723 /*
10724 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 10724 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
10725 */ 10725 */
10726 spin_lock_irq(&cpu_rq(cpu)->lock); 10726 spin_lock_irq(&cpu_rq(cpu)->lock);
10727 *cpuusage = val; 10727 *cpuusage = val;
10728 spin_unlock_irq(&cpu_rq(cpu)->lock); 10728 spin_unlock_irq(&cpu_rq(cpu)->lock);
10729 #else 10729 #else
10730 *cpuusage = val; 10730 *cpuusage = val;
10731 #endif 10731 #endif
10732 } 10732 }
10733 10733
10734 /* return total cpu usage (in nanoseconds) of a group */ 10734 /* return total cpu usage (in nanoseconds) of a group */
10735 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 10735 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
10736 { 10736 {
10737 struct cpuacct *ca = cgroup_ca(cgrp); 10737 struct cpuacct *ca = cgroup_ca(cgrp);
10738 u64 totalcpuusage = 0; 10738 u64 totalcpuusage = 0;
10739 int i; 10739 int i;
10740 10740
10741 for_each_present_cpu(i) 10741 for_each_present_cpu(i)
10742 totalcpuusage += cpuacct_cpuusage_read(ca, i); 10742 totalcpuusage += cpuacct_cpuusage_read(ca, i);
10743 10743
10744 return totalcpuusage; 10744 return totalcpuusage;
10745 } 10745 }
10746 10746
10747 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 10747 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
10748 u64 reset) 10748 u64 reset)
10749 { 10749 {
10750 struct cpuacct *ca = cgroup_ca(cgrp); 10750 struct cpuacct *ca = cgroup_ca(cgrp);
10751 int err = 0; 10751 int err = 0;
10752 int i; 10752 int i;
10753 10753
10754 if (reset) { 10754 if (reset) {
10755 err = -EINVAL; 10755 err = -EINVAL;
10756 goto out; 10756 goto out;
10757 } 10757 }
10758 10758
10759 for_each_present_cpu(i) 10759 for_each_present_cpu(i)
10760 cpuacct_cpuusage_write(ca, i, 0); 10760 cpuacct_cpuusage_write(ca, i, 0);
10761 10761
10762 out: 10762 out:
10763 return err; 10763 return err;
10764 } 10764 }
10765 10765
10766 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 10766 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
10767 struct seq_file *m) 10767 struct seq_file *m)
10768 { 10768 {
10769 struct cpuacct *ca = cgroup_ca(cgroup); 10769 struct cpuacct *ca = cgroup_ca(cgroup);
10770 u64 percpu; 10770 u64 percpu;
10771 int i; 10771 int i;
10772 10772
10773 for_each_present_cpu(i) { 10773 for_each_present_cpu(i) {
10774 percpu = cpuacct_cpuusage_read(ca, i); 10774 percpu = cpuacct_cpuusage_read(ca, i);
10775 seq_printf(m, "%llu ", (unsigned long long) percpu); 10775 seq_printf(m, "%llu ", (unsigned long long) percpu);
10776 } 10776 }
10777 seq_printf(m, "\n"); 10777 seq_printf(m, "\n");
10778 return 0; 10778 return 0;
10779 } 10779 }
10780 10780
10781 static const char *cpuacct_stat_desc[] = { 10781 static const char *cpuacct_stat_desc[] = {
10782 [CPUACCT_STAT_USER] = "user", 10782 [CPUACCT_STAT_USER] = "user",
10783 [CPUACCT_STAT_SYSTEM] = "system", 10783 [CPUACCT_STAT_SYSTEM] = "system",
10784 }; 10784 };
10785 10785
10786 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 10786 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
10787 struct cgroup_map_cb *cb) 10787 struct cgroup_map_cb *cb)
10788 { 10788 {
10789 struct cpuacct *ca = cgroup_ca(cgrp); 10789 struct cpuacct *ca = cgroup_ca(cgrp);
10790 int i; 10790 int i;
10791 10791
10792 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { 10792 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
10793 s64 val = percpu_counter_read(&ca->cpustat[i]); 10793 s64 val = percpu_counter_read(&ca->cpustat[i]);
10794 val = cputime64_to_clock_t(val); 10794 val = cputime64_to_clock_t(val);
10795 cb->fill(cb, cpuacct_stat_desc[i], val); 10795 cb->fill(cb, cpuacct_stat_desc[i], val);
10796 } 10796 }
10797 return 0; 10797 return 0;
10798 } 10798 }
10799 10799
10800 static struct cftype files[] = { 10800 static struct cftype files[] = {
10801 { 10801 {
10802 .name = "usage", 10802 .name = "usage",
10803 .read_u64 = cpuusage_read, 10803 .read_u64 = cpuusage_read,
10804 .write_u64 = cpuusage_write, 10804 .write_u64 = cpuusage_write,
10805 }, 10805 },
10806 { 10806 {
10807 .name = "usage_percpu", 10807 .name = "usage_percpu",
10808 .read_seq_string = cpuacct_percpu_seq_read, 10808 .read_seq_string = cpuacct_percpu_seq_read,
10809 }, 10809 },
10810 { 10810 {
10811 .name = "stat", 10811 .name = "stat",
10812 .read_map = cpuacct_stats_show, 10812 .read_map = cpuacct_stats_show,
10813 }, 10813 },
10814 }; 10814 };
10815 10815
10816 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 10816 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
10817 { 10817 {
10818 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); 10818 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
10819 } 10819 }
10820 10820
10821 /* 10821 /*
10822 * charge this task's execution time to its accounting group. 10822 * charge this task's execution time to its accounting group.
10823 * 10823 *
10824 * called with rq->lock held. 10824 * called with rq->lock held.
10825 */ 10825 */
10826 static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 10826 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10827 { 10827 {
10828 struct cpuacct *ca; 10828 struct cpuacct *ca;
10829 int cpu; 10829 int cpu;
10830 10830
10831 if (unlikely(!cpuacct_subsys.active)) 10831 if (unlikely(!cpuacct_subsys.active))
10832 return; 10832 return;
10833 10833
10834 cpu = task_cpu(tsk); 10834 cpu = task_cpu(tsk);
10835 10835
10836 rcu_read_lock(); 10836 rcu_read_lock();
10837 10837
10838 ca = task_ca(tsk); 10838 ca = task_ca(tsk);
10839 10839
10840 for (; ca; ca = ca->parent) { 10840 for (; ca; ca = ca->parent) {
10841 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 10841 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
10842 *cpuusage += cputime; 10842 *cpuusage += cputime;
10843 } 10843 }
10844 10844
10845 rcu_read_unlock(); 10845 rcu_read_unlock();
10846 } 10846 }
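
cpuacct_charge() walks from the task's accounting group up through every ancestor, so a child's CPU time is also reflected in its parents' usage. A minimal userspace sketch of that parent-chain walk (the struct and field names here are invented):

#include <stdio.h>
#include <stdint.h>

struct acct_group {
	uint64_t usage;
	struct acct_group *parent;
};

/* Add cputime to the group and every ancestor, like cpuacct_charge(). */
static void charge(struct acct_group *grp, uint64_t cputime)
{
	for (; grp; grp = grp->parent)
		grp->usage += cputime;
}

int main(void)
{
	struct acct_group root = { 0, NULL };
	struct acct_group child = { 0, &root };

	charge(&child, 1000);
	printf("child=%llu root=%llu\n",
	       (unsigned long long)child.usage,
	       (unsigned long long)root.usage);	/* both 1000 */
	return 0;
}
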
10847 10847
10848 /* 10848 /*
10849 * Charge the system/user time to the task's accounting group. 10849 * Charge the system/user time to the task's accounting group.
10850 */ 10850 */
10851 static void cpuacct_update_stats(struct task_struct *tsk, 10851 static void cpuacct_update_stats(struct task_struct *tsk,
10852 enum cpuacct_stat_index idx, cputime_t val) 10852 enum cpuacct_stat_index idx, cputime_t val)
10853 { 10853 {
10854 struct cpuacct *ca; 10854 struct cpuacct *ca;
10855 10855
10856 if (unlikely(!cpuacct_subsys.active)) 10856 if (unlikely(!cpuacct_subsys.active))
10857 return; 10857 return;
10858 10858
10859 rcu_read_lock(); 10859 rcu_read_lock();
10860 ca = task_ca(tsk); 10860 ca = task_ca(tsk);
10861 10861
10862 do { 10862 do {
10863 percpu_counter_add(&ca->cpustat[idx], val); 10863 percpu_counter_add(&ca->cpustat[idx], val);
10864 ca = ca->parent; 10864 ca = ca->parent;
10865 } while (ca); 10865 } while (ca);
10866 rcu_read_unlock(); 10866 rcu_read_unlock();
10867 } 10867 }
10868 10868
10869 struct cgroup_subsys cpuacct_subsys = { 10869 struct cgroup_subsys cpuacct_subsys = {
10870 .name = "cpuacct", 10870 .name = "cpuacct",
10871 .create = cpuacct_create, 10871 .create = cpuacct_create,
10872 .destroy = cpuacct_destroy, 10872 .destroy = cpuacct_destroy,
10873 .populate = cpuacct_populate, 10873 .populate = cpuacct_populate,
10874 .subsys_id = cpuacct_subsys_id, 10874 .subsys_id = cpuacct_subsys_id,
10875 }; 10875 };
10876 #endif /* CONFIG_CGROUP_CPUACCT */ 10876 #endif /* CONFIG_CGROUP_CPUACCT */
10877 10877
10878 #ifndef CONFIG_SMP 10878 #ifndef CONFIG_SMP
10879 10879
10880 int rcu_expedited_torture_stats(char *page) 10880 int rcu_expedited_torture_stats(char *page)
10881 { 10881 {
10882 return 0; 10882 return 0;
10883 } 10883 }
10884 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); 10884 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10885 10885
10886 void synchronize_sched_expedited(void) 10886 void synchronize_sched_expedited(void)
10887 { 10887 {
10888 } 10888 }
10889 EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 10889 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10890 10890
10891 #else /* #ifndef CONFIG_SMP */ 10891 #else /* #ifndef CONFIG_SMP */
10892 10892
10893 static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); 10893 static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
10894 static DEFINE_MUTEX(rcu_sched_expedited_mutex); 10894 static DEFINE_MUTEX(rcu_sched_expedited_mutex);
10895 10895
10896 #define RCU_EXPEDITED_STATE_POST -2 10896 #define RCU_EXPEDITED_STATE_POST -2
10897 #define RCU_EXPEDITED_STATE_IDLE -1 10897 #define RCU_EXPEDITED_STATE_IDLE -1
10898 10898
10899 static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 10899 static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10900 10900
10901 int rcu_expedited_torture_stats(char *page) 10901 int rcu_expedited_torture_stats(char *page)
10902 { 10902 {
10903 int cnt = 0; 10903 int cnt = 0;
10904 int cpu; 10904 int cpu;
10905 10905
10906 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); 10906 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
10907 for_each_online_cpu(cpu) { 10907 for_each_online_cpu(cpu) {
10908 cnt += sprintf(&page[cnt], " %d:%d", 10908 cnt += sprintf(&page[cnt], " %d:%d",
10909 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); 10909 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
10910 } 10910 }
10911 cnt += sprintf(&page[cnt], "\n"); 10911 cnt += sprintf(&page[cnt], "\n");
10912 return cnt; 10912 return cnt;
10913 } 10913 }
10914 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); 10914 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10915 10915
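rcu_expedited_torture_stats() builds its output by keeping a running byte count and calling sprintf() at the current offset for each online CPU. The same accumulation pattern in a self-contained userspace form, with the per-CPU dest_cpu values faked for the example:

```c
/*
 * Userspace sketch of the buffer-building pattern used in
 * rcu_expedited_torture_stats(): cnt tracks how far into the buffer the
 * next sprintf() should write.  The per-CPU values are invented.
 */
#include <stdio.h>

#define NR_ONLINE 4

int main(void)
{
	char page[256];
	int fake_dest_cpu[NR_ONLINE] = { -1, -1, 2, -1 };
	int cnt = 0;
	int cpu;

	cnt += sprintf(&page[cnt], "state: %d /", -1);
	for (cpu = 0; cpu < NR_ONLINE; cpu++)
		cnt += sprintf(&page[cnt], " %d:%d", cpu, fake_dest_cpu[cpu]);
	cnt += sprintf(&page[cnt], "\n");

	fputs(page, stdout);
	printf("bytes written: %d\n", cnt);
	return 0;
}
```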
10916 static long synchronize_sched_expedited_count; 10916 static long synchronize_sched_expedited_count;
10917 10917
10918 /* 10918 /*
10919 * Wait for an rcu-sched grace period to elapse, but use "big hammer" 10919 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
10920 * approach to force grace period to end quickly. This consumes 10920 * approach to force grace period to end quickly. This consumes
10921 * significant time on all CPUs, and is thus not recommended for 10921 * significant time on all CPUs, and is thus not recommended for
10922 * any sort of common-case code. 10922 * any sort of common-case code.
10923 * 10923 *
10924 * Note that it is illegal to call this function while holding any 10924 * Note that it is illegal to call this function while holding any
10925 * lock that is acquired by a CPU-hotplug notifier. Failing to 10925 * lock that is acquired by a CPU-hotplug notifier. Failing to
10926 * observe this restriction will result in deadlock. 10926 * observe this restriction will result in deadlock.
10927 */ 10927 */
10928 void synchronize_sched_expedited(void) 10928 void synchronize_sched_expedited(void)
10929 { 10929 {
10930 int cpu; 10930 int cpu;
10931 unsigned long flags; 10931 unsigned long flags;
10932 bool need_full_sync = 0; 10932 bool need_full_sync = 0;
10933 struct rq *rq; 10933 struct rq *rq;
10934 struct migration_req *req; 10934 struct migration_req *req;
10935 long snap; 10935 long snap;
10936 int trycount = 0; 10936 int trycount = 0;
10937 10937
10938 smp_mb(); /* ensure prior mod happens before capturing snap. */ 10938 smp_mb(); /* ensure prior mod happens before capturing snap. */
10939 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; 10939 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
10940 get_online_cpus(); 10940 get_online_cpus();
10941 while (!mutex_trylock(&rcu_sched_expedited_mutex)) { 10941 while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
10942 put_online_cpus(); 10942 put_online_cpus();
10943 if (trycount++ < 10) 10943 if (trycount++ < 10)
10944 udelay(trycount * num_online_cpus()); 10944 udelay(trycount * num_online_cpus());
10945 else { 10945 else {
10946 synchronize_sched(); 10946 synchronize_sched();
10947 return; 10947 return;
10948 } 10948 }
10949 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { 10949 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
10950 smp_mb(); /* ensure test happens before caller kfree */ 10950 smp_mb(); /* ensure test happens before caller kfree */
10951 return; 10951 return;
10952 } 10952 }
10953 get_online_cpus(); 10953 get_online_cpus();
10954 } 10954 }
10955 rcu_expedited_state = RCU_EXPEDITED_STATE_POST; 10955 rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
10956 for_each_online_cpu(cpu) { 10956 for_each_online_cpu(cpu) {
10957 rq = cpu_rq(cpu); 10957 rq = cpu_rq(cpu);
10958 req = &per_cpu(rcu_migration_req, cpu); 10958 req = &per_cpu(rcu_migration_req, cpu);
10959 init_completion(&req->done); 10959 init_completion(&req->done);
10960 req->task = NULL; 10960 req->task = NULL;
10961 req->dest_cpu = RCU_MIGRATION_NEED_QS; 10961 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10962 spin_lock_irqsave(&rq->lock, flags); 10962 spin_lock_irqsave(&rq->lock, flags);
10963 list_add(&req->list, &rq->migration_queue); 10963 list_add(&req->list, &rq->migration_queue);
10964 spin_unlock_irqrestore(&rq->lock, flags); 10964 spin_unlock_irqrestore(&rq->lock, flags);
10965 wake_up_process(rq->migration_thread); 10965 wake_up_process(rq->migration_thread);
10966 } 10966 }
10967 for_each_online_cpu(cpu) { 10967 for_each_online_cpu(cpu) {
10968 rcu_expedited_state = cpu; 10968 rcu_expedited_state = cpu;
10969 req = &per_cpu(rcu_migration_req, cpu); 10969 req = &per_cpu(rcu_migration_req, cpu);
10970 rq = cpu_rq(cpu); 10970 rq = cpu_rq(cpu);
10971 wait_for_completion(&req->done); 10971 wait_for_completion(&req->done);
10972 spin_lock_irqsave(&rq->lock, flags); 10972 spin_lock_irqsave(&rq->lock, flags);
10973 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) 10973 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
10974 need_full_sync = 1; 10974 need_full_sync = 1;
10975 req->dest_cpu = RCU_MIGRATION_IDLE; 10975 req->dest_cpu = RCU_MIGRATION_IDLE;
10976 spin_unlock_irqrestore(&rq->lock, flags); 10976 spin_unlock_irqrestore(&rq->lock, flags);
10977 } 10977 }
10978 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 10978 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10979 synchronize_sched_expedited_count++; 10979 synchronize_sched_expedited_count++;
10980 mutex_unlock(&rcu_sched_expedited_mutex); 10980 mutex_unlock(&rcu_sched_expedited_mutex);
10981 put_online_cpus(); 10981 put_online_cpus();
10982 if (need_full_sync) 10982 if (need_full_sync)
10983 synchronize_sched(); 10983 synchronize_sched();
10984 } 10984 }
10985 EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 10985 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10986 10986
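The opening of synchronize_sched_expedited() combines a counter snapshot with mutex_trylock() and an increasing udelay() backoff: if another caller completes a full expedited pass after the snapshot was taken, the current caller can return without doing any work of its own. A hedged userspace sketch of that retry logic, using pthreads and invented names; unlike the real code, the fallback here simply blocks on the mutex instead of calling synchronize_sched():

```c
/*
 * Userspace sketch of the snapshot/trylock/backoff pattern at the top of
 * synchronize_sched_expedited().  All names are invented; this illustrates
 * the control flow only and is not kernel code.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t expedited_mutex = PTHREAD_MUTEX_INITIALIZER;
static long expedited_count;		/* bumped once per completed pass */

static void do_expedited_pass(void)
{
	usleep(1000);	/* stand-in for queueing work on every CPU and waiting */
}

static void synchronize_expedited(void)
{
	long snap = expedited_count + 1;
	int trycount = 0;

	while (pthread_mutex_trylock(&expedited_mutex) != 0) {
		/*
		 * Counter moved two ticks past where it was when we started:
		 * a whole pass began and finished after this call, so the
		 * caller's requirement is already satisfied.
		 */
		if (expedited_count - snap > 0)
			return;
		if (trycount++ < 10) {
			usleep(trycount * 10);	/* back off, then retry */
		} else {
			/*
			 * The real code gives up and calls synchronize_sched()
			 * here; this sketch just blocks on the mutex instead.
			 */
			pthread_mutex_lock(&expedited_mutex);
			break;
		}
	}
	do_expedited_pass();
	expedited_count++;
	pthread_mutex_unlock(&expedited_mutex);
}

int main(void)
{
	synchronize_expedited();
	printf("expedited passes completed: %ld\n", expedited_count);
	return 0;
}
```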
10987 #endif /* #else #ifndef CONFIG_SMP */ 10987 #endif /* #else #ifndef CONFIG_SMP */
10988 10988