Eric Lee / linux-smarc-t335x-v3.2

1

/*

1

/*

2

* kernel/sched.c

2

* kernel/sched.c

3

*

3

*

4

* Kernel scheduler and related syscalls

4

* Kernel scheduler and related syscalls

5

*

5

*

6

7

*

7

*

8

* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and

8

* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and

9

* make semaphores SMP safe

9

* make semaphores SMP safe

10

* 1998-11-19 Implemented schedule_timeout() and related stuff

10

* 1998-11-19 Implemented schedule_timeout() and related stuff

11

* by Andrea Arcangeli

11

* by Andrea Arcangeli

12

* 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:

12

* 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:

13

* hybrid priority-list and round-robin design with

13

* hybrid priority-list and round-robin design with

14

* an array-switch method of distributing timeslices

14

* an array-switch method of distributing timeslices

15

* and per-CPU runqueues. Cleanups and useful suggestions

15

* and per-CPU runqueues. Cleanups and useful suggestions

16

* by Davide Libenzi, preemptible kernel bits by Robert Love.

16

* by Davide Libenzi, preemptible kernel bits by Robert Love.

17

* 2003-09-03 Interactivity tuning by Con Kolivas.

17

* 2003-09-03 Interactivity tuning by Con Kolivas.

18

* 2004-04-02 Scheduler domains code by Nick Piggin

18

* 2004-04-02 Scheduler domains code by Nick Piggin

19

* 2007-04-15 Work begun on replacing all interactivity tuning with a

19

* 2007-04-15 Work begun on replacing all interactivity tuning with a

20

* fair scheduling design by Con Kolivas.

20

* fair scheduling design by Con Kolivas.

21

* 2007-05-05 Load balancing (smp-nice) and other improvements

21

* 2007-05-05 Load balancing (smp-nice) and other improvements

22

* by Peter Williams

22

* by Peter Williams

23

* 2007-05-06 Interactivity improvements to CFS by Mike Galbraith

23

* 2007-05-06 Interactivity improvements to CFS by Mike Galbraith

24

* 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri

24

* 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri

25

* 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,

25

* 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,

26

* Thomas Gleixner, Mike Kravetz

26

* Thomas Gleixner, Mike Kravetz

27

*/

27

*/

28

29

#include <linux/mm.h>

29

#include <linux/mm.h>

30

#include <linux/module.h>

30

#include <linux/module.h>

31

#include <linux/nmi.h>

31

#include <linux/nmi.h>

32

#include <linux/init.h>

32

#include <linux/init.h>

33

#include <linux/uaccess.h>

33

#include <linux/uaccess.h>

34

#include <linux/highmem.h>

34

#include <linux/highmem.h>

35

#include <asm/mmu_context.h>

35

#include <asm/mmu_context.h>

36

#include <linux/interrupt.h>

36

#include <linux/interrupt.h>

37

#include <linux/capability.h>

37

#include <linux/capability.h>

38

#include <linux/completion.h>

38

#include <linux/completion.h>

39

#include <linux/kernel_stat.h>

39

#include <linux/kernel_stat.h>

40

#include <linux/debug_locks.h>

40

#include <linux/debug_locks.h>

41

#include <linux/perf_event.h>

41

#include <linux/perf_event.h>

42

#include <linux/security.h>

42

#include <linux/security.h>

43

#include <linux/notifier.h>

43

#include <linux/notifier.h>

44

#include <linux/profile.h>

44

#include <linux/profile.h>

45

#include <linux/freezer.h>

45

#include <linux/freezer.h>

46

#include <linux/vmalloc.h>

46

#include <linux/vmalloc.h>

47

#include <linux/blkdev.h>

47

#include <linux/blkdev.h>

48

#include <linux/delay.h>

48

#include <linux/delay.h>

49

#include <linux/pid_namespace.h>

49

#include <linux/pid_namespace.h>

50

#include <linux/smp.h>

50

#include <linux/smp.h>

51

#include <linux/threads.h>

51

#include <linux/threads.h>

52

#include <linux/timer.h>

52

#include <linux/timer.h>

53

#include <linux/rcupdate.h>

53

#include <linux/rcupdate.h>

54

#include <linux/cpu.h>

54

#include <linux/cpu.h>

55

#include <linux/cpuset.h>

55

#include <linux/cpuset.h>

56

#include <linux/percpu.h>

56

#include <linux/percpu.h>

57

#include <linux/proc_fs.h>

57

#include <linux/proc_fs.h>

58

#include <linux/seq_file.h>

58

#include <linux/seq_file.h>

59

#include <linux/stop_machine.h>

59

#include <linux/stop_machine.h>

60

#include <linux/sysctl.h>

60

#include <linux/sysctl.h>

61

#include <linux/syscalls.h>

61

#include <linux/syscalls.h>

62

#include <linux/times.h>

62

#include <linux/times.h>

63

#include <linux/tsacct_kern.h>

63

#include <linux/tsacct_kern.h>

64

#include <linux/kprobes.h>

64

#include <linux/kprobes.h>

65

#include <linux/delayacct.h>

65

#include <linux/delayacct.h>

66

#include <linux/unistd.h>

66

#include <linux/unistd.h>

67

#include <linux/pagemap.h>

67

#include <linux/pagemap.h>

68

#include <linux/hrtimer.h>

68

#include <linux/hrtimer.h>

69

#include <linux/tick.h>

69

#include <linux/tick.h>

70

#include <linux/debugfs.h>

70

#include <linux/debugfs.h>

71

#include <linux/ctype.h>

71

#include <linux/ctype.h>

72

#include <linux/ftrace.h>

72

#include <linux/ftrace.h>

73

#include <linux/slab.h>

73

#include <linux/slab.h>

74

75

#include <asm/tlb.h>

75

#include <asm/tlb.h>

76

#include <asm/irq_regs.h>

76

#include <asm/irq_regs.h>

77

#include <asm/mutex.h>

77

#include <asm/mutex.h>

78

#ifdef CONFIG_PARAVIRT

78

#ifdef CONFIG_PARAVIRT

79

#include <asm/paravirt.h>

79

#include <asm/paravirt.h>

80

#endif

80

#endif

81

82

#include "sched_cpupri.h"

82

#include "sched_cpupri.h"

83

#include "workqueue_sched.h"

83

#include "workqueue_sched.h"

84

#include "sched_autogroup.h"

84

#include "sched_autogroup.h"

85

86

#define CREATE_TRACE_POINTS

86

#define CREATE_TRACE_POINTS

87

#include <trace/events/sched.h>

87

#include <trace/events/sched.h>

88

89

/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 *
 * TASK_NICE() applies the reverse mapping to a task's ->static_prio.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters;
 * it is a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))

/*
 * Helpers for converting nanosecond timing to jiffy resolution.
 *
 * Note that TIME is cast to unsigned long before the division.
 */
#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

/* Aliases for the load scale and its shift. */
#define NICE_0_LOAD		SCHED_LOAD_SCALE
#define NICE_0_SHIFT		SCHED_LOAD_SHIFT

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
 * Timeslices get refilled after they expire.
 *
 * The value is expressed in jiffies (100 * HZ / 1000).
 */
#define DEF_TIMESLICE		(100 * HZ / 1000)

/*
 * Single value that denotes runtime == period, i.e. unlimited time.
 */
#define RUNTIME_INF	((u64)~0ULL)

127

128

/*
 * rt_policy - test whether a scheduling policy is a real-time policy.
 * @policy: scheduling policy value to test
 *
 * Returns 1 when @policy is SCHED_FIFO or SCHED_RR, 0 for anything else.
 */
static inline int rt_policy(int policy)
{
	if (policy == SCHED_FIFO || policy == SCHED_RR)
		return 1;
	return 0;
}

134

135

static inline int task_has_rt_policy(struct task_struct *p)

135

static inline int task_has_rt_policy(struct task_struct *p)

136

{

136

{

137

return rt_policy(p->policy);

137

return rt_policy(p->policy);

138

}

138

}

139

140

/*

140

/*

141

* This is the priority-queue data structure of the RT scheduling class:

141

* This is the priority-queue data structure of the RT scheduling class:

142

*/

142

*/

143

struct rt_prio_array {

143

struct rt_prio_array {

144

DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */

144

DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */

145

struct list_head queue[MAX_RT_PRIO];

145

struct list_head queue[MAX_RT_PRIO];

146

};

146

};

147

148

struct rt_bandwidth {

148

struct rt_bandwidth {

149

/* nests inside the rq lock: */

149

/* nests inside the rq lock: */

150

raw_spinlock_t rt_runtime_lock;

150

raw_spinlock_t rt_runtime_lock;

151

ktime_t rt_period;

151

ktime_t rt_period;

152

u64 rt_runtime;

152

u64 rt_runtime;

153

struct hrtimer rt_period_timer;

153

struct hrtimer rt_period_timer;

154

};

154

};

155

156

static struct rt_bandwidth def_rt_bandwidth;

156

static struct rt_bandwidth def_rt_bandwidth;

157

158

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

158

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

159

160

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)

160

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)

161

{

161

{

162

struct rt_bandwidth *rt_b =

162

struct rt_bandwidth *rt_b =

163

container_of(timer, struct rt_bandwidth, rt_period_timer);

163

container_of(timer, struct rt_bandwidth, rt_period_timer);

164

ktime_t now;

164

ktime_t now;

165

int overrun;

165

int overrun;

166

int idle = 0;

166

int idle = 0;

167

168

for (;;) {

168

for (;;) {

169

now = hrtimer_cb_get_time(timer);

169

now = hrtimer_cb_get_time(timer);

170

overrun = hrtimer_forward(timer, now, rt_b->rt_period);

170

overrun = hrtimer_forward(timer, now, rt_b->rt_period);

171

172

if (!overrun)

172

if (!overrun)

173

break;

173

break;

174

175

idle = do_sched_rt_period_timer(rt_b, overrun);

175

idle = do_sched_rt_period_timer(rt_b, overrun);

176

}

176

}

177

178

return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;

178

return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;

179

}

179

}

180

181

static

181

static

182

void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)

182

void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)

183

{

183

{

184

rt_b->rt_period = ns_to_ktime(period);

184

rt_b->rt_period = ns_to_ktime(period);

185

rt_b->rt_runtime = runtime;

185

rt_b->rt_runtime = runtime;

186

187

raw_spin_lock_init(&rt_b->rt_runtime_lock);

187

raw_spin_lock_init(&rt_b->rt_runtime_lock);

188

189

hrtimer_init(&rt_b->rt_period_timer,

189

hrtimer_init(&rt_b->rt_period_timer,

190

CLOCK_MONOTONIC, HRTIMER_MODE_REL);

190

CLOCK_MONOTONIC, HRTIMER_MODE_REL);

191

rt_b->rt_period_timer.function = sched_rt_period_timer;

191

rt_b->rt_period_timer.function = sched_rt_period_timer;

192

}

192

}

193

194

static inline int rt_bandwidth_enabled(void)

194

static inline int rt_bandwidth_enabled(void)

195

{

195

{

196

return sysctl_sched_rt_runtime >= 0;

196

return sysctl_sched_rt_runtime >= 0;

197

}

197

}

198

199

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)

199

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)

200

{

200

{

201

ktime_t now;

201

ktime_t now;

202

203

if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)

203

if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)

204

return;

204

return;

205

206

if (hrtimer_active(&rt_b->rt_period_timer))

206

if (hrtimer_active(&rt_b->rt_period_timer))

207

return;

207

return;

208

209

raw_spin_lock(&rt_b->rt_runtime_lock);

209

raw_spin_lock(&rt_b->rt_runtime_lock);

210

for (;;) {

210

for (;;) {

211

unsigned long delta;

211

unsigned long delta;

212

ktime_t soft, hard;

212

ktime_t soft, hard;

213

214

if (hrtimer_active(&rt_b->rt_period_timer))

214

if (hrtimer_active(&rt_b->rt_period_timer))

215

break;

215

break;

216

217

now = hrtimer_cb_get_time(&rt_b->rt_period_timer);

217

now = hrtimer_cb_get_time(&rt_b->rt_period_timer);

218

hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);

218

hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);

219

220

soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);

220

soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);

221

hard = hrtimer_get_expires(&rt_b->rt_period_timer);

221

hard = hrtimer_get_expires(&rt_b->rt_period_timer);

222

delta = ktime_to_ns(ktime_sub(hard, soft));

222

delta = ktime_to_ns(ktime_sub(hard, soft));

223

__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,

223

__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,

224

HRTIMER_MODE_ABS_PINNED, 0);

224

HRTIMER_MODE_ABS_PINNED, 0);

225

}

225

}

226

raw_spin_unlock(&rt_b->rt_runtime_lock);

226

raw_spin_unlock(&rt_b->rt_runtime_lock);

227

}

227

}

228

229

#ifdef CONFIG_RT_GROUP_SCHED

229

#ifdef CONFIG_RT_GROUP_SCHED

230

static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)

230

static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)

231

{

231

{

232

hrtimer_cancel(&rt_b->rt_period_timer);

232

hrtimer_cancel(&rt_b->rt_period_timer);

233

}

233

}

234

#endif

234

#endif

235

236

/*
 * sched_domains_mutex serializes calls to init_sched_domains,
 * detach_destroy_domains and partition_sched_domains.
 */
static DEFINE_MUTEX(sched_domains_mutex);

241

242

#ifdef CONFIG_CGROUP_SCHED

242

#ifdef CONFIG_CGROUP_SCHED

243

244

#include <linux/cgroup.h>

244

#include <linux/cgroup.h>

245

246

struct cfs_rq;

246

struct cfs_rq;

247

248

static LIST_HEAD(task_groups);

248

static LIST_HEAD(task_groups);

249

250

/* task group related information */

250

/* task group related information */

251

struct task_group {

251

struct task_group {

252

struct cgroup_subsys_state css;

252

struct cgroup_subsys_state css;

253

254

#ifdef CONFIG_FAIR_GROUP_SCHED

254

#ifdef CONFIG_FAIR_GROUP_SCHED

255

/* schedulable entities of this group on each cpu */

255

/* schedulable entities of this group on each cpu */

256

struct sched_entity **se;

256

struct sched_entity **se;

257

/* runqueue "owned" by this group on each cpu */

257

/* runqueue "owned" by this group on each cpu */

258

struct cfs_rq **cfs_rq;

258

struct cfs_rq **cfs_rq;

259

unsigned long shares;

259

unsigned long shares;

260

261

atomic_t load_weight;

261

atomic_t load_weight;

262

#endif

262

#endif

263

264

#ifdef CONFIG_RT_GROUP_SCHED

264

#ifdef CONFIG_RT_GROUP_SCHED

265

struct sched_rt_entity **rt_se;

265

struct sched_rt_entity **rt_se;

266

struct rt_rq **rt_rq;

266

struct rt_rq **rt_rq;

267

268

struct rt_bandwidth rt_bandwidth;

268

struct rt_bandwidth rt_bandwidth;

269

#endif

269

#endif

270

271

struct rcu_head rcu;

271

struct rcu_head rcu;

272

struct list_head list;

272

struct list_head list;

273

274

struct task_group *parent;

274

struct task_group *parent;

275

struct list_head siblings;

275

struct list_head siblings;

276

struct list_head children;

276

struct list_head children;

277

278

#ifdef CONFIG_SCHED_AUTOGROUP

278

#ifdef CONFIG_SCHED_AUTOGROUP

279

struct autogroup *autogroup;

279

struct autogroup *autogroup;

280

#endif

280

#endif

281

};

281

};

282

283

/* task_group_lock serializes the addition/removal of task groups */

283

/* task_group_lock serializes the addition/removal of task groups */

284

static DEFINE_SPINLOCK(task_group_lock);

284

static DEFINE_SPINLOCK(task_group_lock);

285

286

#ifdef CONFIG_FAIR_GROUP_SCHED

286

#ifdef CONFIG_FAIR_GROUP_SCHED

287

288

# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD

288

# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD

289

290

/*

290

/*

291

* A weight of 0 or 1 can cause arithmetics problems.

291

* A weight of 0 or 1 can cause arithmetics problems.

292

* A weight of a cfs_rq is the sum of weights of which entities

292

* A weight of a cfs_rq is the sum of weights of which entities

293

* are queued on this cfs_rq, so a weight of a entity should not be

293

* are queued on this cfs_rq, so a weight of a entity should not be

294

* too large, so as the shares value of a task group.

294

* too large, so as the shares value of a task group.

295

* (The default weight is 1024 - so there's no practical

295

* (The default weight is 1024 - so there's no practical

296

* limitation from this.)

296

* limitation from this.)

297

*/

297

*/

298

#define MIN_SHARES (1UL << 1)

298

#define MIN_SHARES (1UL << 1)

299

#define MAX_SHARES (1UL << 18)

299

#define MAX_SHARES (1UL << 18)

300

301

static int root_task_group_load = ROOT_TASK_GROUP_LOAD;

301

static int root_task_group_load = ROOT_TASK_GROUP_LOAD;

302

#endif

302

#endif

303

304

/* Default task group.

304

/* Default task group.

305

* Every task in system belong to this group at bootup.

305

* Every task in system belong to this group at bootup.

306

*/

306

*/

307

struct task_group root_task_group;

307

struct task_group root_task_group;

308

309

#endif /* CONFIG_CGROUP_SCHED */

309

#endif /* CONFIG_CGROUP_SCHED */

310

311

/* CFS-related fields in a runqueue */

311

/* CFS-related fields in a runqueue */

312

struct cfs_rq {

312

struct cfs_rq {

313

struct load_weight load;

313

struct load_weight load;

314

unsigned long nr_running;

314

unsigned long nr_running;

315

316

u64 exec_clock;

316

u64 exec_clock;

317

u64 min_vruntime;

317

u64 min_vruntime;

318

#ifndef CONFIG_64BIT

318

#ifndef CONFIG_64BIT

319

u64 min_vruntime_copy;

319

u64 min_vruntime_copy;

320

#endif

320

#endif

321

322

struct rb_root tasks_timeline;

322

struct rb_root tasks_timeline;

323

struct rb_node *rb_leftmost;

323

struct rb_node *rb_leftmost;

324

325

struct list_head tasks;

325

struct list_head tasks;

326

struct list_head *balance_iterator;

326

struct list_head *balance_iterator;

327

328

/*

328

/*

329

* 'curr' points to currently running entity on this cfs_rq.

329

* 'curr' points to currently running entity on this cfs_rq.

330

* It is set to NULL otherwise (i.e when none are currently running).

330

* It is set to NULL otherwise (i.e when none are currently running).

331

*/

331

*/

332

struct sched_entity *curr, *next, *last, *skip;

332

struct sched_entity *curr, *next, *last, *skip;

333

334

#ifdef CONFIG_SCHED_DEBUG

334

#ifdef CONFIG_SCHED_DEBUG

335

unsigned int nr_spread_over;

335

unsigned int nr_spread_over;

336

#endif

336

#endif

337

338

#ifdef CONFIG_FAIR_GROUP_SCHED

338

#ifdef CONFIG_FAIR_GROUP_SCHED

339

struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */

339

struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */

340

341

/*

341

/*

342

* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in

342

* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in

343

* a hierarchy). Non-leaf lrqs hold other higher schedulable entities

343

* a hierarchy). Non-leaf lrqs hold other higher schedulable entities

344

* (like users, containers etc.)

344

* (like users, containers etc.)

345

*

345

*

346

* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This

346

* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This

347

* list is used during load balance.

347

* list is used during load balance.

348

*/

348

*/

349

int on_list;

349

int on_list;

350

struct list_head leaf_cfs_rq_list;

350

struct list_head leaf_cfs_rq_list;

351

struct task_group *tg; /* group that "owns" this runqueue */

351

struct task_group *tg; /* group that "owns" this runqueue */

352

353

#ifdef CONFIG_SMP

353

#ifdef CONFIG_SMP

354

/*

354

/*

355

* the part of load.weight contributed by tasks

355

* the part of load.weight contributed by tasks

356

*/

356

*/

357

unsigned long task_weight;

357

unsigned long task_weight;

358

359

/*

359

/*

360

* h_load = weight * f(tg)

360

* h_load = weight * f(tg)

361

*

361

*

362

* Where f(tg) is the recursive weight fraction assigned to

362

* Where f(tg) is the recursive weight fraction assigned to

363

* this group.

363

* this group.

364

*/

364

*/

365

unsigned long h_load;

365

unsigned long h_load;

366

367

/*

367

/*

368

* Maintaining per-cpu shares distribution for group scheduling

368

* Maintaining per-cpu shares distribution for group scheduling

369

*

369

*

370

* load_stamp is the last time we updated the load average

370

* load_stamp is the last time we updated the load average

371

* load_last is the last time we updated the load average and saw load

371

* load_last is the last time we updated the load average and saw load

372

* load_unacc_exec_time is currently unaccounted execution time

372

* load_unacc_exec_time is currently unaccounted execution time

373

*/

373

*/

374

u64 load_avg;

374

u64 load_avg;

375

u64 load_period;

375

u64 load_period;

376

u64 load_stamp, load_last, load_unacc_exec_time;

376

u64 load_stamp, load_last, load_unacc_exec_time;

377

378

unsigned long load_contribution;

378

unsigned long load_contribution;

379

#endif

379

#endif

380

#endif

380

#endif

381

};

381

};

382

383

/* Real-Time classes' related field in a runqueue: */

383

/* Real-Time classes' related field in a runqueue: */

384

struct rt_rq {

384

struct rt_rq {

385

struct rt_prio_array active;

385

struct rt_prio_array active;

386

unsigned long rt_nr_running;

386

unsigned long rt_nr_running;

387

#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED

387

#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED

388

struct {

388

struct {

389

int curr; /* highest queued rt task prio */

389

int curr; /* highest queued rt task prio */

390

#ifdef CONFIG_SMP

390

#ifdef CONFIG_SMP

391

int next; /* next highest */

391

int next; /* next highest */

392

#endif

392

#endif

393

} highest_prio;

393

} highest_prio;

394

#endif

394

#endif

395

#ifdef CONFIG_SMP

395

#ifdef CONFIG_SMP

396

unsigned long rt_nr_migratory;

396

unsigned long rt_nr_migratory;

397

unsigned long rt_nr_total;

397

unsigned long rt_nr_total;

398

int overloaded;

398

int overloaded;

399

struct plist_head pushable_tasks;

399

struct plist_head pushable_tasks;

400

#endif

400

#endif

401

int rt_throttled;

401

int rt_throttled;

402

u64 rt_time;

402

u64 rt_time;

403

u64 rt_runtime;

403

u64 rt_runtime;

404

/* Nests inside the rq lock: */

404

/* Nests inside the rq lock: */

405

raw_spinlock_t rt_runtime_lock;

405

raw_spinlock_t rt_runtime_lock;

406

407

#ifdef CONFIG_RT_GROUP_SCHED

407

#ifdef CONFIG_RT_GROUP_SCHED

408

unsigned long rt_nr_boosted;

408

unsigned long rt_nr_boosted;

409

410

struct rq *rq;

410

struct rq *rq;

411

struct list_head leaf_rt_rq_list;

411

struct list_head leaf_rt_rq_list;

412

struct task_group *tg;

412

struct task_group *tg;

413

#endif

413

#endif

414

};

414

};

415

416

#ifdef CONFIG_SMP

416

#ifdef CONFIG_SMP

417

418

/*

418

/*

419

* We add the notion of a root-domain which will be used to define per-domain

419

* We add the notion of a root-domain which will be used to define per-domain

420

* variables. Each exclusive cpuset essentially defines an island domain by

420

* variables. Each exclusive cpuset essentially defines an island domain by

421

* fully partitioning the member cpus from any other cpuset. Whenever a new

421

* fully partitioning the member cpus from any other cpuset. Whenever a new

422

* exclusive cpuset is created, we also create and attach a new root-domain

422

* exclusive cpuset is created, we also create and attach a new root-domain

423

* object.

423

* object.

424

*

424

*

425

*/

425

*/

426

struct root_domain {

426

struct root_domain {

427

atomic_t refcount;

427

atomic_t refcount;

428

atomic_t rto_count;

428

atomic_t rto_count;

429

struct rcu_head rcu;

429

struct rcu_head rcu;

430

cpumask_var_t span;

430

cpumask_var_t span;

431

cpumask_var_t online;

431

cpumask_var_t online;

432

433

/*

433

/*

434

* The "RT overload" flag: it gets set if a CPU has more than

434

* The "RT overload" flag: it gets set if a CPU has more than

435

* one runnable RT task.

435

* one runnable RT task.

436

*/

436

*/

437

cpumask_var_t rto_mask;

437

cpumask_var_t rto_mask;

438

struct cpupri cpupri;

438

struct cpupri cpupri;

439

};

439

};

440

441

/*

441

/*

442

* By default the system creates a single root-domain with all cpus as

442

* By default the system creates a single root-domain with all cpus as

443

* members (mimicking the global state we have today).

443

* members (mimicking the global state we have today).

444

*/

444

*/

445

static struct root_domain def_root_domain;

445

static struct root_domain def_root_domain;

446

447

#endif /* CONFIG_SMP */

447

#endif /* CONFIG_SMP */

448

449

/*

449

/*

450

* This is the main, per-CPU runqueue data structure.

450

* This is the main, per-CPU runqueue data structure.

451

*

451

*

452

* Locking rule: those places that want to lock multiple runqueues

452

* Locking rule: those places that want to lock multiple runqueues

453

* (such as the load balancing or the thread migration code), lock

453

* (such as the load balancing or the thread migration code), lock

454

* acquire operations must be ordered by ascending &runqueue.

454

* acquire operations must be ordered by ascending &runqueue.

455

*/

455

*/

456

struct rq {

456

struct rq {

457

/* runqueue lock: */

457

/* runqueue lock: */

458

raw_spinlock_t lock;

458

raw_spinlock_t lock;

459

460

/*

460

/*

461

* nr_running and cpu_load should be in the same cacheline because

461

* nr_running and cpu_load should be in the same cacheline because

462

* remote CPUs use both these fields when doing load calculation.

462

* remote CPUs use both these fields when doing load calculation.

463

*/

463

*/

464

unsigned long nr_running;

464

unsigned long nr_running;

465

#define CPU_LOAD_IDX_MAX 5

465

#define CPU_LOAD_IDX_MAX 5

466

unsigned long cpu_load[CPU_LOAD_IDX_MAX];

466

unsigned long cpu_load[CPU_LOAD_IDX_MAX];

467

unsigned long last_load_update_tick;

467

unsigned long last_load_update_tick;

468

#ifdef CONFIG_NO_HZ

468

#ifdef CONFIG_NO_HZ

469

u64 nohz_stamp;

469

u64 nohz_stamp;

470

unsigned char nohz_balance_kick;

470

unsigned char nohz_balance_kick;

471

#endif

471

#endif

472

int skip_clock_update;

472

int skip_clock_update;

473

474

/* capture load from *all* tasks on this cpu: */

474

/* capture load from *all* tasks on this cpu: */

475

struct load_weight load;

475

struct load_weight load;

476

unsigned long nr_load_updates;

476

unsigned long nr_load_updates;

477

u64 nr_switches;

477

u64 nr_switches;

478

479

struct cfs_rq cfs;

479

struct cfs_rq cfs;

480

struct rt_rq rt;

480

struct rt_rq rt;

481

482

#ifdef CONFIG_FAIR_GROUP_SCHED

482

#ifdef CONFIG_FAIR_GROUP_SCHED

483

/* list of leaf cfs_rq on this cpu: */

483

/* list of leaf cfs_rq on this cpu: */

484

struct list_head leaf_cfs_rq_list;

484

struct list_head leaf_cfs_rq_list;

485

#endif

485

#endif

486

#ifdef CONFIG_RT_GROUP_SCHED

486

#ifdef CONFIG_RT_GROUP_SCHED

487

struct list_head leaf_rt_rq_list;

487

struct list_head leaf_rt_rq_list;

488

#endif

488

#endif

489

490

/*

490

/*

491

* This is part of a global counter where only the total sum

491

* This is part of a global counter where only the total sum

492

* over all CPUs matters. A task can increase this counter on

492

* over all CPUs matters. A task can increase this counter on

493

* one CPU and if it got migrated afterwards it may decrease

493

* one CPU and if it got migrated afterwards it may decrease

494

* it on another CPU. Always updated under the runqueue lock:

494

* it on another CPU. Always updated under the runqueue lock:

495

*/

495

*/

496

unsigned long nr_uninterruptible;

496

unsigned long nr_uninterruptible;

497

498

struct task_struct *curr, *idle, *stop;

498

struct task_struct *curr, *idle, *stop;

499

unsigned long next_balance;

499

unsigned long next_balance;

500

struct mm_struct *prev_mm;

500

struct mm_struct *prev_mm;

501

502

u64 clock;

502

u64 clock;

503

u64 clock_task;

503

u64 clock_task;

504

505

atomic_t nr_iowait;

505

atomic_t nr_iowait;

506

507

#ifdef CONFIG_SMP

507

#ifdef CONFIG_SMP

508

struct root_domain *rd;

508

struct root_domain *rd;

509

struct sched_domain *sd;

509

struct sched_domain *sd;

510

511

unsigned long cpu_power;

511

unsigned long cpu_power;

512

513

unsigned char idle_at_tick;

513

unsigned char idle_at_tick;

514

/* For active balancing */

514

/* For active balancing */

515

int post_schedule;

515

int post_schedule;

516

int active_balance;

516

int active_balance;

517

int push_cpu;

517

int push_cpu;

518

struct cpu_stop_work active_balance_work;

518

struct cpu_stop_work active_balance_work;

519

/* cpu of this runqueue: */

519

/* cpu of this runqueue: */

520

int cpu;

520

int cpu;

521

int online;

521

int online;

522

523

unsigned long avg_load_per_task;

523

unsigned long avg_load_per_task;

524

525

u64 rt_avg;

525

u64 rt_avg;

526

u64 age_stamp;

526

u64 age_stamp;

527

u64 idle_stamp;

527

u64 idle_stamp;

528

u64 avg_idle;

528

u64 avg_idle;

529

#endif

529

#endif

530

531

#ifdef CONFIG_IRQ_TIME_ACCOUNTING

531

#ifdef CONFIG_IRQ_TIME_ACCOUNTING

532

u64 prev_irq_time;

532

u64 prev_irq_time;

533

#endif

533

#endif

534

#ifdef CONFIG_PARAVIRT

534

#ifdef CONFIG_PARAVIRT

535

u64 prev_steal_time;

535

u64 prev_steal_time;

536

#endif

536

#endif

537

#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING

537

#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING

538

u64 prev_steal_time_rq;

538

u64 prev_steal_time_rq;

539

#endif

539

#endif

540

541

/* calc_load related fields */

541

/* calc_load related fields */

542

unsigned long calc_load_update;

542

unsigned long calc_load_update;

543

long calc_load_active;

543

long calc_load_active;

544

545

#ifdef CONFIG_SCHED_HRTICK

545

#ifdef CONFIG_SCHED_HRTICK

546

#ifdef CONFIG_SMP

546

#ifdef CONFIG_SMP

547

int hrtick_csd_pending;

547

int hrtick_csd_pending;

548

struct call_single_data hrtick_csd;

548

struct call_single_data hrtick_csd;

549

#endif

549

#endif

550

struct hrtimer hrtick_timer;

550

struct hrtimer hrtick_timer;

551

#endif

551

#endif

552

553

#ifdef CONFIG_SCHEDSTATS

553

#ifdef CONFIG_SCHEDSTATS

554

/* latency stats */

554

/* latency stats */

555

struct sched_info rq_sched_info;

555

struct sched_info rq_sched_info;

556

unsigned long long rq_cpu_time;

556

unsigned long long rq_cpu_time;

557

/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */

557

/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */

558

559

/* sys_sched_yield() stats */

559

/* sys_sched_yield() stats */

560

unsigned int yld_count;

560

unsigned int yld_count;

561

562

/* schedule() stats */

562

/* schedule() stats */

563

unsigned int sched_switch;

563

unsigned int sched_switch;

564

unsigned int sched_count;

564

unsigned int sched_count;

565

unsigned int sched_goidle;

565

unsigned int sched_goidle;

566

567

/* try_to_wake_up() stats */

567

/* try_to_wake_up() stats */

568

unsigned int ttwu_count;

568

unsigned int ttwu_count;

569

unsigned int ttwu_local;

569

unsigned int ttwu_local;

570

#endif

570

#endif

571

572

#ifdef CONFIG_SMP

572

#ifdef CONFIG_SMP

573

struct task_struct *wake_list;

573

struct task_struct *wake_list;

574

#endif

574

#endif

575

};

575

};

576

577

static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

577

static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

578

579

580

static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);

580

static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);

581

582

/*
 * Return the CPU number recorded in @rq.  Without CONFIG_SMP the
 * result is always 0.
 */
static inline int cpu_of(struct rq *rq)
{
#ifndef CONFIG_SMP
	return 0;
#else
	return rq->cpu;
#endif
}

590

591

#define rcu_dereference_check_sched_domain(p) \

591

#define rcu_dereference_check_sched_domain(p) \

592

rcu_dereference_check((p), \

592

rcu_dereference_check((p), \

593

lockdep_is_held(&sched_domains_mutex))

593

lockdep_is_held(&sched_domains_mutex))

594

595

/*

595

/*

596

* The domain tree (rq->sd) is protected by RCU's quiescent state transition.

596

* The domain tree (rq->sd) is protected by RCU's quiescent state transition.

597

* See detach_destroy_domains: synchronize_sched for details.

597

* See detach_destroy_domains: synchronize_sched for details.

598

*

598

*

599

* The domain tree of any CPU may only be accessed from within

599

* The domain tree of any CPU may only be accessed from within

600

* preempt-disabled sections.

600

* preempt-disabled sections.

601

*/

601

*/

602

#define for_each_domain(cpu, __sd) \

602

#define for_each_domain(cpu, __sd) \

603

for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

603

for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

604

605

#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))

605

#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))

606

#define this_rq() (&__get_cpu_var(runqueues))

606

#define this_rq() (&__get_cpu_var(runqueues))

607

#define task_rq(p) cpu_rq(task_cpu(p))

607

#define task_rq(p) cpu_rq(task_cpu(p))

608

#define cpu_curr(cpu) (cpu_rq(cpu)->curr)

608

#define cpu_curr(cpu) (cpu_rq(cpu)->curr)

609

#define raw_rq() (&__raw_get_cpu_var(runqueues))

609

#define raw_rq() (&__raw_get_cpu_var(runqueues))

610

611

#ifdef CONFIG_CGROUP_SCHED

611

#ifdef CONFIG_CGROUP_SCHED

612

613

/*

613

/*

614

* Return the group to which this tasks belongs.

614

* Return the group to which this tasks belongs.

615

*

615

*

616

* We use task_subsys_state_check() and extend the RCU verification with

616

* We use task_subsys_state_check() and extend the RCU verification with

617

* pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each

617

* pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each

618

* task it moves into the cgroup. Therefore by holding either of those locks,

618

* task it moves into the cgroup. Therefore by holding either of those locks,

619

* we pin the task to the current cgroup.

619

* we pin the task to the current cgroup.

620

*/

620

*/

621

static inline struct task_group *task_group(struct task_struct *p)

621

static inline struct task_group *task_group(struct task_struct *p)

622

{

622

{

623

struct task_group *tg;

623

struct task_group *tg;

624

struct cgroup_subsys_state *css;

624

struct cgroup_subsys_state *css;

625

626

css = task_subsys_state_check(p, cpu_cgroup_subsys_id,

626

css = task_subsys_state_check(p, cpu_cgroup_subsys_id,

627

lockdep_is_held(&p->pi_lock) ||

627

lockdep_is_held(&p->pi_lock) ||

628

lockdep_is_held(&task_rq(p)->lock));

628

lockdep_is_held(&task_rq(p)->lock));

629

tg = container_of(css, struct task_group, css);

629

tg = container_of(css, struct task_group, css);

630

631

return autogroup_task_group(p, tg);

631

return autogroup_task_group(p, tg);

632

}

632

}

633

634

/*
 * Re-point @p's scheduling entities at the per-CPU queues and parent
 * entities of its task group for @cpu.  Used when a task moves across
 * CPUs or groups.
 */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#if defined(CONFIG_FAIR_GROUP_SCHED)
	/* fair class: group cfs_rq and parent entity for this CPU */
	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
	p->se.parent = task_group(p)->se[cpu];
#endif

#if defined(CONFIG_RT_GROUP_SCHED)
	/* rt class: group rt_rq and parent entity for this CPU */
	p->rt.rt_rq = task_group(p)->rt_rq[cpu];
	p->rt.parent = task_group(p)->rt_se[cpu];
#endif
}

647

648

#else /* CONFIG_CGROUP_SCHED */

648

#else /* CONFIG_CGROUP_SCHED */

649

650

/* Without CONFIG_CGROUP_SCHED there is nothing to re-point. */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
}

651

static inline struct task_group *task_group(struct task_struct *p)

651

static inline struct task_group *task_group(struct task_struct *p)

652

{

652

{

653

return NULL;

653

return NULL;

654

}

654

}

655

656

#endif /* CONFIG_CGROUP_SCHED */

656

#endif /* CONFIG_CGROUP_SCHED */

657

658

static void update_rq_clock_task(struct rq *rq, s64 delta);

658

static void update_rq_clock_task(struct rq *rq, s64 delta);

659

660

static void update_rq_clock(struct rq *rq)

660

static void update_rq_clock(struct rq *rq)

661

{

661

{

662

s64 delta;

662

s64 delta;

663

664

if (rq->skip_clock_update > 0)

664

if (rq->skip_clock_update > 0)

665

return;

665

return;

666

667

delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;

667

delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;

668

rq->clock += delta;

668

rq->clock += delta;

669

update_rq_clock_task(rq, delta);

669

update_rq_clock_task(rq, delta);

670

}

670

}

671

672

/*

672

/*

673

* Tunables that become constants when CONFIG_SCHED_DEBUG is off:

673

* Tunables that become constants when CONFIG_SCHED_DEBUG is off:

674

*/

674

*/

675

#ifdef CONFIG_SCHED_DEBUG

675

#ifdef CONFIG_SCHED_DEBUG

676

# define const_debug __read_mostly

676

# define const_debug __read_mostly

677

#else

677

#else

678

# define const_debug static const

678

# define const_debug static const

679

#endif

679

#endif

680

681

/**

681

/**

682

* runqueue_is_locked - Returns true if the current cpu runqueue is locked

682

* runqueue_is_locked - Returns true if the current cpu runqueue is locked

683

* @cpu: the processor in question.

683

* @cpu: the processor in question.

684

*

684

*

685

* This interface allows printk to be called with the runqueue lock

685

* This interface allows printk to be called with the runqueue lock

686

* held and know whether or not it is OK to wake up the klogd.

686

* held and know whether or not it is OK to wake up the klogd.

687

*/

687

*/

688

int runqueue_is_locked(int cpu)

688

int runqueue_is_locked(int cpu)

689

{

689

{

690

return raw_spin_is_locked(&cpu_rq(cpu)->lock);

690

return raw_spin_is_locked(&cpu_rq(cpu)->lock);

691

}

691

}

692

693

/*

693

/*

694

* Debugging: various feature bits

694

* Debugging: various feature bits

695

*/

695

*/

696

697

#define SCHED_FEAT(name, enabled) \

697

#define SCHED_FEAT(name, enabled) \

698

__SCHED_FEAT_##name ,

698

__SCHED_FEAT_##name ,

699

700

enum {

700

enum {

701

#include "sched_features.h"

701

#include "sched_features.h"

702

};

702

};

703

704

#undef SCHED_FEAT

704

#undef SCHED_FEAT

705

706

#define SCHED_FEAT(name, enabled) \

706

#define SCHED_FEAT(name, enabled) \

707

(1UL << __SCHED_FEAT_##name) * enabled |

707

(1UL << __SCHED_FEAT_##name) * enabled |

708

709

const_debug unsigned int sysctl_sched_features =

709

const_debug unsigned int sysctl_sched_features =

710

#include "sched_features.h"

710

#include "sched_features.h"

711

0;

711

0;

712

713

#undef SCHED_FEAT

713

#undef SCHED_FEAT

714

715

#ifdef CONFIG_SCHED_DEBUG

715

#ifdef CONFIG_SCHED_DEBUG

716

#define SCHED_FEAT(name, enabled) \

716

#define SCHED_FEAT(name, enabled) \

717

#name ,

717

#name ,

718

719

static __read_mostly char *sched_feat_names[] = {

719

static __read_mostly char *sched_feat_names[] = {

720

#include "sched_features.h"

720

#include "sched_features.h"

721

NULL

721

NULL

722

};

722

};

723

724

#undef SCHED_FEAT

724

#undef SCHED_FEAT

725

726

static int sched_feat_show(struct seq_file *m, void *v)

726

static int sched_feat_show(struct seq_file *m, void *v)

727

{

727

{

728

int i;

728

int i;

729

730

for (i = 0; sched_feat_names[i]; i++) {

730

for (i = 0; sched_feat_names[i]; i++) {

731

if (!(sysctl_sched_features & (1UL << i)))

731

if (!(sysctl_sched_features & (1UL << i)))

732

seq_puts(m, "NO_");

732

seq_puts(m, "NO_");

733

seq_printf(m, "%s ", sched_feat_names[i]);

733

seq_printf(m, "%s ", sched_feat_names[i]);

734

}

734

}

735

seq_puts(m, "\n");

735

seq_puts(m, "\n");

736

737

return 0;

737

return 0;

738

}

738

}

739

740

static ssize_t

740

static ssize_t

741

sched_feat_write(struct file *filp, const char __user *ubuf,

741

sched_feat_write(struct file *filp, const char __user *ubuf,

742

size_t cnt, loff_t *ppos)

742

size_t cnt, loff_t *ppos)

743

{

743

{

744

char buf[64];

744

char buf[64];

745

char *cmp;

745

char *cmp;

746

int neg = 0;

746

int neg = 0;

747

int i;

747

int i;

748

749

if (cnt > 63)

749

if (cnt > 63)

750

cnt = 63;

750

cnt = 63;

751

752

if (copy_from_user(&buf, ubuf, cnt))

752

if (copy_from_user(&buf, ubuf, cnt))

753

return -EFAULT;

753

return -EFAULT;

754

755

buf[cnt] = 0;

755

buf[cnt] = 0;

756

cmp = strstrip(buf);

756

cmp = strstrip(buf);

757

758

if (strncmp(cmp, "NO_", 3) == 0) {

758

if (strncmp(cmp, "NO_", 3) == 0) {

759

neg = 1;

759

neg = 1;

760

cmp += 3;

760

cmp += 3;

761

}

761

}

762

763

for (i = 0; sched_feat_names[i]; i++) {

763

for (i = 0; sched_feat_names[i]; i++) {

764

if (strcmp(cmp, sched_feat_names[i]) == 0) {

764

if (strcmp(cmp, sched_feat_names[i]) == 0) {

765

if (neg)

765

if (neg)

766

sysctl_sched_features &= ~(1UL << i);

766

sysctl_sched_features &= ~(1UL << i);

767

else

767

else

768

sysctl_sched_features |= (1UL << i);

768

sysctl_sched_features |= (1UL << i);

769

break;

769

break;

770

}

770

}

771

}

771

}

772

773

if (!sched_feat_names[i])

773

if (!sched_feat_names[i])

774

return -EINVAL;

774

return -EINVAL;

775

776

*ppos += cnt;

776

*ppos += cnt;

777

778

return cnt;

778

return cnt;

779

}

779

}

780

781

static int sched_feat_open(struct inode *inode, struct file *filp)

781

static int sched_feat_open(struct inode *inode, struct file *filp)

782

{

782

{

783

return single_open(filp, sched_feat_show, NULL);

783

return single_open(filp, sched_feat_show, NULL);

784

}

784

}

785

786

static const struct file_operations sched_feat_fops = {

786

static const struct file_operations sched_feat_fops = {

787

.open = sched_feat_open,

787

.open = sched_feat_open,

788

.write = sched_feat_write,

788

.write = sched_feat_write,

789

.read = seq_read,

789

.read = seq_read,

790

.llseek = seq_lseek,

790

.llseek = seq_lseek,

791

.release = single_release,

791

.release = single_release,

792

};

792

};

793

794

static __init int sched_init_debug(void)

794

static __init int sched_init_debug(void)

795

{

795

{

796

debugfs_create_file("sched_features", 0644, NULL, NULL,

796

debugfs_create_file("sched_features", 0644, NULL, NULL,

797

&sched_feat_fops);

797

&sched_feat_fops);

798

799

return 0;

799

return 0;

800

}

800

}

801

late_initcall(sched_init_debug);

801

late_initcall(sched_init_debug);

802

803

#endif

803

#endif

804

805

#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))

805

#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))

806

807

/*

807

/*

808

* Number of tasks to iterate in a single balance run.

808

* Number of tasks to iterate in a single balance run.

809

* Limited because this is done with IRQs disabled.

809

* Limited because this is done with IRQs disabled.

810

*/

810

*/

811

const_debug unsigned int sysctl_sched_nr_migrate = 32;

811

const_debug unsigned int sysctl_sched_nr_migrate = 32;

812

813

/*

813

/*

814

* period over which we average the RT time consumption, measured

814

* period over which we average the RT time consumption, measured

815

* in ms.

815

* in ms.

816

*

816

*

817

* default: 1s

817

* default: 1s

818

*/

818

*/

819

const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

819

const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

820

821

/*

821

/*

822

* period over which we measure -rt task cpu usage in us.

822

* period over which we measure -rt task cpu usage in us.

823

* default: 1s

823

* default: 1s

824

*/

824

*/

825

unsigned int sysctl_sched_rt_period = 1000000;

825

unsigned int sysctl_sched_rt_period = 1000000;

826

827

static __read_mostly int scheduler_running;

827

static __read_mostly int scheduler_running;

828

829

/*

829

/*

830

* part of the period that we allow rt tasks to run in us.

830

* part of the period that we allow rt tasks to run in us.

831

* default: 0.95s

831

* default: 0.95s

832

*/

832

*/

833

int sysctl_sched_rt_runtime = 950000;

833

int sysctl_sched_rt_runtime = 950000;

834

835

static inline u64 global_rt_period(void)

835

static inline u64 global_rt_period(void)

836

{

836

{

837

return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;

837

return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;

838

}

838

}

839

840

static inline u64 global_rt_runtime(void)

840

static inline u64 global_rt_runtime(void)

841

{

841

{

842

if (sysctl_sched_rt_runtime < 0)

842

if (sysctl_sched_rt_runtime < 0)

843

return RUNTIME_INF;

843

return RUNTIME_INF;

844

845

return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;

845

return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;

846

}

846

}

847

848

#ifndef prepare_arch_switch

848

#ifndef prepare_arch_switch

849

# define prepare_arch_switch(next) do { } while (0)

849

# define prepare_arch_switch(next) do { } while (0)

850

#endif

850

#endif

851

#ifndef finish_arch_switch

851

#ifndef finish_arch_switch

852

# define finish_arch_switch(prev) do { } while (0)

852

# define finish_arch_switch(prev) do { } while (0)

853

#endif

853

#endif

854

855

static inline int task_current(struct rq *rq, struct task_struct *p)

855

static inline int task_current(struct rq *rq, struct task_struct *p)

856

{

856

{

857

return rq->curr == p;

857

return rq->curr == p;

858

}

858

}

859

860

/*
 * Report whether @p is running: on SMP this is p->on_cpu, otherwise
 * whether @p is @rq's current task.
 */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifndef CONFIG_SMP
	return task_current(rq, p);
#else
	return p->on_cpu;
#endif
}

868

869

#ifndef __ARCH_WANT_UNLOCKED_CTXSW

869

#ifndef __ARCH_WANT_UNLOCKED_CTXSW

870

/*
 * Locked context-switch variant: mark @next as being on a CPU.  The
 * runqueue lock is not touched here.
 */
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#if defined(CONFIG_SMP)
	/*
	 * Compiled out entirely for !SMP: the only thing that cares
	 * about this is SMP rebalancing from interrupt.
	 */
	next->on_cpu = 1;
#endif
}

881

882

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)

882

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)

883

{

883

{

884

#ifdef CONFIG_SMP

884

#ifdef CONFIG_SMP

885

/*

885

/*

886

* After ->on_cpu is cleared, the task can be moved to a different CPU.

886

* After ->on_cpu is cleared, the task can be moved to a different CPU.

887

* We must ensure this doesn't happen until the switch is completely

887

* We must ensure this doesn't happen until the switch is completely

888

* finished.

888

* finished.

889

*/

889

*/

890

smp_wmb();

890

smp_wmb();

891

prev->on_cpu = 0;

891

prev->on_cpu = 0;

892

#endif

892

#endif

893

#ifdef CONFIG_DEBUG_SPINLOCK

893

#ifdef CONFIG_DEBUG_SPINLOCK

894

/* this is a valid case when another task releases the spinlock */

894

/* this is a valid case when another task releases the spinlock */

895

rq->lock.owner = current;

895

rq->lock.owner = current;

896

#endif

896

#endif

897

/*

897

/*

898

* If we are tracking spinlock dependencies then we have to

898

* If we are tracking spinlock dependencies then we have to

899

* fix up the runqueue lock - which gets 'carried over' from

899

* fix up the runqueue lock - which gets 'carried over' from

900

* prev into current:

900

* prev into current:

901

*/

901

*/

902

spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

902

spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

903

904

raw_spin_unlock_irq(&rq->lock);

904

raw_spin_unlock_irq(&rq->lock);

905

}

905

}

906

907

#else /* __ARCH_WANT_UNLOCKED_CTXSW */

907

#else /* __ARCH_WANT_UNLOCKED_CTXSW */

908

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)

908

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)

909

{

909

{

910

#ifdef CONFIG_SMP

910

#ifdef CONFIG_SMP

911

/*

911

/*

912

* We can optimise this out completely for !SMP, because the

912

* We can optimise this out completely for !SMP, because the

913

* SMP rebalancing from interrupt is the only thing that cares

913

* SMP rebalancing from interrupt is the only thing that cares

914

* here.

914

* here.

915

*/

915

*/

916

next->on_cpu = 1;

916

next->on_cpu = 1;

917

#endif

917

#endif

918

#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW

918

#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW

919

raw_spin_unlock_irq(&rq->lock);

919

raw_spin_unlock_irq(&rq->lock);

920

#else

920

#else

921

raw_spin_unlock(&rq->lock);

921

raw_spin_unlock(&rq->lock);

922

#endif

922

#endif

923

}

923

}

924

925

/*
 * Unlocked context-switch variant: clear @prev's on_cpu flag and, unless
 * the architecture already runs the switch with interrupts on,
 * re-enable interrupts.
 */
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#if defined(CONFIG_SMP)
	/*
	 * Once ->on_cpu is cleared the task may be moved to another CPU.
	 * That must not happen before the switch is completely finished,
	 * hence the write barrier ahead of the store.
	 */
	smp_wmb();
	prev->on_cpu = 0;
#endif
#if !defined(__ARCH_WANT_INTERRUPTS_ON_CTXSW)
	local_irq_enable();
#endif
}

940

#endif /* __ARCH_WANT_UNLOCKED_CTXSW */

940

#endif /* __ARCH_WANT_UNLOCKED_CTXSW */

941

942

/*

942

/*

943

* __task_rq_lock - lock the rq @p resides on.

943

* __task_rq_lock - lock the rq @p resides on.

944

*/

944

*/

945

static inline struct rq *__task_rq_lock(struct task_struct *p)

945

static inline struct rq *__task_rq_lock(struct task_struct *p)

946

__acquires(rq->lock)

946

__acquires(rq->lock)

947

{

947

{

948

struct rq *rq;

948

struct rq *rq;

949

950

lockdep_assert_held(&p->pi_lock);

950

lockdep_assert_held(&p->pi_lock);

951

952

for (;;) {

952

for (;;) {

953

rq = task_rq(p);

953

rq = task_rq(p);

954

raw_spin_lock(&rq->lock);

954

raw_spin_lock(&rq->lock);

955

if (likely(rq == task_rq(p)))

955

if (likely(rq == task_rq(p)))

956

return rq;

956

return rq;

957

raw_spin_unlock(&rq->lock);

957

raw_spin_unlock(&rq->lock);

958

}

958

}

959

}

959

}

960

961

/*

961

/*

962

* task_rq_lock - lock p->pi_lock and lock the rq @p resides on.

962

* task_rq_lock - lock p->pi_lock and lock the rq @p resides on.

963

*/

963

*/

964

static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)

964

static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)

965

__acquires(p->pi_lock)

965

__acquires(p->pi_lock)

966

__acquires(rq->lock)

966

__acquires(rq->lock)

967

{

967

{

968

struct rq *rq;

968

struct rq *rq;

969

970

for (;;) {

970

for (;;) {

971

raw_spin_lock_irqsave(&p->pi_lock, *flags);

971

raw_spin_lock_irqsave(&p->pi_lock, *flags);

972

rq = task_rq(p);

972

rq = task_rq(p);

973

raw_spin_lock(&rq->lock);

973

raw_spin_lock(&rq->lock);

974

if (likely(rq == task_rq(p)))

974

if (likely(rq == task_rq(p)))

975

return rq;

975

return rq;

976

raw_spin_unlock(&rq->lock);

976

raw_spin_unlock(&rq->lock);

977

raw_spin_unlock_irqrestore(&p->pi_lock, *flags);

977

raw_spin_unlock_irqrestore(&p->pi_lock, *flags);

978

}

978

}

979

}

979

}

980

981

static void __task_rq_unlock(struct rq *rq)

981

static void __task_rq_unlock(struct rq *rq)

982

__releases(rq->lock)

982

__releases(rq->lock)

983

{

983

{

984

raw_spin_unlock(&rq->lock);

984

raw_spin_unlock(&rq->lock);

985

}

985

}

986

987

static inline void

987

static inline void

988

task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)

988

task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)

989

__releases(rq->lock)

989

__releases(rq->lock)

990

__releases(p->pi_lock)

990

__releases(p->pi_lock)

991

{

991

{

992

raw_spin_unlock(&rq->lock);

992

raw_spin_unlock(&rq->lock);

993

raw_spin_unlock_irqrestore(&p->pi_lock, *flags);

993

raw_spin_unlock_irqrestore(&p->pi_lock, *flags);

994

}

994

}

995

996

/*

996

/*

997

* this_rq_lock - lock this runqueue and disable interrupts.

997

* this_rq_lock - lock this runqueue and disable interrupts.

998

*/

998

*/

999

static struct rq *this_rq_lock(void)

999

static struct rq *this_rq_lock(void)

1000

__acquires(rq->lock)

1000

__acquires(rq->lock)

1001

{

1001

{

1002

struct rq *rq;

1002

struct rq *rq;

1003

1004

local_irq_disable();

1004

local_irq_disable();

1005

rq = this_rq();

1005

rq = this_rq();

1006

raw_spin_lock(&rq->lock);

1006

raw_spin_lock(&rq->lock);

1007

1008

return rq;

1008

return rq;

1009

}

1009

}

1010

1011

#ifdef CONFIG_SCHED_HRTICK

1011

#ifdef CONFIG_SCHED_HRTICK

1012

/*

1012

/*

1013

* Use HR-timers to deliver accurate preemption points.

1013

* Use HR-timers to deliver accurate preemption points.

1014

*

1014

*

1015

* It's all a bit involved since we cannot program an hrt while holding the

1015

* It's all a bit involved since we cannot program an hrt while holding the

1016

* rq->lock. So what we do is store a state in rq->hrtick_* and ask for a

1016

* rq->lock. So what we do is store a state in rq->hrtick_* and ask for a

1017

* reschedule event.

1017

* reschedule event.

1018

*

1018

*

1019

* When we get rescheduled we reprogram the hrtick_timer outside of the

1019

* When we get rescheduled we reprogram the hrtick_timer outside of the

1020

* rq->lock.

1020

* rq->lock.

1021

*/

1021

*/

1022

1023

/*

1023

/*

1024

* Use hrtick when:

1024

* Use hrtick when:

1025

* - enabled by features

1025

* - enabled by features

1026

* - hrtimer is actually high res

1026

* - hrtimer is actually high res

1027

*/

1027

*/

1028

static inline int hrtick_enabled(struct rq *rq)

1028

static inline int hrtick_enabled(struct rq *rq)

1029

{

1029

{

1030

if (!sched_feat(HRTICK))

1030

if (!sched_feat(HRTICK))

1031

return 0;

1031

return 0;

1032

if (!cpu_active(cpu_of(rq)))

1032

if (!cpu_active(cpu_of(rq)))

1033

return 0;

1033

return 0;

1034

return hrtimer_is_hres_active(&rq->hrtick_timer);

1034

return hrtimer_is_hres_active(&rq->hrtick_timer);

1035

}

1035

}

1036

1037

static void hrtick_clear(struct rq *rq)

1037

static void hrtick_clear(struct rq *rq)

1038

{

1038

{

1039

if (hrtimer_active(&rq->hrtick_timer))

1039

if (hrtimer_active(&rq->hrtick_timer))

1040

hrtimer_cancel(&rq->hrtick_timer);

1040

hrtimer_cancel(&rq->hrtick_timer);

1041

}

1041

}

1042

1043

/*

1043

/*

1044

* High-resolution timer tick.

1044

* High-resolution timer tick.

1045

* Runs from hardirq context with interrupts disabled.

1045

* Runs from hardirq context with interrupts disabled.

1046

*/

1046

*/

1047

static enum hrtimer_restart hrtick(struct hrtimer *timer)

1047

static enum hrtimer_restart hrtick(struct hrtimer *timer)

1048

{

1048

{

1049

struct rq *rq = container_of(timer, struct rq, hrtick_timer);

1049

struct rq *rq = container_of(timer, struct rq, hrtick_timer);

1050

1051

WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

1051

WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

1052

1053

raw_spin_lock(&rq->lock);

1053

raw_spin_lock(&rq->lock);

1054

update_rq_clock(rq);

1054

update_rq_clock(rq);

1055

rq->curr->sched_class->task_tick(rq, rq->curr, 1);

1055

rq->curr->sched_class->task_tick(rq, rq->curr, 1);

1056

raw_spin_unlock(&rq->lock);

1056

raw_spin_unlock(&rq->lock);

1057

1058

return HRTIMER_NORESTART;

1058

return HRTIMER_NORESTART;

1059

}

1059

}

1060

1061

#ifdef CONFIG_SMP

1061

#ifdef CONFIG_SMP

1062

/*

1062

/*

1063

* called from hardirq (IPI) context

1063

* called from hardirq (IPI) context

1064

*/

1064

*/

1065

static void __hrtick_start(void *arg)

1065

static void __hrtick_start(void *arg)

1066

{

1066

{

1067

struct rq *rq = arg;

1067

struct rq *rq = arg;

1068

1069

raw_spin_lock(&rq->lock);

1069

raw_spin_lock(&rq->lock);

1070

hrtimer_restart(&rq->hrtick_timer);

1070

hrtimer_restart(&rq->hrtick_timer);

1071

rq->hrtick_csd_pending = 0;

1071

rq->hrtick_csd_pending = 0;

1072

raw_spin_unlock(&rq->lock);

1072

raw_spin_unlock(&rq->lock);

1073

}

1073

}

1074

1075

/*

1075

/*

1076

* Called to set the hrtick timer state.

1076

* Called to set the hrtick timer state.

1077

*

1077

*

1078

* called with rq->lock held and irqs disabled

1078

* called with rq->lock held and irqs disabled

1079

*/

1079

*/

1080

static void hrtick_start(struct rq *rq, u64 delay)

1080

static void hrtick_start(struct rq *rq, u64 delay)

1081

{

1081

{

1082

struct hrtimer *timer = &rq->hrtick_timer;

1082

struct hrtimer *timer = &rq->hrtick_timer;

1083

ktime_t time = ktime_add_ns(timer->base->get_time(), delay);

1083

ktime_t time = ktime_add_ns(timer->base->get_time(), delay);

1084

1085

hrtimer_set_expires(timer, time);

1085

hrtimer_set_expires(timer, time);

1086

1087

if (rq == this_rq()) {

1087

if (rq == this_rq()) {

1088

hrtimer_restart(timer);

1088

hrtimer_restart(timer);

1089

} else if (!rq->hrtick_csd_pending) {

1089

} else if (!rq->hrtick_csd_pending) {

1090

__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);

1090

__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);

1091

rq->hrtick_csd_pending = 1;

1091

rq->hrtick_csd_pending = 1;

1092

}

1092

}

1093

}

1093

}

1094

1095

static int

1095

static int

1096

hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)

1096

hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)

1097

{

1097

{

1098

int cpu = (int)(long)hcpu;

1098

int cpu = (int)(long)hcpu;

1099

1100

switch (action) {

1100

switch (action) {

1101

case CPU_UP_CANCELED:

1101

case CPU_UP_CANCELED:

1102

case CPU_UP_CANCELED_FROZEN:

1102

case CPU_UP_CANCELED_FROZEN:

1103

case CPU_DOWN_PREPARE:

1103

case CPU_DOWN_PREPARE:

1104

case CPU_DOWN_PREPARE_FROZEN:

1104

case CPU_DOWN_PREPARE_FROZEN:

1105

case CPU_DEAD:

1105

case CPU_DEAD:

1106

case CPU_DEAD_FROZEN:

1106

case CPU_DEAD_FROZEN:

1107

hrtick_clear(cpu_rq(cpu));

1107

hrtick_clear(cpu_rq(cpu));

1108

return NOTIFY_OK;

1108

return NOTIFY_OK;

1109

}

1109

}

1110

1111

return NOTIFY_DONE;

1111

return NOTIFY_DONE;

1112

}

1112

}

1113

1114

static __init void init_hrtick(void)

1114

static __init void init_hrtick(void)

1115

{

1115

{

1116

hotcpu_notifier(hotplug_hrtick, 0);

1116

hotcpu_notifier(hotplug_hrtick, 0);

1117

}

1117

}

1118

#else

1118

#else

1119

/*

1119

/*

1120

* Called to set the hrtick timer state.

1120

* Called to set the hrtick timer state.

1121

*

1121

*

1122

* called with rq->lock held and irqs disabled

1122

* called with rq->lock held and irqs disabled

1123

*/

1123

*/

1124

static void hrtick_start(struct rq *rq, u64 delay)

1124

static void hrtick_start(struct rq *rq, u64 delay)

1125

{

1125

{

1126

__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,

1126

__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,

1127

HRTIMER_MODE_REL_PINNED, 0);

1127

HRTIMER_MODE_REL_PINNED, 0);

1128

}

1128

}

1129

1130

/* !SMP: nothing to set up. */
static inline void init_hrtick(void) { }

1133

#endif /* CONFIG_SMP */

1133

#endif /* CONFIG_SMP */

1134

1135

static void init_rq_hrtick(struct rq *rq)

1135

static void init_rq_hrtick(struct rq *rq)

1136

{

1136

{

1137

#ifdef CONFIG_SMP

1137

#ifdef CONFIG_SMP

1138

rq->hrtick_csd_pending = 0;

1138

rq->hrtick_csd_pending = 0;

1139

1140

rq->hrtick_csd.flags = 0;

1140

rq->hrtick_csd.flags = 0;

1141

rq->hrtick_csd.func = __hrtick_start;

1141

rq->hrtick_csd.func = __hrtick_start;

1142

rq->hrtick_csd.info = rq;

1142

rq->hrtick_csd.info = rq;

1143

#endif

1143

#endif

1144

1145

hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

1145

hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

1146

rq->hrtick_timer.function = hrtick;

1146

rq->hrtick_timer.function = hrtick;

1147

}

1147

}

1148

#else /* CONFIG_SCHED_HRTICK */

1148

#else /* CONFIG_SCHED_HRTICK */

1149

/* !CONFIG_SCHED_HRTICK stub: there is no hrtick timer to clear. */
static inline void hrtick_clear(struct rq *rq)
{
}

1152

1153

/* !CONFIG_SCHED_HRTICK stub: no per-runqueue hrtick state to initialise. */
static inline void init_rq_hrtick(struct rq *rq)
{
}

1156

1157

/* !CONFIG_SCHED_HRTICK stub: no global hrtick setup is needed. */
static inline void init_hrtick(void)
{
}

1160

#endif /* CONFIG_SCHED_HRTICK */

1160

#endif /* CONFIG_SCHED_HRTICK */

1161

1162

/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */

1169

#ifdef CONFIG_SMP

1169

#ifdef CONFIG_SMP

1170

1171

/*
 * Fallback definition, used only when tsk_is_polling is not already
 * defined: test the TIF_POLLING_NRFLAG thread flag of task @t.
 */
#ifndef tsk_is_polling
#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
#endif

1174

1175

static void resched_task(struct task_struct *p)

1175

static void resched_task(struct task_struct *p)

1176

{

1176

{

1177

int cpu;

1177

int cpu;

1178

1179

assert_raw_spin_locked(&task_rq(p)->lock);

1179

assert_raw_spin_locked(&task_rq(p)->lock);

1180

1181

if (test_tsk_need_resched(p))

1181

if (test_tsk_need_resched(p))

1182

return;

1182

return;

1183

1184

set_tsk_need_resched(p);

1184

set_tsk_need_resched(p);

1185

1186

cpu = task_cpu(p);

1186

cpu = task_cpu(p);

1187

if (cpu == smp_processor_id())

1187

if (cpu == smp_processor_id())

1188

return;

1188

return;

1189

1190

/* NEED_RESCHED must be visible before we test polling */

1190

/* NEED_RESCHED must be visible before we test polling */

1191

smp_mb();

1191

smp_mb();

1192

if (!tsk_is_polling(p))

1192

if (!tsk_is_polling(p))

1193

smp_send_reschedule(cpu);

1193

smp_send_reschedule(cpu);

1194

}

1194

}

1195

1196

static void resched_cpu(int cpu)

1196

static void resched_cpu(int cpu)

1197

{

1197

{

1198

struct rq *rq = cpu_rq(cpu);

1198

struct rq *rq = cpu_rq(cpu);

1199

unsigned long flags;

1199

unsigned long flags;

1200

1201

if (!raw_spin_trylock_irqsave(&rq->lock, flags))

1201

if (!raw_spin_trylock_irqsave(&rq->lock, flags))

1202

return;

1202

return;

1203

resched_task(cpu_curr(cpu));

1203

resched_task(cpu_curr(cpu));

1204

raw_spin_unlock_irqrestore(&rq->lock, flags);

1204

raw_spin_unlock_irqrestore(&rq->lock, flags);

1205

}

1205

}

1206

1207

#ifdef CONFIG_NO_HZ

1207

#ifdef CONFIG_NO_HZ

1208

/*

1208

/*

1209

* In the semi idle case, use the nearest busy cpu for migrating timers

1209

* In the semi idle case, use the nearest busy cpu for migrating timers

1210

* from an idle cpu. This is good for power-savings.

1210

* from an idle cpu. This is good for power-savings.

1211

*

1211

*

1212

* We don't do similar optimization for completely idle system, as

1212

* We don't do similar optimization for completely idle system, as

1213

* selecting an idle cpu will add more delays to the timers than intended

1213

* selecting an idle cpu will add more delays to the timers than intended

1214

* (as that cpu's timer base may not be uptodate wrt jiffies etc).

1214

* (as that cpu's timer base may not be uptodate wrt jiffies etc).

1215

*/

1215

*/

1216

int get_nohz_timer_target(void)

1216

int get_nohz_timer_target(void)

1217

{

1217

{

1218

int cpu = smp_processor_id();

1218

int cpu = smp_processor_id();

1219

int i;

1219

int i;

1220

struct sched_domain *sd;

1220

struct sched_domain *sd;

1221

1222

rcu_read_lock();

1222

rcu_read_lock();

1223

for_each_domain(cpu, sd) {

1223

for_each_domain(cpu, sd) {

1224

for_each_cpu(i, sched_domain_span(sd)) {

1224

for_each_cpu(i, sched_domain_span(sd)) {

1225

if (!idle_cpu(i)) {

1225

if (!idle_cpu(i)) {

1226

cpu = i;

1226

cpu = i;

1227

goto unlock;

1227

goto unlock;

1228

}

1228

}

1229

}

1229

}

1230

}

1230

}

1231

unlock:

1231

unlock:

1232

rcu_read_unlock();

1232

rcu_read_unlock();

1233

return cpu;

1233

return cpu;

1234

}

1234

}

1235

/*

1235

/*

1236

* When add_timer_on() enqueues a timer into the timer wheel of an

1236

* When add_timer_on() enqueues a timer into the timer wheel of an

1237

* idle CPU then this timer might expire before the next timer event

1237

* idle CPU then this timer might expire before the next timer event

1238

* which is scheduled to wake up that CPU. In case of a completely

1238

* which is scheduled to wake up that CPU. In case of a completely

1239

* idle system the next event might even be infinite time into the

1239

* idle system the next event might even be infinite time into the

1240

* future. wake_up_idle_cpu() ensures that the CPU is woken up and

1240

* future. wake_up_idle_cpu() ensures that the CPU is woken up and

1241

* leaves the inner idle loop so the newly added timer is taken into

1241

* leaves the inner idle loop so the newly added timer is taken into

1242

* account when the CPU goes back to idle and evaluates the timer

1242

* account when the CPU goes back to idle and evaluates the timer

1243

* wheel for the next timer event.

1243

* wheel for the next timer event.

1244

*/

1244

*/

1245

void wake_up_idle_cpu(int cpu)

1245

void wake_up_idle_cpu(int cpu)

1246

{

1246

{

1247

struct rq *rq = cpu_rq(cpu);

1247

struct rq *rq = cpu_rq(cpu);

1248

1249

if (cpu == smp_processor_id())

1249

if (cpu == smp_processor_id())

1250

return;

1250

return;

1251

1252

/*

1252

/*

1253

* This is safe, as this function is called with the timer

1253

* This is safe, as this function is called with the timer

1254

* wheel base lock of (cpu) held. When the CPU is on the way

1254

* wheel base lock of (cpu) held. When the CPU is on the way

1255

* to idle and has not yet set rq->curr to idle then it will

1255

* to idle and has not yet set rq->curr to idle then it will

1256

* be serialized on the timer wheel base lock and take the new

1256

* be serialized on the timer wheel base lock and take the new

1257

* timer into account automatically.

1257

* timer into account automatically.

1258

*/

1258

*/

1259

if (rq->curr != rq->idle)

1259

if (rq->curr != rq->idle)

1260

return;

1260

return;

1261

1262

/*

1262

/*

1263

* We can set TIF_RESCHED on the idle task of the other CPU

1263

* We can set TIF_RESCHED on the idle task of the other CPU

1264

* lockless. The worst case is that the other CPU runs the

1264

* lockless. The worst case is that the other CPU runs the

1265

* idle task through an additional NOOP schedule()

1265

* idle task through an additional NOOP schedule()

1266

*/

1266

*/

1267

set_tsk_need_resched(rq->idle);

1267

set_tsk_need_resched(rq->idle);

1268

1269

/* NEED_RESCHED must be visible before we test polling */

1269

/* NEED_RESCHED must be visible before we test polling */

1270

smp_mb();

1270

smp_mb();

1271

if (!tsk_is_polling(rq->idle))

1271

if (!tsk_is_polling(rq->idle))

1272

smp_send_reschedule(cpu);

1272

smp_send_reschedule(cpu);

1273

}

1273

}

1274

1275

#endif /* CONFIG_NO_HZ */

1275

#endif /* CONFIG_NO_HZ */

1276

1277

static u64 sched_avg_period(void)

1277

static u64 sched_avg_period(void)

1278

{

1278

{

1279

return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;

1279

return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;

1280

}

1280

}

1281

1282

static void sched_avg_update(struct rq *rq)

1282

static void sched_avg_update(struct rq *rq)

1283

{

1283

{

1284

s64 period = sched_avg_period();

1284

s64 period = sched_avg_period();

1285

1286

while ((s64)(rq->clock - rq->age_stamp) > period) {

1286

while ((s64)(rq->clock - rq->age_stamp) > period) {

1287

/*

1287

/*

1288

* Inline assembly required to prevent the compiler

1288

* Inline assembly required to prevent the compiler

1289

* optimising this loop into a divmod call.

1289

* optimising this loop into a divmod call.

1290

* See __iter_div_u64_rem() for another example of this.

1290

* See __iter_div_u64_rem() for another example of this.

1291

*/

1291

*/

1292

asm("" : "+rm" (rq->age_stamp));

1292

asm("" : "+rm" (rq->age_stamp));

1293

rq->age_stamp += period;

1293

rq->age_stamp += period;

1294

rq->rt_avg /= 2;

1294

rq->rt_avg /= 2;

1295

}

1295

}

1296

}

1296

}

1297

1298

/*
 * sched_rt_avg_update - add @rt_delta to rq->rt_avg, then age the average
 * through sched_avg_update().
 */
static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
	rq->rt_avg += rt_delta;
	sched_avg_update(rq);
}

1303

1304

#else /* !CONFIG_SMP */

1304

#else /* !CONFIG_SMP */

1305

static void resched_task(struct task_struct *p)

1305

static void resched_task(struct task_struct *p)

1306

{

1306

{

1307

assert_raw_spin_locked(&task_rq(p)->lock);

1307

assert_raw_spin_locked(&task_rq(p)->lock);

1308

set_tsk_need_resched(p);

1308

set_tsk_need_resched(p);

1309

}

1309

}

1310

1311

/* !CONFIG_SMP stub: rt_avg is not tracked, so this does nothing. */
static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
}

1314

1315

/* !CONFIG_SMP stub: there is no average to age, so this does nothing. */
static void sched_avg_update(struct rq *rq)
{
}

1318

#endif /* CONFIG_SMP */

1318

#endif /* CONFIG_SMP */

1319

1320

/*
 * Constants for the fixed-point inverse-weight arithmetic in
 * calc_delta_mine().  WMULT_CONST is the numerator divided by a weight to
 * get its inverse: all-ones on 32-bit longs, 2^32 otherwise.
 */
#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32

/*
 * Shift right and round:
 */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))

1332

1333

/*

1333

/*

1334

* delta *= weight / lw

1334

* delta *= weight / lw

1335

*/

1335

*/

1336

static unsigned long

1336

static unsigned long

1337

calc_delta_mine(unsigned long delta_exec, unsigned long weight,

1337

calc_delta_mine(unsigned long delta_exec, unsigned long weight,

1338

struct load_weight *lw)

1338

struct load_weight *lw)

1339

{

1339

{

1340

u64 tmp;

1340

u64 tmp;

1341

1342

/*

1342

/*

1343

* weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched

1343

* weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched

1344

* entities since MIN_SHARES = 2. Treat weight as 1 if less than

1344

* entities since MIN_SHARES = 2. Treat weight as 1 if less than

1345

* 2^SCHED_LOAD_RESOLUTION.

1345

* 2^SCHED_LOAD_RESOLUTION.

1346

*/

1346

*/

1347

if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))

1347

if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))

1348

tmp = (u64)delta_exec * scale_load_down(weight);

1348

tmp = (u64)delta_exec * scale_load_down(weight);

1349

else

1349

else

1350

tmp = (u64)delta_exec;

1350

tmp = (u64)delta_exec;

1351

1352

if (!lw->inv_weight) {

1352

if (!lw->inv_weight) {

1353

unsigned long w = scale_load_down(lw->weight);

1353

unsigned long w = scale_load_down(lw->weight);

1354

1355

if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))

1355

if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))

1356

lw->inv_weight = 1;

1356

lw->inv_weight = 1;

1357

else if (unlikely(!w))

1357

else if (unlikely(!w))

1358

lw->inv_weight = WMULT_CONST;

1358

lw->inv_weight = WMULT_CONST;

1359

else

1359

else

1360

lw->inv_weight = WMULT_CONST / w;

1360

lw->inv_weight = WMULT_CONST / w;

1361

}

1361

}

1362

1363

/*

1363

/*

1364

* Check whether we'd overflow the 64-bit multiplication:

1364

* Check whether we'd overflow the 64-bit multiplication:

1365

*/

1365

*/

1366

if (unlikely(tmp > WMULT_CONST))

1366

if (unlikely(tmp > WMULT_CONST))

1367

tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,

1367

tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,

1368

WMULT_SHIFT/2);

1368

WMULT_SHIFT/2);

1369

else

1369

else

1370

tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

1370

tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

1371

1372

return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);

1372

return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);

1373

}

1373

}

1374

1375

static inline void update_load_add(struct load_weight *lw, unsigned long inc)

1375

static inline void update_load_add(struct load_weight *lw, unsigned long inc)

1376

{

1376

{

1377

lw->weight += inc;

1377

lw->weight += inc;

1378

lw->inv_weight = 0;

1378

lw->inv_weight = 0;

1379

}

1379

}

1380

1381

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)

1381

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)

1382

{

1382

{

1383

lw->weight -= dec;

1383

lw->weight -= dec;

1384

lw->inv_weight = 0;

1384

lw->inv_weight = 0;

1385

}

1385

}

1386

1387

static inline void update_load_set(struct load_weight *lw, unsigned long w)

1387

static inline void update_load_set(struct load_weight *lw, unsigned long w)

1388

{

1388

{

1389

lw->weight = w;

1389

lw->weight = w;

1390

lw->inv_weight = 0;

1390

lw->inv_weight = 0;

1391

}

1391

}

1392

1393

/*
 * To aid in avoiding the subversion of "niceness" due to uneven distribution
 * of tasks with abnormal "nice" values across CPUs the contribution that
 * each task makes to its run queue's load is weighted according to its
 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
 * scaled version of the new time slice allocation that they receive on time
 * slice expiry etc.
 */

#define WEIGHT_IDLEPRIO		3
#define WMULT_IDLEPRIO		1431655765

1404

1405

/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 *
 * The table has one entry per nice level, from -20 (index 0) to
 * +19 (index 39).
 */
static const int prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};

1427

1428

/*

1428

/*

1429

* Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.

1429

* Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.

1430

*

1430

*

1431

* In cases where the weight does not change often, we can use the

1431

* In cases where the weight does not change often, we can use the

1432

* precalculated inverse to speed up arithmetics by turning divisions

1432

* precalculated inverse to speed up arithmetics by turning divisions

1433

* into multiplications:

1433

* into multiplications:

1434

*/

1434

*/

1435

static const u32 prio_to_wmult[40] = {

1435

static const u32 prio_to_wmult[40] = {

1436

/* -20 */ 48388, 59856, 76040, 92818, 118348,

1436

/* -20 */ 48388, 59856, 76040, 92818, 118348,

1437

/* -15 */ 147320, 184698, 229616, 287308, 360437,

1437

/* -15 */ 147320, 184698, 229616, 287308, 360437,

1438

/* -10 */ 449829, 563644, 704093, 875809, 1099582,

1438

/* -10 */ 449829, 563644, 704093, 875809, 1099582,

1439

/* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,

1439

/* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,

1440

/* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,

1440

/* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,

1441

/* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,

1441

/* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,

1442

/* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,

1442

/* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,

1443

/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,

1443

/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,

1444

};

1444

};

1445

1446

/* Time spent by the tasks of the cpu accounting group executing in ... */
enum cpuacct_stat_index {
	CPUACCT_STAT_USER,	/* ... user mode */
	CPUACCT_STAT_SYSTEM,	/* ... kernel mode */

	CPUACCT_STAT_NSTATS,	/* number of stat entries above; keep last */
};

1453

1454

#ifdef CONFIG_CGROUP_CPUACCT

1454

#ifdef CONFIG_CGROUP_CPUACCT

1455

static void cpuacct_charge(struct task_struct *tsk, u64 cputime);

1455

static void cpuacct_charge(struct task_struct *tsk, u64 cputime);

1456

static void cpuacct_update_stats(struct task_struct *tsk,

1456

static void cpuacct_update_stats(struct task_struct *tsk,

1457

enum cpuacct_stat_index idx, cputime_t val);

1457

enum cpuacct_stat_index idx, cputime_t val);

1458

#else

1458

#else

1459

static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}

1459

static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}

1460

static inline void cpuacct_update_stats(struct task_struct *tsk,

1460

static inline void cpuacct_update_stats(struct task_struct *tsk,

1461

enum cpuacct_stat_index idx, cputime_t val) {}

1461

enum cpuacct_stat_index idx, cputime_t val) {}

1462

#endif

1462

#endif

1463

1464

static inline void inc_cpu_load(struct rq *rq, unsigned long load)

1464

static inline void inc_cpu_load(struct rq *rq, unsigned long load)

1465

{

1465

{

1466

update_load_add(&rq->load, load);

1466

update_load_add(&rq->load, load);

1467

}

1467

}

1468

1469

static inline void dec_cpu_load(struct rq *rq, unsigned long load)

1469

static inline void dec_cpu_load(struct rq *rq, unsigned long load)

1470

{

1470

{

1471

update_load_sub(&rq->load, load);

1471

update_load_sub(&rq->load, load);

1472

}

1472

}

1473

1474

#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)

1474

#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)

1475

typedef int (*tg_visitor)(struct task_group *, void *);

1475

typedef int (*tg_visitor)(struct task_group *, void *);

1476

1477

/*

1477

/*

1478

* Iterate the full tree, calling @down when first entering a node and @up when

1478

* Iterate the full tree, calling @down when first entering a node and @up when

1479

* leaving it for the final time.

1479

* leaving it for the final time.

1480

*/

1480

*/

1481

static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)

1481

static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)

1482

{

1482

{

1483

struct task_group *parent, *child;

1483

struct task_group *parent, *child;

1484

int ret;

1484

int ret;

1485

1486

rcu_read_lock();

1486

rcu_read_lock();

1487

parent = &root_task_group;

1487

parent = &root_task_group;

1488

down:

1488

down:

1489

ret = (*down)(parent, data);

1489

ret = (*down)(parent, data);

1490

if (ret)

1490

if (ret)

1491

goto out_unlock;

1491

goto out_unlock;

1492

list_for_each_entry_rcu(child, &parent->children, siblings) {

1492

list_for_each_entry_rcu(child, &parent->children, siblings) {

1493

parent = child;

1493

parent = child;

1494

goto down;

1494

goto down;

1495

1496

up:

1496

up:

1497

continue;

1497

continue;

1498

}

1498

}

1499

ret = (*up)(parent, data);

1499

ret = (*up)(parent, data);

1500

if (ret)

1500

if (ret)

1501

goto out_unlock;

1501

goto out_unlock;

1502

1503

child = parent;

1503

child = parent;

1504

parent = parent->parent;

1504

parent = parent->parent;

1505

if (parent)

1505

if (parent)

1506

goto up;

1506

goto up;

1507

out_unlock:

1507

out_unlock:

1508

rcu_read_unlock();

1508

rcu_read_unlock();

1509

1510

return ret;

1510

return ret;

1511

}

1511

}

1512

1513

/*
 * tg_nop - do-nothing visitor matching the tg_visitor signature.
 *
 * Ignores both arguments and always returns 0.
 */
static int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}

1517

#endif

1517

#endif

1518

1519

#ifdef CONFIG_SMP

1519

#ifdef CONFIG_SMP

1520

/* Used instead of source_load when we know the type == 0 */

1520

/* Used instead of source_load when we know the type == 0 */

1521

static unsigned long weighted_cpuload(const int cpu)

1521

static unsigned long weighted_cpuload(const int cpu)

1522

{

1522

{

1523

return cpu_rq(cpu)->load.weight;

1523

return cpu_rq(cpu)->load.weight;

1524

}

1524

}

1525

1526

/*

1526

/*

1527

* Return a low guess at the load of a migration-source cpu weighted

1527

* Return a low guess at the load of a migration-source cpu weighted

1528

* according to the scheduling class and "nice" value.

1528

* according to the scheduling class and "nice" value.

1529

*

1529

*

1530

* We want to under-estimate the load of migration sources, to

1530

* We want to under-estimate the load of migration sources, to

1531

* balance conservatively.

1531

* balance conservatively.

1532

*/

1532

*/

1533

static unsigned long source_load(int cpu, int type)

1533

static unsigned long source_load(int cpu, int type)

1534

{

1534

{

1535

struct rq *rq = cpu_rq(cpu);

1535

struct rq *rq = cpu_rq(cpu);

1536

unsigned long total = weighted_cpuload(cpu);

1536

unsigned long total = weighted_cpuload(cpu);

1537

1538

if (type == 0 || !sched_feat(LB_BIAS))

1538

if (type == 0 || !sched_feat(LB_BIAS))

1539

return total;

1539

return total;

1540

1541

return min(rq->cpu_load[type-1], total);

1541

return min(rq->cpu_load[type-1], total);

1542

}

1542

}

1543

1544

/*

1544

/*

1545

* Return a high guess at the load of a migration-target cpu weighted

1545

* Return a high guess at the load of a migration-target cpu weighted

1546

* according to the scheduling class and "nice" value.

1546

* according to the scheduling class and "nice" value.

1547

*/

1547

*/

1548

static unsigned long target_load(int cpu, int type)

1548

static unsigned long target_load(int cpu, int type)

1549

{

1549

{

1550

struct rq *rq = cpu_rq(cpu);

1550

struct rq *rq = cpu_rq(cpu);

1551

unsigned long total = weighted_cpuload(cpu);

1551

unsigned long total = weighted_cpuload(cpu);

1552

1553

if (type == 0 || !sched_feat(LB_BIAS))

1553

if (type == 0 || !sched_feat(LB_BIAS))

1554

return total;

1554

return total;

1555

1556

return max(rq->cpu_load[type-1], total);

1556

return max(rq->cpu_load[type-1], total);

1557

}

1557

}

1558

1559

static unsigned long power_of(int cpu)

1559

static unsigned long power_of(int cpu)

1560

{

1560

{

1561

return cpu_rq(cpu)->cpu_power;

1561

return cpu_rq(cpu)->cpu_power;

1562

}

1562

}

1563

1564

/* Forward declaration; the definition is not in this part of the file. */
static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);

1565

1566

static unsigned long cpu_avg_load_per_task(int cpu)

1566

static unsigned long cpu_avg_load_per_task(int cpu)

1567

{

1567

{

1568

struct rq *rq = cpu_rq(cpu);

1568

struct rq *rq = cpu_rq(cpu);

1569

unsigned long nr_running = ACCESS_ONCE(rq->nr_running);

1569

unsigned long nr_running = ACCESS_ONCE(rq->nr_running);

1570

1571

if (nr_running)

1571

if (nr_running)

1572

rq->avg_load_per_task = rq->load.weight / nr_running;

1572

rq->avg_load_per_task = rq->load.weight / nr_running;

1573

else

1573

else

1574

rq->avg_load_per_task = 0;

1574

rq->avg_load_per_task = 0;

1575

1576

return rq->avg_load_per_task;

1576

return rq->avg_load_per_task;

1577

}

1577

}

1578

1579

#ifdef CONFIG_PREEMPT

1579

#ifdef CONFIG_PREEMPT

1580

1581

/*
 * Forward declaration: the CONFIG_PREEMPT _double_lock_balance() calls
 * double_rq_lock(), which is defined further down in the file.
 */
static void double_rq_lock(struct rq *rq1, struct rq *rq2);

1582

1583

/*

1583

/*

1584

* fair double_lock_balance: Safely acquires both rq->locks in a fair

1584

* fair double_lock_balance: Safely acquires both rq->locks in a fair

1585

* way at the expense of forcing extra atomic operations in all

1585

* way at the expense of forcing extra atomic operations in all

1586

* invocations. This assures that the double_lock is acquired using the

1586

* invocations. This assures that the double_lock is acquired using the

1587

* same underlying policy as the spinlock_t on this architecture, which

1587

* same underlying policy as the spinlock_t on this architecture, which

1588

* reduces latency compared to the unfair variant below. However, it

1588

* reduces latency compared to the unfair variant below. However, it

1589

* also adds more overhead and therefore may reduce throughput.

1589

* also adds more overhead and therefore may reduce throughput.

1590

*/

1590

*/

1591

static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)

1591

static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)

1592

__releases(this_rq->lock)

1592

__releases(this_rq->lock)

1593

__acquires(busiest->lock)

1593

__acquires(busiest->lock)

1594

__acquires(this_rq->lock)

1594

__acquires(this_rq->lock)

1595

{

1595

{

1596

raw_spin_unlock(&this_rq->lock);

1596

raw_spin_unlock(&this_rq->lock);

1597

double_rq_lock(this_rq, busiest);

1597

double_rq_lock(this_rq, busiest);

1598

1599

return 1;

1599

return 1;

1600

}

1600

}

1601

1602

#else

1602

#else

1603

/*

1603

/*

1604

* Unfair double_lock_balance: Optimizes throughput at the expense of

1604

* Unfair double_lock_balance: Optimizes throughput at the expense of

1605

* latency by eliminating extra atomic operations when the locks are

1605

* latency by eliminating extra atomic operations when the locks are

1606

* already in proper order on entry. This favors lower cpu-ids and will

1606

* already in proper order on entry. This favors lower cpu-ids and will

1607

* grant the double lock to lower cpus over higher ids under contention,

1607

* grant the double lock to lower cpus over higher ids under contention,

1608

* regardless of entry order into the function.

1608

* regardless of entry order into the function.

1609

*/

1609

*/

1610

static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)

1610

static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)

1611

__releases(this_rq->lock)

1611

__releases(this_rq->lock)

1612

__acquires(busiest->lock)

1612

__acquires(busiest->lock)

1613

__acquires(this_rq->lock)

1613

__acquires(this_rq->lock)

1614

{

1614

{

1615

int ret = 0;

1615

int ret = 0;

1616

1617

if (unlikely(!raw_spin_trylock(&busiest->lock))) {

1617

if (unlikely(!raw_spin_trylock(&busiest->lock))) {

1618

if (busiest < this_rq) {

1618

if (busiest < this_rq) {

1619

raw_spin_unlock(&this_rq->lock);

1619

raw_spin_unlock(&this_rq->lock);

1620

raw_spin_lock(&busiest->lock);

1620

raw_spin_lock(&busiest->lock);

1621

raw_spin_lock_nested(&this_rq->lock,

1621

raw_spin_lock_nested(&this_rq->lock,

1622

SINGLE_DEPTH_NESTING);

1622

SINGLE_DEPTH_NESTING);

1623

ret = 1;

1623

ret = 1;

1624

} else

1624

} else

1625

raw_spin_lock_nested(&busiest->lock,

1625

raw_spin_lock_nested(&busiest->lock,

1626

SINGLE_DEPTH_NESTING);

1626

SINGLE_DEPTH_NESTING);

1627

}

1627

}

1628

return ret;

1628

return ret;

1629

}

1629

}

1630

1631

#endif /* CONFIG_PREEMPT */

1631

#endif /* CONFIG_PREEMPT */

1632

1633

/*

1633

/*

1634

* double_lock_balance - lock the busiest runqueue, this_rq is locked already.

1634

* double_lock_balance - lock the busiest runqueue, this_rq is locked already.

1635

*/

1635

*/

1636

static int double_lock_balance(struct rq *this_rq, struct rq *busiest)

1636

static int double_lock_balance(struct rq *this_rq, struct rq *busiest)

1637

{

1637

{

1638

if (unlikely(!irqs_disabled())) {

1638

if (unlikely(!irqs_disabled())) {

1639

/* printk() doesn't work good under rq->lock */

1639

/* printk() doesn't work good under rq->lock */

1640

raw_spin_unlock(&this_rq->lock);

1640

raw_spin_unlock(&this_rq->lock);

1641

BUG_ON(1);

1641

BUG_ON(1);

1642

}

1642

}

1643

1644

return _double_lock_balance(this_rq, busiest);

1644

return _double_lock_balance(this_rq, busiest);

1645

}

1645

}

1646

1647

static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)

1647

static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)

1648

__releases(busiest->lock)

1648

__releases(busiest->lock)

1649

{

1649

{

1650

raw_spin_unlock(&busiest->lock);

1650

raw_spin_unlock(&busiest->lock);

1651

lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);

1651

lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);

1652

}

1652

}

1653

1654

/*

1654

/*

1655

* double_rq_lock - safely lock two runqueues

1655

* double_rq_lock - safely lock two runqueues

1656

*

1656

*

1657

* Note this does not disable interrupts like task_rq_lock,

1657

* Note this does not disable interrupts like task_rq_lock,

1658

* you need to do so manually before calling.

1658

* you need to do so manually before calling.

1659

*/

1659

*/

1660

static void double_rq_lock(struct rq *rq1, struct rq *rq2)

1660

static void double_rq_lock(struct rq *rq1, struct rq *rq2)

1661

__acquires(rq1->lock)

1661

__acquires(rq1->lock)

1662

__acquires(rq2->lock)

1662

__acquires(rq2->lock)

1663

{

1663

{

1664

BUG_ON(!irqs_disabled());

1664

BUG_ON(!irqs_disabled());

1665

if (rq1 == rq2) {

1665

if (rq1 == rq2) {

1666

raw_spin_lock(&rq1->lock);

1666

raw_spin_lock(&rq1->lock);

1667

__acquire(rq2->lock); /* Fake it out ;) */

1667

__acquire(rq2->lock); /* Fake it out ;) */

1668

} else {

1668

} else {

1669

if (rq1 < rq2) {

1669

if (rq1 < rq2) {

1670

raw_spin_lock(&rq1->lock);

1670

raw_spin_lock(&rq1->lock);

1671

raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);

1671

raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);

1672

} else {

1672

} else {

1673

raw_spin_lock(&rq2->lock);

1673

raw_spin_lock(&rq2->lock);

1674

raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);

1674

raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);

1675

}

1675

}

1676

}

1676

}

1677

}

1677

}

1678

1679

/*

1679

/*

1680

* double_rq_unlock - safely unlock two runqueues

1680

* double_rq_unlock - safely unlock two runqueues

1681

*

1681

*

1682

* Note this does not restore interrupts like task_rq_unlock,

1682

* Note this does not restore interrupts like task_rq_unlock,

1683

* you need to do so manually after calling.

1683

* you need to do so manually after calling.

1684

*/

1684

*/

1685

static void double_rq_unlock(struct rq *rq1, struct rq *rq2)

1685

static void double_rq_unlock(struct rq *rq1, struct rq *rq2)

1686

__releases(rq1->lock)

1686

__releases(rq1->lock)

1687

__releases(rq2->lock)

1687

__releases(rq2->lock)

1688

{

1688

{

1689

raw_spin_unlock(&rq1->lock);

1689

raw_spin_unlock(&rq1->lock);

1690

if (rq1 != rq2)

1690

if (rq1 != rq2)

1691

raw_spin_unlock(&rq2->lock);

1691

raw_spin_unlock(&rq2->lock);

1692

else

1692

else

1693

__release(rq2->lock);

1693

__release(rq2->lock);

1694

}

1694

}

1695

1696

#else /* CONFIG_SMP */

1696

#else /* CONFIG_SMP */

1697

/*
 * double_rq_lock - safely lock two runqueues
 *
 * Note this does not disable interrupts like task_rq_lock,
 * you need to do so manually before calling.
 *
 * !CONFIG_SMP variant: both arguments must be the same runqueue, so a
 * single lock is taken and the second acquire is annotation only.
 */
static void double_rq_lock(struct rq *rq1, struct rq *rq2)
	__acquires(rq1->lock)
	__acquires(rq2->lock)
{
	BUG_ON(!irqs_disabled());
	BUG_ON(rq1 != rq2);
	raw_spin_lock(&rq1->lock);
	__acquire(rq2->lock);	/* Fake it out ;) */
}

1713

/*
 * double_rq_unlock - safely unlock two runqueues
 *
 * Note this does not restore interrupts like task_rq_unlock,
 * you need to do so manually after calling.
 *
 * !CONFIG_SMP variant: both arguments must be the same runqueue, so a
 * single lock is dropped and the second release is annotation only.
 */
static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
	__releases(rq1->lock)
	__releases(rq2->lock)
{
	BUG_ON(rq1 != rq2);
	raw_spin_unlock(&rq1->lock);
	__release(rq2->lock);
}

1728

1729

#endif

1729

#endif

1730

1731

static void calc_load_account_idle(struct rq *this_rq);

1731

static void calc_load_account_idle(struct rq *this_rq);

1732

static void update_sysctl(void);

1732

static void update_sysctl(void);

1733

static int get_update_sysctl_factor(void);

1733

static int get_update_sysctl_factor(void);

1734

static void update_cpu_load(struct rq *this_rq);

1734

static void update_cpu_load(struct rq *this_rq);

1735

/*
 * Point task @p at @cpu: update its runqueue association via set_task_rq()
 * and, on SMP, publish the new CPU number in its thread_info.
 */
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
	set_task_rq(p, cpu);
#ifdef CONFIG_SMP
	/*
	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
	 * successfully executed on another CPU. We must ensure that updates of
	 * per-task data have been completed by this moment.
	 */
	smp_wmb();
	task_thread_info(p)->cpu = cpu;
#endif
}

1749

1750

static const struct sched_class rt_sched_class;

1750

static const struct sched_class rt_sched_class;

1751

/* Head of the scheduling-class list; for_each_class() starts here. */
#define sched_class_highest (&stop_sched_class)

/*
 * Iterate @class over the scheduling-class list, starting at
 * sched_class_highest and following ->next until it is NULL.
 *
 * @class must be an assignable pointer expression.  It is parenthesized at
 * every use so that a non-trivial lvalue expression expands correctly.
 */
#define for_each_class(class) \
	for ((class) = sched_class_highest; (class); (class) = (class)->next)

1755

1756

#include "sched_stats.h"

1756

#include "sched_stats.h"

1757

1758

static void inc_nr_running(struct rq *rq)

1758

static void inc_nr_running(struct rq *rq)

1759

{

1759

{

1760

rq->nr_running++;

1760

rq->nr_running++;

1761

}

1761

}

1762

1763

static void dec_nr_running(struct rq *rq)

1763

static void dec_nr_running(struct rq *rq)

1764

{

1764

{

1765

rq->nr_running--;

1765

rq->nr_running--;

1766

}

1766

}

1767

1768

static void set_load_weight(struct task_struct *p)

1768

static void set_load_weight(struct task_struct *p)

1769

{

1769

{

1770

int prio = p->static_prio - MAX_RT_PRIO;

1770

int prio = p->static_prio - MAX_RT_PRIO;

1771

struct load_weight *load = &p->se.load;

1771

struct load_weight *load = &p->se.load;

1772

1773

/*

1773

/*

1774

* SCHED_IDLE tasks get minimal weight:

1774

* SCHED_IDLE tasks get minimal weight:

1775

*/

1775

*/

1776

if (p->policy == SCHED_IDLE) {

1776

if (p->policy == SCHED_IDLE) {

1777

load->weight = scale_load(WEIGHT_IDLEPRIO);

1777

load->weight = scale_load(WEIGHT_IDLEPRIO);

1778

load->inv_weight = WMULT_IDLEPRIO;

1778

load->inv_weight = WMULT_IDLEPRIO;

1779

return;

1779

return;

1780

}

1780

}

1781

1782

load->weight = scale_load(prio_to_weight[prio]);

1782

load->weight = scale_load(prio_to_weight[prio]);

1783

load->inv_weight = prio_to_wmult[prio];

1783

load->inv_weight = prio_to_wmult[prio];

1784

}

1784

}

1785

/*
 * Queue @p on @rq: refresh the runqueue clock, record the queueing in the
 * sched_info statistics, then let the task's scheduling class enqueue it.
 */
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_queued(p);
	p->sched_class->enqueue_task(rq, p, flags);
}

1792

/*
 * Remove @p from @rq: refresh the runqueue clock, record the dequeue in the
 * sched_info statistics, then let the task's scheduling class dequeue it.
 */
static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	sched_info_dequeued(p);
	p->sched_class->dequeue_task(rq, p, flags);
}

1799

/*
 * activate_task - move a task to the runqueue.
 *
 * A task that was counted in rq->nr_uninterruptible is taken out of that
 * count before it is enqueued and added to nr_running.
 */
static void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
	inc_nr_running(rq);
}

1811

/*
 * deactivate_task - remove a task from the runqueue.
 *
 * A task that contributes to load is added to rq->nr_uninterruptible
 * before it is dequeued and removed from nr_running.
 */
static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
	dec_nr_running(rq);
}

1823

1824

#ifdef CONFIG_IRQ_TIME_ACCOUNTING

1824

#ifdef CONFIG_IRQ_TIME_ACCOUNTING

1825

/*
 * There are no locks covering percpu hardirq/softirq time.
 * They are only modified in account_system_vtime, on corresponding CPU
 * with interrupts disabled. So, writes are safe.
 * They are read and saved off onto struct rq in update_rq_clock().
 * This may result in other CPU reading this CPU's irq time and can
 * race with irq/account_system_vtime on this CPU. We would either get old
 * or new value with a side effect of accounting a slice of irq time to wrong
 * task when irq is in progress while we read rq->clock. That is a worthy
 * compromise in place of having locks on each irq in account_system_time.
 */
/* Accumulated hardirq / softirq time per CPU. */
static DEFINE_PER_CPU(u64, cpu_hardirq_time);
static DEFINE_PER_CPU(u64, cpu_softirq_time);

/* Per-CPU timestamp from which the next irq-time delta is measured. */
static DEFINE_PER_CPU(u64, irq_start_time);
/* Non-zero enables irq time accounting in account_system_vtime(). */
static int sched_clock_irqtime;

1842

/* Turn irq time accounting on by setting sched_clock_irqtime. */
void enable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 1;
}

1847

/* Turn irq time accounting off by clearing sched_clock_irqtime. */
void disable_sched_clock_irqtime(void)
{
	sched_clock_irqtime = 0;
}

1852

1853

#ifndef CONFIG_64BIT

1853

#ifndef CONFIG_64BIT

/*
 * Per-CPU sequence counter bracketing updates of the irq time counters;
 * used by irq_time_write_begin()/irq_time_write_end() and irq_time_read()
 * when CONFIG_64BIT is not set.
 */
static DEFINE_PER_CPU(seqcount_t, irq_time_seq);

1855

/*
 * Open a write section on this CPU's irq_time_seq: bump the sequence
 * first, then issue a write barrier before the counters are modified.
 */
static inline void irq_time_write_begin(void)
{
	__this_cpu_inc(irq_time_seq.sequence);
	smp_wmb();
}

1861

/*
 * Close a write section on this CPU's irq_time_seq: issue a write barrier
 * after the counter updates, then bump the sequence again.
 */
static inline void irq_time_write_end(void)
{
	smp_wmb();
	__this_cpu_inc(irq_time_seq.sequence);
}

1867

1868

static inline u64 irq_time_read(int cpu)

1868

static inline u64 irq_time_read(int cpu)

1869

{

1869

{

1870

u64 irq_time;

1870

u64 irq_time;

1871

unsigned seq;

1871

unsigned seq;

1872

1873

do {

1873

do {

1874

seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));

1874

seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));

1875

irq_time = per_cpu(cpu_softirq_time, cpu) +

1875

irq_time = per_cpu(cpu_softirq_time, cpu) +

1876

per_cpu(cpu_hardirq_time, cpu);

1876

per_cpu(cpu_hardirq_time, cpu);

1877

} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));

1877

} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));

1878

1879

return irq_time;

1879

return irq_time;

1880

}

1880

}

1881

#else /* CONFIG_64BIT */

1881

#else /* CONFIG_64BIT */

/* CONFIG_64BIT variant: the write-side bracket is intentionally empty. */
static inline void irq_time_write_begin(void)
{
}

1885

/* CONFIG_64BIT variant: the write-side bracket is intentionally empty. */
static inline void irq_time_write_end(void)
{
}

1889

/*
 * CONFIG_64BIT variant: return the sum of @cpu's softirq and hardirq time,
 * read directly without a sequence counter.
 */
static inline u64 irq_time_read(int cpu)
{
	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
}

1894

#endif /* CONFIG_64BIT */

1894

#endif /* CONFIG_64BIT */

1895

/*
 * Called before incrementing preempt_count on {soft,}irq_enter
 * and before decrementing preempt_count on {soft,}irq_exit.
 *
 * Adds the time elapsed on this CPU since the previous call to the per-CPU
 * hardirq or softirq counter, depending on the current context.  Does
 * nothing while sched_clock_irqtime is zero.
 */
void account_system_vtime(struct task_struct *curr)
{
	unsigned long flags;
	s64 delta;
	int cpu;

	if (!sched_clock_irqtime)
		return;

	local_irq_save(flags);

	cpu = smp_processor_id();
	/* Elapsed time since the last stamp; then move the stamp forward by it. */
	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
	__this_cpu_add(irq_start_time, delta);

	irq_time_write_begin();
	/*
	 * We do not account for softirq time from ksoftirqd here.
	 * We want to continue accounting softirq time to ksoftirqd thread
	 * in that case, so as not to confuse scheduler with a special task
	 * that do not consume any time, but still wants to run.
	 */
	if (hardirq_count())
		__this_cpu_add(cpu_hardirq_time, delta);
	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
		__this_cpu_add(cpu_softirq_time, delta);

	irq_time_write_end();
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(account_system_vtime);

1931

1932

#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

1932

#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

1933

1934

#ifdef CONFIG_PARAVIRT

1934

#ifdef CONFIG_PARAVIRT

/*
 * Convert a steal time @steal into a count of TICK_NSEC units.
 *
 * Values above NSEC_PER_SEC use div_u64(); smaller values use
 * __iter_div_u64_rem() (presumably the cheaper path for small quotients --
 * confirm against its definition).
 */
static inline u64 steal_ticks(u64 steal)
{
	if (unlikely(steal > NSEC_PER_SEC))
		return div_u64(steal, TICK_NSEC);

	return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
}

1942

#endif

1942

#endif

1943

/*
 * Advance rq->clock_task by @delta, minus whatever part of @delta is
 * attributed to irq time (CONFIG_IRQ_TIME_ACCOUNTING) and to stolen time
 * (CONFIG_PARAVIRT_TIME_ACCOUNTING).  The subtracted time is fed to
 * sched_rt_avg_update() when the NONTASK_POWER feature is enabled.
 */
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compiler should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	s64 steal = 0, irq_delta = 0;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight miss-attribution of {soft,}irq
	 * time, a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_branch((&paravirt_steal_rq_enabled))) {
		u64 st;

		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		/* Never take more than what is left of delta. */
		if (unlikely(steal > delta))
			steal = delta;

		/* Round the stolen time to a whole number of TICK_NSEC units. */
		st = steal_ticks(steal);
		steal = st * TICK_NSEC;

		rq->prev_steal_time_rq += steal;

		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
		sched_rt_avg_update(rq, irq_delta + steal);
#endif
}

2003

2004

#ifdef CONFIG_IRQ_TIME_ACCOUNTING

2004

#ifdef CONFIG_IRQ_TIME_ACCOUNTING

2005

static int irqtime_account_hi_update(void)

2005

static int irqtime_account_hi_update(void)

2006

{

2006

{

2007

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

2007

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

2008

unsigned long flags;

2008

unsigned long flags;

2009

u64 latest_ns;

2009

u64 latest_ns;

2010

int ret = 0;

2010

int ret = 0;

2011

2012

local_irq_save(flags);

2012

local_irq_save(flags);

2013

latest_ns = this_cpu_read(cpu_hardirq_time);

2013

latest_ns = this_cpu_read(cpu_hardirq_time);

2014

if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))

2014

if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))

2015

ret = 1;

2015

ret = 1;

2016

local_irq_restore(flags);

2016

local_irq_restore(flags);

2017

return ret;

2017

return ret;

2018

}

2018

}

2019

2020

static int irqtime_account_si_update(void)

2020

static int irqtime_account_si_update(void)

2021

{

2021

{

2022

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

2022

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

2023

unsigned long flags;

2023

unsigned long flags;

2024

u64 latest_ns;

2024

u64 latest_ns;

2025

int ret = 0;

2025

int ret = 0;

2026

2027

local_irq_save(flags);

2027

local_irq_save(flags);

2028

latest_ns = this_cpu_read(cpu_softirq_time);

2028

latest_ns = this_cpu_read(cpu_softirq_time);

2029

if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))

2029

if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))

2030

ret = 1;

2030

ret = 1;

2031

local_irq_restore(flags);

2031

local_irq_restore(flags);

2032

return ret;

2032

return ret;

2033

}

2033

}

2034

2035

#else /* CONFIG_IRQ_TIME_ACCOUNTING */

2035

#else /* CONFIG_IRQ_TIME_ACCOUNTING */

2036

/* Without CONFIG_IRQ_TIME_ACCOUNTING, irq time accounting is constant-off. */
#define sched_clock_irqtime (0)

2038

2039

#endif

2039

#endif

2040

2041

#include "sched_idletask.c"

2041

#include "sched_idletask.c"

2042

#include "sched_fair.c"

2042

#include "sched_fair.c"

2043

#include "sched_rt.c"

2043

#include "sched_rt.c"

2044

#include "sched_autogroup.c"

2044

#include "sched_autogroup.c"

2045

#include "sched_stoptask.c"

2045

#include "sched_stoptask.c"

2046

#ifdef CONFIG_SCHED_DEBUG

2046

#ifdef CONFIG_SCHED_DEBUG

2047

# include "sched_debug.c"

2047

# include "sched_debug.c"

2048

#endif

2048

#endif

2049

/*
 * Install @stop as the stop task of @cpu's runqueue; a NULL @stop clears it.
 *
 * A new stop task is first made to look like a top-priority SCHED_FIFO task
 * and then moved to stop_sched_class.  Any previously installed stop task is
 * switched to rt_sched_class.
 */
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *old_stop = cpu_rq(cpu)->stop;

	if (stop) {
		/*
		 * Make it appear like a SCHED_FIFO task, it's something
		 * userspace knows about and won't get confused about.
		 *
		 * Also, it will make PI more or less work without too
		 * much confusion -- but then, stop work should not
		 * rely on PI working anyway.
		 */
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);

		stop->sched_class = &stop_sched_class;
	}

	cpu_rq(cpu)->stop = stop;

	if (old_stop) {
		/*
		 * Reset it back to a normal scheduling class so that
		 * it can die in pieces.
		 */
		old_stop->sched_class = &rt_sched_class;
	}
}

2079

/*
 * __normal_prio - return the priority that is based on the static prio
 *
 * This is simply p->static_prio.
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

2087

2088

/*

2088

/*

2089

* Calculate the expected normal priority: i.e. priority

2089

* Calculate the expected normal priority: i.e. priority

2090

* without taking RT-inheritance into account. Might be

2090

* without taking RT-inheritance into account. Might be

2091

* boosted by interactivity modifiers. Changes upon fork,

2091

* boosted by interactivity modifiers. Changes upon fork,

2092

* setprio syscalls, and whenever the interactivity

2092

* setprio syscalls, and whenever the interactivity

2093

* estimator recalculates.

2093

* estimator recalculates.

2094

*/

2094

*/

2095

static inline int normal_prio(struct task_struct *p)

2095

static inline int normal_prio(struct task_struct *p)

2096

{

2096

{

2097

int prio;

2097

int prio;

2098

2099

if (task_has_rt_policy(p))

2099

if (task_has_rt_policy(p))

2100

prio = MAX_RT_PRIO-1 - p->rt_priority;

2100

prio = MAX_RT_PRIO-1 - p->rt_priority;

2101

else

2101

else

2102

prio = __normal_prio(p);

2102

prio = __normal_prio(p);

2103

return prio;

2103

return prio;

2104

}

2104

}

2105

2106

/*

2106

/*

2107

* Calculate the current priority, i.e. the priority

2107

* Calculate the current priority, i.e. the priority

2108

* taken into account by the scheduler. This value might

2108

* taken into account by the scheduler. This value might

2109

* be boosted by RT tasks, or might be boosted by

2109

* be boosted by RT tasks, or might be boosted by

2110

* interactivity modifiers. Will be RT if the task got

2110

* interactivity modifiers. Will be RT if the task got

2111

* RT-boosted. If not then it returns p->normal_prio.

2111

* RT-boosted. If not then it returns p->normal_prio.

2112

*/

2112

*/

2113

static int effective_prio(struct task_struct *p)

2113

static int effective_prio(struct task_struct *p)

2114

{

2114

{

2115

p->normal_prio = normal_prio(p);

2115

p->normal_prio = normal_prio(p);

2116

/*

2116

/*

2117

* If we are RT tasks or we were boosted to RT priority,

2117

* If we are RT tasks or we were boosted to RT priority,

2118

* keep the priority unchanged. Otherwise, update priority

2118

* keep the priority unchanged. Otherwise, update priority

2119

* to the normal priority:

2119

* to the normal priority:

2120

*/

2120

*/

2121

if (!rt_prio(p->prio))

2121

if (!rt_prio(p->prio))

2122

return p->normal_prio;

2122

return p->normal_prio;

2123

return p->prio;

2123

return p->prio;

2124

}

2124

}

2125

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Returns non-zero when @p is the current task of the CPU that
 * task_cpu(@p) reports.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

2134

2135

static inline void check_class_changed(struct rq *rq, struct task_struct *p,

2135

static inline void check_class_changed(struct rq *rq, struct task_struct *p,

2136

const struct sched_class *prev_class,

2136

const struct sched_class *prev_class,

2137

int oldprio)

2137

int oldprio)

2138

{

2138

{

2139

if (prev_class != p->sched_class) {

2139

if (prev_class != p->sched_class) {

2140

if (prev_class->switched_from)

2140

if (prev_class->switched_from)

2141

prev_class->switched_from(rq, p);

2141

prev_class->switched_from(rq, p);

2142

p->sched_class->switched_to(rq, p);

2142

p->sched_class->switched_to(rq, p);

2143

} else if (oldprio != p->prio)

2143

} else if (oldprio != p->prio)

2144

p->sched_class->prio_changed(rq, p, oldprio);

2144

p->sched_class->prio_changed(rq, p, oldprio);

2145

}

2145

}

2146

/*
 * Decide whether @p should preempt the task currently running on @rq.
 *
 * When both tasks share a scheduling class, that class decides.  Otherwise
 * the class list is walked from sched_class_highest: if @p's class is found
 * before the current task's class, the current task is rescheduled.
 */
static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	const struct sched_class *class;

	if (p->sched_class == rq->curr->sched_class) {
		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
	} else {
		for_each_class(class) {
			/* Current task's class comes first: no preemption. */
			if (class == rq->curr->sched_class)
				break;
			if (class == p->sched_class) {
				resched_task(rq->curr);
				break;
			}
		}
	}

	/*
	 * A queue event has occurred, and we're going to schedule.  In
	 * this case, we can save a useless back to back clock update.
	 */
	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
		rq->skip_clock_update = 1;
}

2171

2172

#ifdef CONFIG_SMP

2172

#ifdef CONFIG_SMP

2173

/*

2173

/*

2174

* Is this task likely cache-hot:

2174

* Is this task likely cache-hot:

2175

*/

2175

*/

2176

static int

2176

static int

2177

task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)

2177

task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)

2178

{

2178

{

2179

s64 delta;

2179

s64 delta;

2180

2181

if (p->sched_class != &fair_sched_class)

2181

if (p->sched_class != &fair_sched_class)

2182

return 0;

2182

return 0;

2183

2184

if (unlikely(p->policy == SCHED_IDLE))

2184

if (unlikely(p->policy == SCHED_IDLE))

2185

return 0;

2185

return 0;

2186

2187

/*

2187

/*

2188

* Buddy candidates are cache hot:

2188

* Buddy candidates are cache hot:

2189

*/

2189

*/

2190

if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&

2190

if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&

2191

(&p->se == cfs_rq_of(&p->se)->next ||

2191

(&p->se == cfs_rq_of(&p->se)->next ||

2192

&p->se == cfs_rq_of(&p->se)->last))

2192

&p->se == cfs_rq_of(&p->se)->last))

2193

return 1;

2193

return 1;

2194

2195

if (sysctl_sched_migration_cost == -1)

2195

if (sysctl_sched_migration_cost == -1)

2196

return 1;

2196

return 1;

2197

if (sysctl_sched_migration_cost == 0)

2197

if (sysctl_sched_migration_cost == 0)

2198

return 0;

2198

return 0;

2199

2200

delta = now - p->se.exec_start;

2200

delta = now - p->se.exec_start;

2201

2202

return delta < (s64)sysctl_sched_migration_cost;

2202

return delta < (s64)sysctl_sched_migration_cost;

2203

}

2203

}

2204

2205

void set_task_cpu(struct task_struct *p, unsigned int new_cpu)

2205

void set_task_cpu(struct task_struct *p, unsigned int new_cpu)

2206

{

2206

{

2207

#ifdef CONFIG_SCHED_DEBUG

2207

#ifdef CONFIG_SCHED_DEBUG

2208

/*

2208

/*

2209

* We should never call set_task_cpu() on a blocked task,

2209

* We should never call set_task_cpu() on a blocked task,

2210

* ttwu() will sort out the placement.

2210

* ttwu() will sort out the placement.

2211

*/

2211

*/

2212

WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&

2212

WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&

2213

!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));

2213

!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));

2214

2215

#ifdef CONFIG_LOCKDEP

2215

#ifdef CONFIG_LOCKDEP

2216

/*

2216

/*

2217

* The caller should hold either p->pi_lock or rq->lock, when changing

2217

* The caller should hold either p->pi_lock or rq->lock, when changing

2218

* a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.

2218

* a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.

2219

*

2219

*

2220

* sched_move_task() holds both and thus holding either pins the cgroup,

2220

* sched_move_task() holds both and thus holding either pins the cgroup,

2221

* see set_task_rq().

2221

* see set_task_rq().

2222

*

2222

*

2223

* Furthermore, all task_rq users should acquire both locks, see

2223

* Furthermore, all task_rq users should acquire both locks, see

2224

* task_rq_lock().

2224

* task_rq_lock().

2225

*/

2225

*/

2226

WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||

2226

WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||

2227

lockdep_is_held(&task_rq(p)->lock)));

2227

lockdep_is_held(&task_rq(p)->lock)));

2228

#endif

2228

#endif

2229

#endif

2229

#endif

2230

2231

trace_sched_migrate_task(p, new_cpu);

2231

trace_sched_migrate_task(p, new_cpu);

2232

2233

if (task_cpu(p) != new_cpu) {

2233

if (task_cpu(p) != new_cpu) {

2234

p->se.nr_migrations++;

2234

p->se.nr_migrations++;

2235

perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);

2235

perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);

2236

}

2236

}

2237

2238

__set_task_cpu(p, new_cpu);

2238

__set_task_cpu(p, new_cpu);

2239

}

2239

}

2240

2241

/* Defined later in this file; takes an opaque data pointer. */
static int migration_cpu_stop(void *data);

/*
 * Argument bundle pairing a task with a destination CPU.
 * NOTE(review): presumably the payload handed to migration_cpu_stop();
 * confirm against its definition.  Field order is kept as-is in case
 * callers use positional initializers.
 */
struct migration_arg {
	struct task_struct *task;	/* task the request refers to */
	int dest_cpu;			/* CPU the request targets */
};

2247

2248

/*

2248

/*

2249

* wait_task_inactive - wait for a thread to unschedule.

2249

* wait_task_inactive - wait for a thread to unschedule.

2250

*

2250

*

2251

* If @match_state is nonzero, it's the @p->state value just checked and

2251

* If @match_state is nonzero, it's the @p->state value just checked and

2252

* not expected to change. If it changes, i.e. @p might have woken up,

2252

* not expected to change. If it changes, i.e. @p might have woken up,

2253

* then return zero. When we succeed in waiting for @p to be off its CPU,

2253

* then return zero. When we succeed in waiting for @p to be off its CPU,

2254

* we return a positive number (its total switch count). If a second call

2254

* we return a positive number (its total switch count). If a second call

2255

* a short while later returns the same number, the caller can be sure that

2255

* a short while later returns the same number, the caller can be sure that

2256

* @p has remained unscheduled the whole time.

2256

* @p has remained unscheduled the whole time.

2257

*

2257

*

2258

* The caller must ensure that the task *will* unschedule sometime soon,

2258

* The caller must ensure that the task *will* unschedule sometime soon,

2259

* else this function might spin for a *long* time. This function can't

2259

* else this function might spin for a *long* time. This function can't

2260

* be called with interrupts off, or it may introduce deadlock with

2260

* be called with interrupts off, or it may introduce deadlock with

2261

* smp_call_function() if an IPI is sent by the same process we are

2261

* smp_call_function() if an IPI is sent by the same process we are

2262

* waiting to become inactive.

2262

* waiting to become inactive.

2263

*/

2263

*/

2264

unsigned long wait_task_inactive(struct task_struct *p, long match_state)

2264

unsigned long wait_task_inactive(struct task_struct *p, long match_state)

2265

{

2265

{

2266

unsigned long flags;

2266

unsigned long flags;

2267

int running, on_rq;

2267

int running, on_rq;

2268

unsigned long ncsw;

2268

unsigned long ncsw;

2269

struct rq *rq;

2269

struct rq *rq;

2270

2271

for (;;) {

2271

for (;;) {

2272

/*

2272

/*

2273

* We do the initial early heuristics without holding

2273

* We do the initial early heuristics without holding

2274

* any task-queue locks at all. We'll only try to get

2274

* any task-queue locks at all. We'll only try to get

2275

* the runqueue lock when things look like they will

2275

* the runqueue lock when things look like they will

2276

* work out!

2276

* work out!

2277

*/

2277

*/

2278

rq = task_rq(p);

2278

rq = task_rq(p);

2279

2280

/*

2280

/*

2281

* If the task is actively running on another CPU

2281

* If the task is actively running on another CPU

2282

* still, just relax and busy-wait without holding

2282

* still, just relax and busy-wait without holding

2283

* any locks.

2283

* any locks.

2284

*

2284

*

2285

* NOTE! Since we don't hold any locks, it's not

2285

* NOTE! Since we don't hold any locks, it's not

2286

* even sure that "rq" stays as the right runqueue!

2286

* even sure that "rq" stays as the right runqueue!

2287

* But we don't care, since "task_running()" will

2287

* But we don't care, since "task_running()" will

2288

* return false if the runqueue has changed and p

2288

* return false if the runqueue has changed and p

2289

* is actually now running somewhere else!

2289

* is actually now running somewhere else!

2290

*/

2290

*/

2291

while (task_running(rq, p)) {

2291

while (task_running(rq, p)) {

2292

if (match_state && unlikely(p->state != match_state))

2292

if (match_state && unlikely(p->state != match_state))

2293

return 0;

2293

return 0;

2294

cpu_relax();

2294

cpu_relax();

2295

}

2295

}

2296

2297

/*

2297

/*

2298

* Ok, time to look more closely! We need the rq

2298

* Ok, time to look more closely! We need the rq

2299

* lock now, to be *sure*. If we're wrong, we'll

2299

* lock now, to be *sure*. If we're wrong, we'll

2300

* just go back and repeat.

2300

* just go back and repeat.

2301

*/

2301

*/

2302

rq = task_rq_lock(p, &flags);

2302

rq = task_rq_lock(p, &flags);

2303

trace_sched_wait_task(p);

2303

trace_sched_wait_task(p);

2304

running = task_running(rq, p);

2304

running = task_running(rq, p);

2305

on_rq = p->on_rq;

2305

on_rq = p->on_rq;

2306

ncsw = 0;

2306

ncsw = 0;

2307

if (!match_state || p->state == match_state)

2307

if (!match_state || p->state == match_state)

2308

ncsw = p->nvcsw | LONG_MIN; /* sets MSB */

2308

ncsw = p->nvcsw | LONG_MIN; /* sets MSB */

2309

task_rq_unlock(rq, p, &flags);

2309

task_rq_unlock(rq, p, &flags);

2310

2311

/*

2311

/*

2312

* If it changed from the expected state, bail out now.

2312

* If it changed from the expected state, bail out now.

2313

*/

2313

*/

2314

if (unlikely(!ncsw))

2314

if (unlikely(!ncsw))

2315

break;

2315

break;

2316

2317

/*

2317

/*

2318

* Was it really running after all now that we

2318

* Was it really running after all now that we

2319

* checked with the proper locks actually held?

2319

* checked with the proper locks actually held?

2320

*

2320

*

2321

* Oops. Go back and try again..

2321

* Oops. Go back and try again..

2322

*/

2322

*/

2323

if (unlikely(running)) {

2323

if (unlikely(running)) {

2324

cpu_relax();

2324

cpu_relax();

2325

continue;

2325

continue;

2326

}

2326

}

2327

2328

/*

2328

/*

2329

* It's not enough that it's not actively running,

2329

* It's not enough that it's not actively running,

2330

* it must be off the runqueue _entirely_, and not

2330

* it must be off the runqueue _entirely_, and not

2331

* preempted!

2331

* preempted!

2332

*

2332

*

2333

* So if it was still runnable (but just not actively

2333

* So if it was still runnable (but just not actively

2334

* running right now), it's preempted, and we should

2334

* running right now), it's preempted, and we should

2335

* yield - it could be a while.

2335

* yield - it could be a while.

2336

*/

2336

*/

2337

if (unlikely(on_rq)) {

2337

if (unlikely(on_rq)) {

2338

ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);

2338

ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);

2339

2340

set_current_state(TASK_UNINTERRUPTIBLE);

2340

set_current_state(TASK_UNINTERRUPTIBLE);

2341

schedule_hrtimeout(&to, HRTIMER_MODE_REL);

2341

schedule_hrtimeout(&to, HRTIMER_MODE_REL);

2342

continue;

2342

continue;

2343

}

2343

}

2344

2345

/*

2345

/*

2346

* Ahh, all good. It wasn't running, and it wasn't

2346

* Ahh, all good. It wasn't running, and it wasn't

2347

* runnable, which means that it will never become

2347

* runnable, which means that it will never become

2348

* running in the future either. We're all done!

2348

* running in the future either. We're all done!

2349

*/

2349

*/

2350

break;

2350

break;

2351

}

2351

}

2352

2353

return ncsw;

2353

return ncsw;

2354

}

2354

}

2355

2356

/***
 * kick_process - force a thread running elsewhere through the kernel
 * @p: the thread to kick
 *
 * Makes a process that is currently running on another CPU enter
 * kernel mode without delay (so that signals get handled).
 *
 * NOTE: the runqueue lock is deliberately not taken.  All that matters
 * is that the remote task enters the kernel; if the IPI races with a
 * migration of the task to another CPU, nothing is harmed and the goal
 * has been reached anyway.
 */
void kick_process(struct task_struct *p)
{
	int target_cpu;

	preempt_disable();
	target_cpu = task_cpu(p);
	/* Only a task that is current on a *different* CPU needs an IPI. */
	if (target_cpu != smp_processor_id()) {
		if (task_curr(p))
			smp_send_reschedule(target_cpu);
	}
	preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);

2380

#endif /* CONFIG_SMP */

2380

#endif /* CONFIG_SMP */

2381

2382

#ifdef CONFIG_SMP

2382

#ifdef CONFIG_SMP

2383

/*

2383

/*

2384

* ->cpus_allowed is protected by both rq->lock and p->pi_lock

2384

* ->cpus_allowed is protected by both rq->lock and p->pi_lock

2385

*/

2385

*/

2386

static int select_fallback_rq(int cpu, struct task_struct *p)

2386

static int select_fallback_rq(int cpu, struct task_struct *p)

2387

{

2387

{

2388

int dest_cpu;

2388

int dest_cpu;

2389

const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));

2389

const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));

2390

2391

/* Look for allowed, online CPU in same node. */

2391

/* Look for allowed, online CPU in same node. */

2392

for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)

2392

for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)

2393

if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))

2393

if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))

2394

return dest_cpu;

2394

return dest_cpu;

2395

2396

/* Any allowed, online CPU? */

2396

/* Any allowed, online CPU? */

2397

dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);

2397

dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);

2398

if (dest_cpu < nr_cpu_ids)

2398

if (dest_cpu < nr_cpu_ids)

2399

return dest_cpu;

2399

return dest_cpu;

2400

2401

/* No more Mr. Nice Guy. */

2401

/* No more Mr. Nice Guy. */

2402

dest_cpu = cpuset_cpus_allowed_fallback(p);

2402

dest_cpu = cpuset_cpus_allowed_fallback(p);

2403

/*

2403

/*

2404

* Don't tell them about moving exiting tasks or

2404

* Don't tell them about moving exiting tasks or

2405

* kernel threads (both mm NULL), since they never

2405

* kernel threads (both mm NULL), since they never

2406

* leave kernel.

2406

* leave kernel.

2407

*/

2407

*/

2408

if (p->mm && printk_ratelimit()) {

2408

if (p->mm && printk_ratelimit()) {

2409

printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",

2409

printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",

2410

task_pid_nr(p), p->comm, cpu);

2410

task_pid_nr(p), p->comm, cpu);

2411

}

2411

}

2412

2413

return dest_cpu;

2413

return dest_cpu;

2414

}

2414

}

2415

2416

/*

2416

/*

2417

* The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.

2417

* The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.

2418

*/

2418

*/

2419

static inline

2419

static inline

2420

int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)

2420

int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)

2421

{

2421

{

2422

int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);

2422

int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);

2423

2424

/*

2424

/*

2425

* In order not to call set_task_cpu() on a blocking task we need

2425

* In order not to call set_task_cpu() on a blocking task we need

2426

* to rely on ttwu() to place the task on a valid ->cpus_allowed

2426

* to rely on ttwu() to place the task on a valid ->cpus_allowed

2427

* cpu.

2427

* cpu.

2428

*

2428

*

2429

* Since this is common to all placement strategies, this lives here.

2429

* Since this is common to all placement strategies, this lives here.

2430

*

2430

*

2431

* [ this allows ->select_task() to simply return task_cpu(p) and

2431

* [ this allows ->select_task() to simply return task_cpu(p) and

2432

* not worry about this generic constraint ]

2432

* not worry about this generic constraint ]

2433

*/

2433

*/

2434

if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||

2434

if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||

2435

!cpu_online(cpu)))

2435

!cpu_online(cpu)))

2436

cpu = select_fallback_rq(task_cpu(p), p);

2436

cpu = select_fallback_rq(task_cpu(p), p);

2437

2438

return cpu;

2438

return cpu;

2439

}

2439

}

2440

2441

/*
 * update_avg - fold a new sample into a running average.
 * @avg:    average to update in place
 * @sample: new observation
 *
 * Adds the signed difference (sample - *avg), shifted right by 3,
 * to *avg.
 */
static void update_avg(u64 *avg, u64 sample)
{
	s64 delta = sample - *avg;

	*avg = *avg + (delta >> 3);
}

2446

#endif

2446

#endif

2447

2448

/*
 * ttwu_stat - account schedstats for a wakeup of @p.
 * @p:          task that was woken
 * @cpu:        CPU the wakeup was directed at
 * @wake_flags: WF_* flags describing the wakeup
 *
 * The body is compiled only with CONFIG_SCHEDSTATS; without it the
 * function is empty.
 */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
#ifdef CONFIG_SCHEDSTATS
	struct rq *rq = this_rq();

#ifdef CONFIG_SMP
	int this_cpu = smp_processor_id();

	if (cpu != this_cpu) {
		struct sched_domain *sd;

		schedstat_inc(p, se.statistics.nr_wakeups_remote);

		/* Charge the first domain of this CPU that spans @cpu. */
		rcu_read_lock();
		for_each_domain(this_cpu, sd) {
			if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
				continue;

			schedstat_inc(sd, ttwu_wake_remote);
			break;
		}
		rcu_read_unlock();
	} else {
		schedstat_inc(rq, ttwu_local);
		schedstat_inc(p, se.statistics.nr_wakeups_local);
	}

	if (wake_flags & WF_MIGRATED)
		schedstat_inc(p, se.statistics.nr_wakeups_migrate);

#endif /* CONFIG_SMP */

	/* Totals, counted for every wakeup. */
	schedstat_inc(rq, ttwu_count);
	schedstat_inc(p, se.statistics.nr_wakeups);

	if (wake_flags & WF_SYNC)
		schedstat_inc(p, se.statistics.nr_wakeups_sync);

#endif /* CONFIG_SCHEDSTATS */
}

2487

2488

static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)

2488

static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)

2489

{

2489

{

2490

activate_task(rq, p, en_flags);

2490

activate_task(rq, p, en_flags);

2491

p->on_rq = 1;

2491

p->on_rq = 1;

2492

2493

/* if a worker is waking up, notify workqueue */

2493

/* if a worker is waking up, notify workqueue */

2494

if (p->flags & PF_WQ_WORKER)

2494

if (p->flags & PF_WQ_WORKER)

2495

wq_worker_waking_up(p, cpu_of(rq));

2495

wq_worker_waking_up(p, cpu_of(rq));

2496

}

2496

}

2497

2498

/*

2498

/*

2499

* Mark the task runnable and perform wakeup-preemption.

2499

* Mark the task runnable and perform wakeup-preemption.

2500

*/

2500

*/

2501

static void

2501

static void

2502

ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)

2502

ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)

2503

{

2503

{

2504

trace_sched_wakeup(p, true);

2504

trace_sched_wakeup(p, true);

2505

check_preempt_curr(rq, p, wake_flags);

2505

check_preempt_curr(rq, p, wake_flags);

2506

2507

p->state = TASK_RUNNING;

2507

p->state = TASK_RUNNING;

2508

#ifdef CONFIG_SMP

2508

#ifdef CONFIG_SMP

2509

if (p->sched_class->task_woken)

2509

if (p->sched_class->task_woken)

2510

p->sched_class->task_woken(rq, p);

2510

p->sched_class->task_woken(rq, p);

2511

2512

if (rq->idle_stamp) {

2512

if (rq->idle_stamp) {

2513

u64 delta = rq->clock - rq->idle_stamp;

2513

u64 delta = rq->clock - rq->idle_stamp;

2514

u64 max = 2*sysctl_sched_migration_cost;

2514

u64 max = 2*sysctl_sched_migration_cost;

2515

2516

if (delta > max)

2516

if (delta > max)

2517

rq->avg_idle = max;

2517

rq->avg_idle = max;

2518

else

2518

else

2519

update_avg(&rq->avg_idle, delta);

2519

update_avg(&rq->avg_idle, delta);

2520

rq->idle_stamp = 0;

2520

rq->idle_stamp = 0;

2521

}

2521

}

2522

#endif

2522

#endif

2523

}

2523

}

2524

2525

static void

2525

static void

2526

ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)

2526

ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)

2527

{

2527

{

2528

#ifdef CONFIG_SMP

2528

#ifdef CONFIG_SMP

2529

if (p->sched_contributes_to_load)

2529

if (p->sched_contributes_to_load)

2530

rq->nr_uninterruptible--;

2530

rq->nr_uninterruptible--;

2531

#endif

2531

#endif

2532

2533

ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);

2533

ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);

2534

ttwu_do_wakeup(rq, p, wake_flags);

2534

ttwu_do_wakeup(rq, p, wake_flags);

2535

}

2535

}

2536

2537

/*

2537

/*

2538

* Called in case the task @p isn't fully descheduled from its runqueue,

2538

* Called in case the task @p isn't fully descheduled from its runqueue,

2539

* in this case we must do a remote wakeup. Its a 'light' wakeup though,

2539

* in this case we must do a remote wakeup. Its a 'light' wakeup though,

2540

* since all we need to do is flip p->state to TASK_RUNNING, since

2540

* since all we need to do is flip p->state to TASK_RUNNING, since

2541

* the task is still ->on_rq.

2541

* the task is still ->on_rq.

2542

*/

2542

*/

2543

static int ttwu_remote(struct task_struct *p, int wake_flags)

2543

static int ttwu_remote(struct task_struct *p, int wake_flags)

2544

{

2544

{

2545

struct rq *rq;

2545

struct rq *rq;

2546

int ret = 0;

2546

int ret = 0;

2547

2548

rq = __task_rq_lock(p);

2548

rq = __task_rq_lock(p);

2549

if (p->on_rq) {

2549

if (p->on_rq) {

2550

ttwu_do_wakeup(rq, p, wake_flags);

2550

ttwu_do_wakeup(rq, p, wake_flags);

2551

ret = 1;

2551

ret = 1;

2552

}

2552

}

2553

__task_rq_unlock(rq);

2553

__task_rq_unlock(rq);

2554

2555

return ret;

2555

return ret;

2556

}

2556

}

2557

2558

#ifdef CONFIG_SMP

2558

#ifdef CONFIG_SMP

2559

static void sched_ttwu_do_pending(struct task_struct *list)

2559

static void sched_ttwu_do_pending(struct task_struct *list)

2560

{

2560

{

2561

struct rq *rq = this_rq();

2561

struct rq *rq = this_rq();

2562

2563

raw_spin_lock(&rq->lock);

2563

raw_spin_lock(&rq->lock);

2564

2565

while (list) {

2565

while (list) {

2566

struct task_struct *p = list;

2566

struct task_struct *p = list;

2567

list = list->wake_entry;

2567

list = list->wake_entry;

2568

ttwu_do_activate(rq, p, 0);

2568

ttwu_do_activate(rq, p, 0);

2569

}

2569

}

2570

2571

raw_spin_unlock(&rq->lock);

2571

raw_spin_unlock(&rq->lock);

2572

}

2572

}

2573

2574

#ifdef CONFIG_HOTPLUG_CPU

2574

#ifdef CONFIG_HOTPLUG_CPU

2575

2576

static void sched_ttwu_pending(void)

2576

static void sched_ttwu_pending(void)

2577

{

2577

{

2578

struct rq *rq = this_rq();

2578

struct rq *rq = this_rq();

2579

struct task_struct *list = xchg(&rq->wake_list, NULL);

2579

struct task_struct *list = xchg(&rq->wake_list, NULL);

2580

2581

if (!list)

2581

if (!list)

2582

return;

2582

return;

2583

2584

sched_ttwu_do_pending(list);

2584

sched_ttwu_do_pending(list);

2585

}

2585

}

2586

2587

#endif /* CONFIG_HOTPLUG_CPU */

2587

#endif /* CONFIG_HOTPLUG_CPU */

2588

2589

void scheduler_ipi(void)

2589

void scheduler_ipi(void)

2590

{

2590

{

2591

struct rq *rq = this_rq();

2591

struct rq *rq = this_rq();

2592

struct task_struct *list = xchg(&rq->wake_list, NULL);

2592

struct task_struct *list = xchg(&rq->wake_list, NULL);

2593

2594

if (!list)

2594

if (!list)

2595

return;

2595

return;

2596

2597

/*

2597

/*

2598

* Not all reschedule IPI handlers call irq_enter/irq_exit, since

2598

* Not all reschedule IPI handlers call irq_enter/irq_exit, since

2599

* traditionally all their work was done from the interrupt return

2599

* traditionally all their work was done from the interrupt return

2600

* path. Now that we actually do some work, we need to make sure

2600

* path. Now that we actually do some work, we need to make sure

2601

* we do call them.

2601

* we do call them.

2602

*

2602

*

2603

* Some archs already do call them, luckily irq_enter/exit nest

2603

* Some archs already do call them, luckily irq_enter/exit nest

2604

* properly.

2604

* properly.

2605

*

2605

*

2606

* Arguably we should visit all archs and update all handlers,

2606

* Arguably we should visit all archs and update all handlers,

2607

* however a fair share of IPIs are still resched only so this would

2607

* however a fair share of IPIs are still resched only so this would

2608

* somewhat pessimize the simple resched case.

2608

* somewhat pessimize the simple resched case.

2609

*/

2609

*/

2610

irq_enter();

2610

irq_enter();

2611

sched_ttwu_do_pending(list);

2611

sched_ttwu_do_pending(list);

2612

irq_exit();

2612

irq_exit();

2613

}

2613

}

2614

2615

static void ttwu_queue_remote(struct task_struct *p, int cpu)

2615

static void ttwu_queue_remote(struct task_struct *p, int cpu)

2616

{

2616

{

2617

struct rq *rq = cpu_rq(cpu);

2617

struct rq *rq = cpu_rq(cpu);

2618

struct task_struct *next = rq->wake_list;

2618

struct task_struct *next = rq->wake_list;

2619

2620

for (;;) {

2620

for (;;) {

2621

struct task_struct *old = next;

2621

struct task_struct *old = next;

2622

2623

p->wake_entry = next;

2623

p->wake_entry = next;

2624

next = cmpxchg(&rq->wake_list, old, p);

2624

next = cmpxchg(&rq->wake_list, old, p);

2625

if (next == old)

2625

if (next == old)

2626

break;

2626

break;

2627

}

2627

}

2628

2629

if (!next)

2629

if (!next)

2630

smp_send_reschedule(cpu);

2630

smp_send_reschedule(cpu);

2631

}

2631

}

2632

2633

#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW

2633

#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW

2634

static int ttwu_activate_remote(struct task_struct *p, int wake_flags)

2634

static int ttwu_activate_remote(struct task_struct *p, int wake_flags)

2635

{

2635

{

2636

struct rq *rq;

2636

struct rq *rq;

2637

int ret = 0;

2637

int ret = 0;

2638

2639

rq = __task_rq_lock(p);

2639

rq = __task_rq_lock(p);

2640

if (p->on_cpu) {

2640

if (p->on_cpu) {

2641

ttwu_activate(rq, p, ENQUEUE_WAKEUP);

2641

ttwu_activate(rq, p, ENQUEUE_WAKEUP);

2642

ttwu_do_wakeup(rq, p, wake_flags);

2642

ttwu_do_wakeup(rq, p, wake_flags);

2643

ret = 1;

2643

ret = 1;

2644

}

2644

}

2645

__task_rq_unlock(rq);

2645

__task_rq_unlock(rq);

2646

2647

return ret;

2647

return ret;

2648

2649

}

2649

}

2650

#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */

2650

#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */

2651

#endif /* CONFIG_SMP */

2651

#endif /* CONFIG_SMP */

2652

2653

static void ttwu_queue(struct task_struct *p, int cpu)

2653

static void ttwu_queue(struct task_struct *p, int cpu)

2654

{

2654

{

2655

struct rq *rq = cpu_rq(cpu);

2655

struct rq *rq = cpu_rq(cpu);

2656

2657

#if defined(CONFIG_SMP)

2657

#if defined(CONFIG_SMP)

2658

if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {

2658

if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {

2659

sched_clock_cpu(cpu); /* sync clocks x-cpu */

2659

sched_clock_cpu(cpu); /* sync clocks x-cpu */

2660

ttwu_queue_remote(p, cpu);

2660

ttwu_queue_remote(p, cpu);

2661

return;

2661

return;

2662

}

2662

}

2663

#endif

2663

#endif

2664

2665

raw_spin_lock(&rq->lock);

2665

raw_spin_lock(&rq->lock);

2666

ttwu_do_activate(rq, p, 0);

2666

ttwu_do_activate(rq, p, 0);

2667

raw_spin_unlock(&rq->lock);

2667

raw_spin_unlock(&rq->lock);

2668

}

2668

}

2669

2670

/**

2670

/**

2671

* try_to_wake_up - wake up a thread

2671

* try_to_wake_up - wake up a thread

2672

* @p: the thread to be awakened

2672

* @p: the thread to be awakened

2673

* @state: the mask of task states that can be woken

2673

* @state: the mask of task states that can be woken

2674

* @wake_flags: wake modifier flags (WF_*)

2674

* @wake_flags: wake modifier flags (WF_*)

2675

*

2675

*

2676

* Put it on the run-queue if it's not already there. The "current"

2676

* Put it on the run-queue if it's not already there. The "current"

2677

* thread is always on the run-queue (except when the actual

2677

* thread is always on the run-queue (except when the actual

2678

* re-schedule is in progress), and as such you're allowed to do

2678

* re-schedule is in progress), and as such you're allowed to do

2679

* the simpler "current->state = TASK_RUNNING" to mark yourself

2679

* the simpler "current->state = TASK_RUNNING" to mark yourself

2680

* runnable without the overhead of this.

2680

* runnable without the overhead of this.

2681

*

2681

*

2682

* Returns %true if @p was woken up, %false if it was already running

2682

* Returns %true if @p was woken up, %false if it was already running

2683

* or @state didn't match @p's state.

2683

* or @state didn't match @p's state.

2684

*/

2684

*/

2685

static int

2685

static int

2686

try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)

2686

try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)

2687

{

2687

{

2688

unsigned long flags;

2688

unsigned long flags;

2689

int cpu, success = 0;

2689

int cpu, success = 0;

2690

2691

smp_wmb();

2691

smp_wmb();

2692

raw_spin_lock_irqsave(&p->pi_lock, flags);

2692

raw_spin_lock_irqsave(&p->pi_lock, flags);

2693

if (!(p->state & state))

2693

if (!(p->state & state))

2694

goto out;

2694

goto out;

2695

2696

success = 1; /* we're going to change ->state */

2696

success = 1; /* we're going to change ->state */

2697

cpu = task_cpu(p);

2697

cpu = task_cpu(p);

2698

2699

if (p->on_rq && ttwu_remote(p, wake_flags))

2699

if (p->on_rq && ttwu_remote(p, wake_flags))

2700

goto stat;

2700

goto stat;

2701

2702

#ifdef CONFIG_SMP

2702

#ifdef CONFIG_SMP

2703

/*

2703

/*

2704

* If the owning (remote) cpu is still in the middle of schedule() with

2704

* If the owning (remote) cpu is still in the middle of schedule() with

2705

* this task as prev, wait until its done referencing the task.

2705

* this task as prev, wait until its done referencing the task.

2706

*/

2706

*/

2707

while (p->on_cpu) {

2707

while (p->on_cpu) {

2708

#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW

2708

#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW

2709

/*

2709

/*

2710

* In case the architecture enables interrupts in

2710

* In case the architecture enables interrupts in

2711

* context_switch(), we cannot busy wait, since that

2711

* context_switch(), we cannot busy wait, since that

2712

* would lead to deadlocks when an interrupt hits and

2712

* would lead to deadlocks when an interrupt hits and

2713

* tries to wake up @prev. So bail and do a complete

2713

* tries to wake up @prev. So bail and do a complete

2714

* remote wakeup.

2714

* remote wakeup.

2715

*/

2715

*/

2716

if (ttwu_activate_remote(p, wake_flags))

2716

if (ttwu_activate_remote(p, wake_flags))

2717

goto stat;

2717

goto stat;

2718

#else

2718

#else

2719

cpu_relax();

2719

cpu_relax();

2720

#endif

2720

#endif

2721

}

2721

}

2722

/*

2722

/*

2723

* Pairs with the smp_wmb() in finish_lock_switch().

2723

* Pairs with the smp_wmb() in finish_lock_switch().

2724

*/

2724

*/

2725

smp_rmb();

2725

smp_rmb();

2726

2727

p->sched_contributes_to_load = !!task_contributes_to_load(p);

2727

p->sched_contributes_to_load = !!task_contributes_to_load(p);

2728

p->state = TASK_WAKING;

2728

p->state = TASK_WAKING;

2729

2730

if (p->sched_class->task_waking)

2730

if (p->sched_class->task_waking)

2731

p->sched_class->task_waking(p);

2731

p->sched_class->task_waking(p);

2732

2733

cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);

2733

cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);

2734

if (task_cpu(p) != cpu) {

2734

if (task_cpu(p) != cpu) {

2735

wake_flags |= WF_MIGRATED;

2735

wake_flags |= WF_MIGRATED;

2736

set_task_cpu(p, cpu);

2736

set_task_cpu(p, cpu);

2737

}

2737

}

2738

#endif /* CONFIG_SMP */

2738

#endif /* CONFIG_SMP */

2739

2740

ttwu_queue(p, cpu);

2740

ttwu_queue(p, cpu);

2741

stat:

2741

stat:

2742

ttwu_stat(p, cpu, wake_flags);

2742

ttwu_stat(p, cpu, wake_flags);

2743

out:

2743

out:

2744

raw_spin_unlock_irqrestore(&p->pi_lock, flags);

2744

raw_spin_unlock_irqrestore(&p->pi_lock, flags);

2745

2746

return success;

2746

return success;

2747

}

2747

}

2748

2749

/**

2749

/**

2750

* try_to_wake_up_local - try to wake up a local task with rq lock held

2750

* try_to_wake_up_local - try to wake up a local task with rq lock held

2751

* @p: the thread to be awakened

2751

* @p: the thread to be awakened

2752

*

2752

*

2753

* Put @p on the run-queue if it's not already there. The caller must

2753

* Put @p on the run-queue if it's not already there. The caller must

2754

* ensure that this_rq() is locked, @p is bound to this_rq() and not

2754

* ensure that this_rq() is locked, @p is bound to this_rq() and not

2755

* the current task.

2755

* the current task.

2756

*/

2756

*/

2757

static void try_to_wake_up_local(struct task_struct *p)

2757

static void try_to_wake_up_local(struct task_struct *p)

2758

{

2758

{

2759

struct rq *rq = task_rq(p);

2759

struct rq *rq = task_rq(p);

2760

2761

BUG_ON(rq != this_rq());

2761

BUG_ON(rq != this_rq());

2762

BUG_ON(p == current);

2762

BUG_ON(p == current);

2763

lockdep_assert_held(&rq->lock);

2763

lockdep_assert_held(&rq->lock);

2764

2765

if (!raw_spin_trylock(&p->pi_lock)) {

2765

if (!raw_spin_trylock(&p->pi_lock)) {

2766

raw_spin_unlock(&rq->lock);

2766

raw_spin_unlock(&rq->lock);

2767

raw_spin_lock(&p->pi_lock);

2767

raw_spin_lock(&p->pi_lock);

2768

raw_spin_lock(&rq->lock);

2768

raw_spin_lock(&rq->lock);

2769

}

2769

}

2770

2771

if (!(p->state & TASK_NORMAL))

2771

if (!(p->state & TASK_NORMAL))

2772

goto out;

2772

goto out;

2773

2774

if (!p->on_rq)

2774

if (!p->on_rq)

2775

ttwu_activate(rq, p, ENQUEUE_WAKEUP);

2775

ttwu_activate(rq, p, ENQUEUE_WAKEUP);

2776

2777

ttwu_do_wakeup(rq, p, 0);

2777

ttwu_do_wakeup(rq, p, 0);

2778

ttwu_stat(p, smp_processor_id(), 0);

2778

ttwu_stat(p, smp_processor_id(), 0);

2779

out:

2779

out:

2780

raw_spin_unlock(&p->pi_lock);

2780

raw_spin_unlock(&p->pi_lock);

2781

}

2781

}

2782

2783

/**

2783

/**

2784

* wake_up_process - Wake up a specific process

2784

* wake_up_process - Wake up a specific process

2785

* @p: The process to be woken up.

2785

* @p: The process to be woken up.

2786

*

2786

*

2787

* Attempt to wake up the nominated process and move it to the set of runnable

2787

* Attempt to wake up the nominated process and move it to the set of runnable

2788

* processes. Returns 1 if the process was woken up, 0 if it was already

2788

* processes. Returns 1 if the process was woken up, 0 if it was already

2789

* running.

2789

* running.

2790

*

2790

*

2791

* It may be assumed that this function implies a write memory barrier before

2791

* It may be assumed that this function implies a write memory barrier before

2792

* changing the task state if and only if any tasks are woken up.

2792

* changing the task state if and only if any tasks are woken up.

2793

*/

2793

*/

2794

int wake_up_process(struct task_struct *p)

2794

int wake_up_process(struct task_struct *p)

2795

{

2795

{

2796

return try_to_wake_up(p, TASK_ALL, 0);

2796

return try_to_wake_up(p, TASK_ALL, 0);

2797

}

2797

}

2798

EXPORT_SYMBOL(wake_up_process);

2798

EXPORT_SYMBOL(wake_up_process);

2799

2800

/*
 * wake_up_state - wake @p only if it sleeps in one of the given states.
 * @p:     the task to wake
 * @state: mask of task states that may be woken
 *
 * Thin wrapper that hands @state to try_to_wake_up() with no wake flags
 * and returns its result unchanged.
 */
int wake_up_state(struct task_struct *p, unsigned int state)
{
	int woken = try_to_wake_up(p, state, 0);

	return woken;
}

2804

2805

/*

2805

/*

2806

* Perform scheduler related setup for a newly forked process p.

2806

* Perform scheduler related setup for a newly forked process p.

2807

* p is forked by current.

2807

* p is forked by current.

2808

*

2808

*

2809

* __sched_fork() is basic setup used by init_idle() too:

2809

* __sched_fork() is basic setup used by init_idle() too:

2810

*/

2810

*/

2811

static void __sched_fork(struct task_struct *p)

2811

static void __sched_fork(struct task_struct *p)

2812

{

2812

{

2813

p->on_rq = 0;

2813

p->on_rq = 0;

2814

2815

p->se.on_rq = 0;

2815

p->se.on_rq = 0;

2816

p->se.exec_start = 0;

2816

p->se.exec_start = 0;

2817

p->se.sum_exec_runtime = 0;

2817

p->se.sum_exec_runtime = 0;

2818

p->se.prev_sum_exec_runtime = 0;

2818

p->se.prev_sum_exec_runtime = 0;

2819

p->se.nr_migrations = 0;

2819

p->se.nr_migrations = 0;

2820

p->se.vruntime = 0;

2820

p->se.vruntime = 0;

2821

INIT_LIST_HEAD(&p->se.group_node);

2821

INIT_LIST_HEAD(&p->se.group_node);

2822

2823

#ifdef CONFIG_SCHEDSTATS

2823

#ifdef CONFIG_SCHEDSTATS

2824

memset(&p->se.statistics, 0, sizeof(p->se.statistics));

2824

memset(&p->se.statistics, 0, sizeof(p->se.statistics));

2825

#endif

2825

#endif

2826

2827

INIT_LIST_HEAD(&p->rt.run_list);

2827

INIT_LIST_HEAD(&p->rt.run_list);

2828

2829

#ifdef CONFIG_PREEMPT_NOTIFIERS

2829

#ifdef CONFIG_PREEMPT_NOTIFIERS

2830

INIT_HLIST_HEAD(&p->preempt_notifiers);

2830

INIT_HLIST_HEAD(&p->preempt_notifiers);

2831

#endif

2831

#endif

2832

}

2832

}

2833

2834

/*

2834

/*

2835

* fork()/clone()-time setup:

2835

* fork()/clone()-time setup:

2836

*/

2836

*/

2837

void sched_fork(struct task_struct *p)

2837

void sched_fork(struct task_struct *p)

2838

{

2838

{

2839

unsigned long flags;

2839

unsigned long flags;

2840

int cpu = get_cpu();

2840

int cpu = get_cpu();

2841

2842

__sched_fork(p);

2842

__sched_fork(p);

2843

/*

2843

/*

2844

* We mark the process as running here. This guarantees that

2844

* We mark the process as running here. This guarantees that

2845

* nobody will actually run it, and a signal or other external

2845

* nobody will actually run it, and a signal or other external

2846

* event cannot wake it up and insert it on the runqueue either.

2846

* event cannot wake it up and insert it on the runqueue either.

2847

*/

2847

*/

2848

p->state = TASK_RUNNING;

2848

p->state = TASK_RUNNING;

2849

2850

/*

2850

/*

2851

* Revert to default priority/policy on fork if requested.

2851

* Revert to default priority/policy on fork if requested.

2852

*/

2852

*/

2853

if (unlikely(p->sched_reset_on_fork)) {

2853

if (unlikely(p->sched_reset_on_fork)) {

2854

if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {

2854

if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {

2855

p->policy = SCHED_NORMAL;

2855

p->policy = SCHED_NORMAL;

2856

p->normal_prio = p->static_prio;

2856

p->normal_prio = p->static_prio;

2857

}

2857

}

2858

2859

if (PRIO_TO_NICE(p->static_prio) < 0) {

2859

if (PRIO_TO_NICE(p->static_prio) < 0) {

2860

p->static_prio = NICE_TO_PRIO(0);

2860

p->static_prio = NICE_TO_PRIO(0);

2861

p->normal_prio = p->static_prio;

2861

p->normal_prio = p->static_prio;

2862

set_load_weight(p);

2862

set_load_weight(p);

2863

}

2863

}

2864

2865

/*

2865

/*

2866

* We don't need the reset flag anymore after the fork. It has

2866

* We don't need the reset flag anymore after the fork. It has

2867

* fulfilled its duty:

2867

* fulfilled its duty:

2868

*/

2868

*/

2869

p->sched_reset_on_fork = 0;

2869

p->sched_reset_on_fork = 0;

2870

}

2870

}

2871

2872

/*

2872

/*

2873

* Make sure we do not leak PI boosting priority to the child.

2873

* Make sure we do not leak PI boosting priority to the child.

2874

*/

2874

*/

2875

p->prio = current->normal_prio;

2875

p->prio = current->normal_prio;

2876

2877

if (!rt_prio(p->prio))

2877

if (!rt_prio(p->prio))

2878

p->sched_class = &fair_sched_class;

2878

p->sched_class = &fair_sched_class;

2879

2880

if (p->sched_class->task_fork)

2880

if (p->sched_class->task_fork)

2881

p->sched_class->task_fork(p);

2881

p->sched_class->task_fork(p);

2882

2883

/*

2883

/*

2884

* The child is not yet in the pid-hash so no cgroup attach races,

2884

* The child is not yet in the pid-hash so no cgroup attach races,

2885

* and the cgroup is pinned to this child due to cgroup_fork()

2885

* and the cgroup is pinned to this child due to cgroup_fork()

2886

* is ran before sched_fork().

2886

* is ran before sched_fork().

2887

*

2887

*

2888

* Silence PROVE_RCU.

2888

* Silence PROVE_RCU.

2889

*/

2889

*/

2890

raw_spin_lock_irqsave(&p->pi_lock, flags);

2890

raw_spin_lock_irqsave(&p->pi_lock, flags);

2891

set_task_cpu(p, cpu);

2891

set_task_cpu(p, cpu);

2892

raw_spin_unlock_irqrestore(&p->pi_lock, flags);

2892

raw_spin_unlock_irqrestore(&p->pi_lock, flags);

2893

2894

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)

2894

#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)

2895

if (likely(sched_info_on()))

2895

if (likely(sched_info_on()))

2896

memset(&p->sched_info, 0, sizeof(p->sched_info));

2896

memset(&p->sched_info, 0, sizeof(p->sched_info));

2897

#endif

2897

#endif

2898

#if defined(CONFIG_SMP)

2898

#if defined(CONFIG_SMP)

2899

p->on_cpu = 0;

2899

p->on_cpu = 0;

2900

#endif

2900

#endif

2901

#ifdef CONFIG_PREEMPT_COUNT

2901

#ifdef CONFIG_PREEMPT_COUNT

2902

/* Want to start with kernel preemption disabled. */

2902

/* Want to start with kernel preemption disabled. */

2903

task_thread_info(p)->preempt_count = 1;

2903

task_thread_info(p)->preempt_count = 1;

2904

#endif

2904

#endif

2905

#ifdef CONFIG_SMP

2905

#ifdef CONFIG_SMP

2906

plist_node_init(&p->pushable_tasks, MAX_PRIO);

2906

plist_node_init(&p->pushable_tasks, MAX_PRIO);

2907

#endif

2907

#endif

2908

2909

put_cpu();

2909

put_cpu();

2910

}

2910

}

2911

2912

/*

2912

/*

2913

* wake_up_new_task - wake up a newly created task for the first time.

2913

* wake_up_new_task - wake up a newly created task for the first time.

2914

*

2914

*

2915

* This function will do some initial scheduler statistics housekeeping

2915

* This function will do some initial scheduler statistics housekeeping

2916

* that must be done for every newly created context, then puts the task

2916

* that must be done for every newly created context, then puts the task

2917

* on the runqueue and wakes it.

2917

* on the runqueue and wakes it.

2918

*/

2918

*/

2919

void wake_up_new_task(struct task_struct *p)

2919

void wake_up_new_task(struct task_struct *p)

2920

{

2920

{

2921

unsigned long flags;

2921

unsigned long flags;

2922

struct rq *rq;

2922

struct rq *rq;

2923

2924

raw_spin_lock_irqsave(&p->pi_lock, flags);

2924

raw_spin_lock_irqsave(&p->pi_lock, flags);

2925

#ifdef CONFIG_SMP

2925

#ifdef CONFIG_SMP

2926

/*

2926

/*

2927

* Fork balancing, do it here and not earlier because:

2927

* Fork balancing, do it here and not earlier because:

2928

* - cpus_allowed can change in the fork path

2928

* - cpus_allowed can change in the fork path

2929

* - any previously selected cpu might disappear through hotplug

2929

* - any previously selected cpu might disappear through hotplug

2930

*/

2930

*/

2931

set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));

2931

set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));

2932

#endif

2932

#endif

2933

2934

rq = __task_rq_lock(p);

2934

rq = __task_rq_lock(p);

2935

activate_task(rq, p, 0);

2935

activate_task(rq, p, 0);

2936

p->on_rq = 1;

2936

p->on_rq = 1;

2937

trace_sched_wakeup_new(p, true);

2937

trace_sched_wakeup_new(p, true);

2938

check_preempt_curr(rq, p, WF_FORK);

2938

check_preempt_curr(rq, p, WF_FORK);

2939

#ifdef CONFIG_SMP

2939

#ifdef CONFIG_SMP

2940

if (p->sched_class->task_woken)

2940

if (p->sched_class->task_woken)

2941

p->sched_class->task_woken(rq, p);

2941

p->sched_class->task_woken(rq, p);

2942

#endif

2942

#endif

2943

task_rq_unlock(rq, p, &flags);

2943

task_rq_unlock(rq, p, &flags);

2944

}

2944

}

2945

2946

#ifdef CONFIG_PREEMPT_NOTIFIERS

2946

#ifdef CONFIG_PREEMPT_NOTIFIERS

2947

2948

/**

2948

/**

2949

* preempt_notifier_register - tell me when current is being preempted & rescheduled

2949

* preempt_notifier_register - tell me when current is being preempted & rescheduled

2950

* @notifier: notifier struct to register

2950

* @notifier: notifier struct to register

2951

*/

2951

*/

2952

void preempt_notifier_register(struct preempt_notifier *notifier)

2952

void preempt_notifier_register(struct preempt_notifier *notifier)

2953

{

2953

{

2954

hlist_add_head(&notifier->link, &current->preempt_notifiers);

2954

hlist_add_head(&notifier->link, &current->preempt_notifiers);

2955

}

2955

}

2956

EXPORT_SYMBOL_GPL(preempt_notifier_register);

2956

EXPORT_SYMBOL_GPL(preempt_notifier_register);

2957

2958

/**

2958

/**

2959

* preempt_notifier_unregister - no longer interested in preemption notifications

2959

* preempt_notifier_unregister - no longer interested in preemption notifications

2960

* @notifier: notifier struct to unregister

2960

* @notifier: notifier struct to unregister

2961

*

2961

*

2962

* This is safe to call from within a preemption notifier.

2962

* This is safe to call from within a preemption notifier.

2963

*/

2963

*/

2964

void preempt_notifier_unregister(struct preempt_notifier *notifier)

2964

void preempt_notifier_unregister(struct preempt_notifier *notifier)

2965

{

2965

{

2966

hlist_del(&notifier->link);

2966

hlist_del(&notifier->link);

2967

}

2967

}

2968

EXPORT_SYMBOL_GPL(preempt_notifier_unregister);

2968

EXPORT_SYMBOL_GPL(preempt_notifier_unregister);

2969

2970

static void fire_sched_in_preempt_notifiers(struct task_struct *curr)

2970

static void fire_sched_in_preempt_notifiers(struct task_struct *curr)

2971

{

2971

{

2972

struct preempt_notifier *notifier;

2972

struct preempt_notifier *notifier;

2973

struct hlist_node *node;

2973

struct hlist_node *node;

2974

2975

hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)

2975

hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)

2976

notifier->ops->sched_in(notifier, raw_smp_processor_id());

2976

notifier->ops->sched_in(notifier, raw_smp_processor_id());

2977

}

2977

}

2978

2979

static void

2979

static void

2980

fire_sched_out_preempt_notifiers(struct task_struct *curr,

2980

fire_sched_out_preempt_notifiers(struct task_struct *curr,

2981

struct task_struct *next)

2981

struct task_struct *next)

2982

{

2982

{

2983

struct preempt_notifier *notifier;

2983

struct preempt_notifier *notifier;

2984

struct hlist_node *node;

2984

struct hlist_node *node;

2985

2986

hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)

2986

hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)

2987

notifier->ops->sched_out(notifier, next);

2987

notifier->ops->sched_out(notifier, next);

2988

}

2988

}

2989

2990

#else /* !CONFIG_PREEMPT_NOTIFIERS */

2990

#else /* !CONFIG_PREEMPT_NOTIFIERS */

2991

2992

/* Without CONFIG_PREEMPT_NOTIFIERS there is nothing to notify. */
static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
	/* intentionally empty */
}

2995

2996

/* Without CONFIG_PREEMPT_NOTIFIERS there is nothing to notify. */
static void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
				 struct task_struct *next)
{
	/* intentionally empty */
}

3001

3002

#endif /* CONFIG_PREEMPT_NOTIFIERS */

3002

#endif /* CONFIG_PREEMPT_NOTIFIERS */

3003

3004

/**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch
 * @prev: the current task that is being switched out
 * @next: the task we are going to switch to.
 *
 * Called with the rq lock held and interrupts off.  Every call must be
 * paired with a later finish_task_switch() once the context switch has
 * happened.
 *
 * Sets up locking for the switch and calls the architecture-specific
 * hooks, after running the accounting, perf and preempt-notifier
 * "sched out" steps for @prev.
 */
static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
{
	/* Accounting and notification for the outgoing task. */
	sched_info_switch(prev, next);
	perf_event_task_sched_out(prev, next);
	fire_sched_out_preempt_notifiers(prev, next);

	/* Locking and architecture preparation for the incoming task. */
	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);

	trace_sched_switch(prev, next);
}

3028

3029

/**

3029

/**

3030

* finish_task_switch - clean up after a task-switch

3030

* finish_task_switch - clean up after a task-switch

3031

* @rq: runqueue associated with task-switch

3031

* @rq: runqueue associated with task-switch

3032

* @prev: the thread we just switched away from.

3032

* @prev: the thread we just switched away from.

3033

*

3033

*

3034

* finish_task_switch must be called after the context switch, paired

3034

* finish_task_switch must be called after the context switch, paired

3035

* with a prepare_task_switch call before the context switch.

3035

* with a prepare_task_switch call before the context switch.

3036

* finish_task_switch will reconcile locking set up by prepare_task_switch,

3036

* finish_task_switch will reconcile locking set up by prepare_task_switch,

3037

* and do any other architecture-specific cleanup actions.

3037

* and do any other architecture-specific cleanup actions.

3038

*

3038

*

3039

* Note that we may have delayed dropping an mm in context_switch(). If

3039

* Note that we may have delayed dropping an mm in context_switch(). If

3040

* so, we finish that here outside of the runqueue lock. (Doing it

3040

* so, we finish that here outside of the runqueue lock. (Doing it

3041

* with the lock held can cause deadlocks; see schedule() for

3041

* with the lock held can cause deadlocks; see schedule() for

3042

* details.)

3042

* details.)

3043

*/

3043

*/

3044

static void finish_task_switch(struct rq *rq, struct task_struct *prev)

3044

static void finish_task_switch(struct rq *rq, struct task_struct *prev)

3045

__releases(rq->lock)

3045

__releases(rq->lock)

3046

{

3046

{

3047

struct mm_struct *mm = rq->prev_mm;

3047

struct mm_struct *mm = rq->prev_mm;

3048

long prev_state;

3048

long prev_state;

3049

3050

rq->prev_mm = NULL;

3050

rq->prev_mm = NULL;

3051

3052

/*

3052

/*

3053

* A task struct has one reference for the use as "current".

3053

* A task struct has one reference for the use as "current".

3054

* If a task dies, then it sets TASK_DEAD in tsk->state and calls

3054

* If a task dies, then it sets TASK_DEAD in tsk->state and calls

3055

* schedule one last time. The schedule call will never return, and

3055

* schedule one last time. The schedule call will never return, and

3056

* the scheduled task must drop that reference.

3056

* the scheduled task must drop that reference.

3057

* The test for TASK_DEAD must occur while the runqueue locks are

3057

* The test for TASK_DEAD must occur while the runqueue locks are

3058

* still held, otherwise prev could be scheduled on another cpu, die

3058

* still held, otherwise prev could be scheduled on another cpu, die

3059

* there before we look at prev->state, and then the reference would

3059

* there before we look at prev->state, and then the reference would

3060

* be dropped twice.

3060

* be dropped twice.

3061

* Manfred Spraul <manfred@colorfullife.com>

3061

* Manfred Spraul <manfred@colorfullife.com>

3062

*/

3062

*/

3063

prev_state = prev->state;

3063

prev_state = prev->state;

3064

finish_arch_switch(prev);

3064

finish_arch_switch(prev);

3065

#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW

3065

#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW

3066

local_irq_disable();

3066

local_irq_disable();

3067

#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */

3067

#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */

3068

perf_event_task_sched_in(prev, current);

3068

perf_event_task_sched_in(prev, current);

3069

#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW

3069

#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW

3070

local_irq_enable();

3070

local_irq_enable();

3071

#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */

3071

#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */

3072

finish_lock_switch(rq, prev);

3072

finish_lock_switch(rq, prev);

3073

3074

fire_sched_in_preempt_notifiers(current);

3074

fire_sched_in_preempt_notifiers(current);

3075

if (mm)

3075

if (mm)

3076

mmdrop(mm);

3076

mmdrop(mm);

3077

if (unlikely(prev_state == TASK_DEAD)) {

3077

if (unlikely(prev_state == TASK_DEAD)) {

3078

/*

3078

/*

3079

* Remove function-return probe instances associated with this

3079

* Remove function-return probe instances associated with this

3080

* task and put them back on the free list.

3080

* task and put them back on the free list.

3081

*/

3081

*/

3082

kprobe_flush_task(prev);

3082

kprobe_flush_task(prev);

3083

put_task_struct(prev);

3083

put_task_struct(prev);

3084

}

3084

}

3085

}

3085

}

3086

3087

#ifdef CONFIG_SMP

3087

#ifdef CONFIG_SMP

3088

3089

/* assumes rq->lock is held */

3089

/* assumes rq->lock is held */

3090

static inline void pre_schedule(struct rq *rq, struct task_struct *prev)

3090

static inline void pre_schedule(struct rq *rq, struct task_struct *prev)

3091

{

3091

{

3092

if (prev->sched_class->pre_schedule)

3092

if (prev->sched_class->pre_schedule)

3093

prev->sched_class->pre_schedule(rq, prev);

3093

prev->sched_class->pre_schedule(rq, prev);

3094

}

3094

}

3095

3096

/* rq->lock is NOT held, but preemption is disabled */

3096

/* rq->lock is NOT held, but preemption is disabled */

3097

static inline void post_schedule(struct rq *rq)

3097

static inline void post_schedule(struct rq *rq)

3098

{

3098

{

3099

if (rq->post_schedule) {

3099

if (rq->post_schedule) {

3100

unsigned long flags;

3100

unsigned long flags;

3101

3102

raw_spin_lock_irqsave(&rq->lock, flags);

3102

raw_spin_lock_irqsave(&rq->lock, flags);

3103

if (rq->curr->sched_class->post_schedule)

3103

if (rq->curr->sched_class->post_schedule)

3104

rq->curr->sched_class->post_schedule(rq);

3104

rq->curr->sched_class->post_schedule(rq);

3105

raw_spin_unlock_irqrestore(&rq->lock, flags);

3105

raw_spin_unlock_irqrestore(&rq->lock, flags);

3106

3107

rq->post_schedule = 0;

3107

rq->post_schedule = 0;

3108

}

3108

}

3109

}

3109

}

3110

3111

#else

3111

#else

3112

3113

/* !CONFIG_SMP: there is no pre-schedule work to do. */
static inline void pre_schedule(struct rq *rq, struct task_struct *p)
{
	/* intentionally empty */
}

3116

3117

/* !CONFIG_SMP: there is no post-schedule work to do. */
static inline void post_schedule(struct rq *rq)
{
	/* intentionally empty */
}

3120

3121

#endif

3121

#endif

3122

3123

/**

3123

/**

3124

* schedule_tail - first thing a freshly forked thread must call.

3124

* schedule_tail - first thing a freshly forked thread must call.

3125

* @prev: the thread we just switched away from.

3125

* @prev: the thread we just switched away from.

3126

*/

3126

*/

3127

asmlinkage void schedule_tail(struct task_struct *prev)

3127

asmlinkage void schedule_tail(struct task_struct *prev)

3128

__releases(rq->lock)

3128

__releases(rq->lock)

3129

{

3129

{

3130

struct rq *rq = this_rq();

3130

struct rq *rq = this_rq();

3131

3132

finish_task_switch(rq, prev);

3132

finish_task_switch(rq, prev);

3133

3134

/*

3134

/*

3135

* FIXME: do we need to worry about rq being invalidated by the

3135

* FIXME: do we need to worry about rq being invalidated by the

3136

* task_switch?

3136

* task_switch?

3137

*/

3137

*/

3138

post_schedule(rq);

3138

post_schedule(rq);

3139

3140

#ifdef __ARCH_WANT_UNLOCKED_CTXSW

3140

#ifdef __ARCH_WANT_UNLOCKED_CTXSW

3141

/* In this case, finish_task_switch does not reenable preemption */

3141

/* In this case, finish_task_switch does not reenable preemption */

3142

preempt_enable();

3142

preempt_enable();

3143

#endif

3143

#endif

3144

if (current->set_child_tid)

3144

if (current->set_child_tid)

3145

put_user(task_pid_vnr(current), current->set_child_tid);

3145

put_user(task_pid_vnr(current), current->set_child_tid);

3146

}

3146

}

3147

3148

/*

3148

/*

3149

* context_switch - switch to the new MM and the new

3149

* context_switch - switch to the new MM and the new

3150

* thread's register state.

3150

* thread's register state.

3151

*/

3151

*/

3152

static inline void

3152

static inline void

3153

context_switch(struct rq *rq, struct task_struct *prev,

3153

context_switch(struct rq *rq, struct task_struct *prev,

3154

struct task_struct *next)

3154

struct task_struct *next)

3155

{

3155

{

3156

struct mm_struct *mm, *oldmm;

3156

struct mm_struct *mm, *oldmm;

3157

3158

prepare_task_switch(rq, prev, next);

3158

prepare_task_switch(rq, prev, next);

3159

3160

mm = next->mm;

3160

mm = next->mm;

3161

oldmm = prev->active_mm;

3161

oldmm = prev->active_mm;

3162

/*

3162

/*

3163

* For paravirt, this is coupled with an exit in switch_to to

3163

* For paravirt, this is coupled with an exit in switch_to to

3164

* combine the page table reload and the switch backend into

3164

* combine the page table reload and the switch backend into

3165

* one hypercall.

3165

* one hypercall.

3166

*/

3166

*/

3167

arch_start_context_switch(prev);

3167

arch_start_context_switch(prev);

3168

3169

if (!mm) {

3169

if (!mm) {

3170

next->active_mm = oldmm;

3170

next->active_mm = oldmm;

3171

atomic_inc(&oldmm->mm_count);

3171

atomic_inc(&oldmm->mm_count);

3172

enter_lazy_tlb(oldmm, next);

3172

enter_lazy_tlb(oldmm, next);

3173

} else

3173

} else

3174

switch_mm(oldmm, mm, next);

3174

switch_mm(oldmm, mm, next);

3175

3176

if (!prev->mm) {

3176

if (!prev->mm) {

3177

prev->active_mm = NULL;

3177

prev->active_mm = NULL;

3178

rq->prev_mm = oldmm;

3178

rq->prev_mm = oldmm;

3179

}

3179

}

3180

/*

3180

/*

3181

* Since the runqueue lock will be released by the next

3181

* Since the runqueue lock will be released by the next

3182

* task (which is an invalid locking op but in the case

3182

* task (which is an invalid locking op but in the case

3183

* of the scheduler it's an obvious special-case), so we

3183

* of the scheduler it's an obvious special-case), so we

3184

* do an early lockdep release here:

3184

* do an early lockdep release here:

3185

*/

3185

*/

3186

#ifndef __ARCH_WANT_UNLOCKED_CTXSW

3186

#ifndef __ARCH_WANT_UNLOCKED_CTXSW

3187

spin_release(&rq->lock.dep_map, 1, _THIS_IP_);

3187

spin_release(&rq->lock.dep_map, 1, _THIS_IP_);

3188

#endif

3188

#endif

3189

3190

/* Here we just switch the register state and the stack. */

3190

/* Here we just switch the register state and the stack. */

3191

switch_to(prev, next, prev);

3191

switch_to(prev, next, prev);

3192

3193

barrier();

3193

barrier();

3194

/*

3194

/*

3195

* this_rq must be evaluated again because prev may have moved

3195

* this_rq must be evaluated again because prev may have moved

3196

* CPUs since it called schedule(), thus the 'rq' on its stack

3196

* CPUs since it called schedule(), thus the 'rq' on its stack

3197

* frame will be invalid.

3197

* frame will be invalid.

3198

*/

3198

*/

3199

finish_task_switch(this_rq(), prev);

3199

finish_task_switch(this_rq(), prev);

3200

}

3200

}

3201

3202

/*

3202

/*

3203

* nr_running, nr_uninterruptible and nr_context_switches:

3203

* nr_running, nr_uninterruptible and nr_context_switches:

3204

*

3204

*

3205

* externally visible scheduler statistics: current number of runnable

3205

* externally visible scheduler statistics: current number of runnable

3206

* threads, current number of uninterruptible-sleeping threads, total

3206

* threads, current number of uninterruptible-sleeping threads, total

3207

* number of context switches performed since bootup.

3207

* number of context switches performed since bootup.

3208

*/

3208

*/

3209

unsigned long nr_running(void)

3209

unsigned long nr_running(void)

3210

{

3210

{

3211

unsigned long i, sum = 0;

3211

unsigned long i, sum = 0;

3212

3213

for_each_online_cpu(i)

3213

for_each_online_cpu(i)

3214

sum += cpu_rq(i)->nr_running;

3214

sum += cpu_rq(i)->nr_running;

3215

3216

return sum;

3216

return sum;

3217

}

3217

}

3218

3219

unsigned long nr_uninterruptible(void)

3219

unsigned long nr_uninterruptible(void)

3220

{

3220

{

3221

unsigned long i, sum = 0;

3221

unsigned long i, sum = 0;

3222

3223

for_each_possible_cpu(i)

3223

for_each_possible_cpu(i)

3224

sum += cpu_rq(i)->nr_uninterruptible;

3224

sum += cpu_rq(i)->nr_uninterruptible;

3225

3226

/*

3226

/*

3227

* Since we read the counters lockless, it might be slightly

3227

* Since we read the counters lockless, it might be slightly

3228

* inaccurate. Do not allow it to go below zero though:

3228

* inaccurate. Do not allow it to go below zero though:

3229

*/

3229

*/

3230

if (unlikely((long)sum < 0))

3230

if (unlikely((long)sum < 0))

3231

sum = 0;

3231

sum = 0;

3232

3233

return sum;

3233

return sum;

3234

}

3234

}

3235

3236

unsigned long long nr_context_switches(void)

3236

unsigned long long nr_context_switches(void)

3237

{

3237

{

3238

int i;

3238

int i;

3239

unsigned long long sum = 0;

3239

unsigned long long sum = 0;

3240

3241

for_each_possible_cpu(i)

3241

for_each_possible_cpu(i)

3242

sum += cpu_rq(i)->nr_switches;

3242

sum += cpu_rq(i)->nr_switches;

3243

3244

return sum;

3244

return sum;

3245

}

3245

}

3246

3247

unsigned long nr_iowait(void)

3247

unsigned long nr_iowait(void)

3248

{

3248

{

3249

unsigned long i, sum = 0;

3249

unsigned long i, sum = 0;

3250

3251

for_each_possible_cpu(i)

3251

for_each_possible_cpu(i)

3252

sum += atomic_read(&cpu_rq(i)->nr_iowait);

3252

sum += atomic_read(&cpu_rq(i)->nr_iowait);

3253

3254

return sum;

3254

return sum;

3255

}

3255

}

3256

3257

unsigned long nr_iowait_cpu(int cpu)

3257

unsigned long nr_iowait_cpu(int cpu)

3258

{

3258

{

3259

struct rq *this = cpu_rq(cpu);

3259

struct rq *this = cpu_rq(cpu);

3260

return atomic_read(&this->nr_iowait);

3260

return atomic_read(&this->nr_iowait);

3261

}

3261

}

3262

3263

unsigned long this_cpu_load(void)

3263

unsigned long this_cpu_load(void)

3264

{

3264

{

3265

struct rq *this = this_rq();

3265

struct rq *this = this_rq();

3266

return this->cpu_load[0];

3266

return this->cpu_load[0];

3267

}

3267

}

3268

3269

3270

/* Variables and functions for calc_load */

3270

/* Variables and functions for calc_load */

3271

static atomic_long_t calc_load_tasks;

3271

static atomic_long_t calc_load_tasks;

3272

static unsigned long calc_load_update;

3272

static unsigned long calc_load_update;

3273

unsigned long avenrun[3];

3273

unsigned long avenrun[3];

3274

EXPORT_SYMBOL(avenrun);

3274

EXPORT_SYMBOL(avenrun);

3275

3276

static long calc_load_fold_active(struct rq *this_rq)

3276

static long calc_load_fold_active(struct rq *this_rq)

3277

{

3277

{

3278

long nr_active, delta = 0;

3278

long nr_active, delta = 0;

3279

3280

nr_active = this_rq->nr_running;

3280

nr_active = this_rq->nr_running;

3281

nr_active += (long) this_rq->nr_uninterruptible;

3281

nr_active += (long) this_rq->nr_uninterruptible;

3282

3283

if (nr_active != this_rq->calc_load_active) {

3283

if (nr_active != this_rq->calc_load_active) {

3284

delta = nr_active - this_rq->calc_load_active;

3284

delta = nr_active - this_rq->calc_load_active;

3285

this_rq->calc_load_active = nr_active;

3285

this_rq->calc_load_active = nr_active;

3286

}

3286

}

3287

3288

return delta;

3288

return delta;

3289

}

3289

}

3290

3291

static unsigned long

3291

static unsigned long

3292

calc_load(unsigned long load, unsigned long exp, unsigned long active)

3292

calc_load(unsigned long load, unsigned long exp, unsigned long active)

3293

{

3293

{

3294

load *= exp;

3294

load *= exp;

3295

load += active * (FIXED_1 - exp);

3295

load += active * (FIXED_1 - exp);

3296

load += 1UL << (FSHIFT - 1);

3296

load += 1UL << (FSHIFT - 1);

3297

return load >> FSHIFT;

3297

return load >> FSHIFT;

3298

}

3298

}

3299

3300

#ifdef CONFIG_NO_HZ

3300

#ifdef CONFIG_NO_HZ

3301

/*

3301

/*

3302

* For NO_HZ we delay the active fold to the next LOAD_FREQ update.

3302

* For NO_HZ we delay the active fold to the next LOAD_FREQ update.

3303

*

3303

*

3304

* When making the ILB scale, we should try to pull this in as well.

3304

* When making the ILB scale, we should try to pull this in as well.

3305

*/

3305

*/

3306

static atomic_long_t calc_load_tasks_idle;

3306

static atomic_long_t calc_load_tasks_idle;

3307

3308

static void calc_load_account_idle(struct rq *this_rq)

3308

static void calc_load_account_idle(struct rq *this_rq)

3309

{

3309

{

3310

long delta;

3310

long delta;

3311

3312

delta = calc_load_fold_active(this_rq);

3312

delta = calc_load_fold_active(this_rq);

3313

if (delta)

3313

if (delta)

3314

atomic_long_add(delta, &calc_load_tasks_idle);

3314

atomic_long_add(delta, &calc_load_tasks_idle);

3315

}

3315

}

3316

3317

static long calc_load_fold_idle(void)

3317

static long calc_load_fold_idle(void)

3318

{

3318

{

3319

long delta = 0;

3319

long delta = 0;

3320

3321

/*

3321

/*

3322

* Its got a race, we don't care...

3322

* Its got a race, we don't care...

3323

*/

3323

*/

3324

if (atomic_long_read(&calc_load_tasks_idle))

3324

if (atomic_long_read(&calc_load_tasks_idle))

3325

delta = atomic_long_xchg(&calc_load_tasks_idle, 0);

3325

delta = atomic_long_xchg(&calc_load_tasks_idle, 0);

3326

3327

return delta;

3327

return delta;

3328

}

3328

}

3329

3330

/**
 * fixed_power_int - compute: x^n, in O(log n) time
 *
 * @x:         base of the power
 * @frac_bits: fractional bits of @x
 * @n:         power to raise @x to.
 *
 * By exploiting the relation between the definition of the natural power
 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
 * (where: n_i \elem {0, 1}, the binary vector representing n),
 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
 * of course trivially computable in O(log_2 n), the length of our binary
 * vector.
 *
 * Every multiplication is rounded to nearest by adding half a unit before
 * shifting the product back down. With @frac_bits == 0 there is no
 * fraction to round, and the half-unit term is zero rather than the
 * undefined shift 1UL << (0 - 1).
 *
 * Returns x^n in the same fixed-point format as @x; for @n == 0 that is
 * 1.0, i.e. 1UL << @frac_bits.
 */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	const unsigned long half = frac_bits ? 1UL << (frac_bits - 1) : 0;
	unsigned long result = 1UL << frac_bits;

	while (n) {
		/* This bit of the exponent is set: multiply it in. */
		if (n & 1)
			result = (result * x + half) >> frac_bits;

		n >>= 1;
		if (!n)
			break;

		/* Square the base for the next, more significant bit. */
		x = (x * x + half) >> frac_bits;
	}

	return result;
}

3366

3367

/*

3367

/*

3368

* a1 = a0 * e + a * (1 - e)

3368

* a1 = a0 * e + a * (1 - e)

3369

*

3369

*

3370

* a2 = a1 * e + a * (1 - e)

3370

* a2 = a1 * e + a * (1 - e)

3371

* = (a0 * e + a * (1 - e)) * e + a * (1 - e)

3371

* = (a0 * e + a * (1 - e)) * e + a * (1 - e)

3372

* = a0 * e^2 + a * (1 - e) * (1 + e)

3372

* = a0 * e^2 + a * (1 - e) * (1 + e)

3373

*

3373

*

3374

* a3 = a2 * e + a * (1 - e)

3374

* a3 = a2 * e + a * (1 - e)

3375

* = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)

3375

* = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)

3376

* = a0 * e^3 + a * (1 - e) * (1 + e + e^2)

3376

* = a0 * e^3 + a * (1 - e) * (1 + e + e^2)

3377

*

3377

*

3378

* ...

3378

* ...

3379

*

3379

*

3380

* an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]

3380

* an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]

3381

* = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)

3381

* = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)

3382

* = a0 * e^n + a * (1 - e^n)

3382

* = a0 * e^n + a * (1 - e^n)

3383

*

3383

*

3384

* [1] application of the geometric series:

3384

* [1] application of the geometric series:

3385

*

3385

*

3386

* n 1 - x^(n+1)

3386

* n 1 - x^(n+1)

3387

* S_n := \Sum x^i = -------------

3387

* S_n := \Sum x^i = -------------

3388

* i=0 1 - x

3388

* i=0 1 - x

3389

*/

3389

*/

3390

static unsigned long

3390

static unsigned long

3391

calc_load_n(unsigned long load, unsigned long exp,

3391

calc_load_n(unsigned long load, unsigned long exp,

3392

unsigned long active, unsigned int n)

3392

unsigned long active, unsigned int n)

3393

{

3393

{

3394

3395

return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);

3395

return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);

3396

}

3396

}

3397

3398

/*

3398

/*

3399

* NO_HZ can leave us missing all per-cpu ticks calling

3399

* NO_HZ can leave us missing all per-cpu ticks calling

3400

* calc_load_account_active(), but since an idle CPU folds its delta into

3400

* calc_load_account_active(), but since an idle CPU folds its delta into

3401

* calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold

3401

* calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold

3402

* in the pending idle delta if our idle period crossed a load cycle boundary.

3402

* in the pending idle delta if our idle period crossed a load cycle boundary.

3403

*

3403

*

3404

* Once we've updated the global active value, we need to apply the exponential

3404

* Once we've updated the global active value, we need to apply the exponential

3405

* weights adjusted to the number of cycles missed.

3405

* weights adjusted to the number of cycles missed.

3406

*/

3406

*/

3407

static void calc_global_nohz(unsigned long ticks)

3407

static void calc_global_nohz(unsigned long ticks)

3408

{

3408

{

3409

long delta, active, n;

3409

long delta, active, n;

3410

3411

if (time_before(jiffies, calc_load_update))

3411

if (time_before(jiffies, calc_load_update))

3412

return;

3412

return;

3413

3414

/*

3414

/*

3415

* If we crossed a calc_load_update boundary, make sure to fold

3415

* If we crossed a calc_load_update boundary, make sure to fold

3416

* any pending idle changes, the respective CPUs might have

3416

* any pending idle changes, the respective CPUs might have

3417

* missed the tick driven calc_load_account_active() update

3417

* missed the tick driven calc_load_account_active() update

3418

* due to NO_HZ.

3418

* due to NO_HZ.

3419

*/

3419

*/

3420

delta = calc_load_fold_idle();

3420

delta = calc_load_fold_idle();

3421

if (delta)

3421

if (delta)

3422

atomic_long_add(delta, &calc_load_tasks);

3422

atomic_long_add(delta, &calc_load_tasks);

3423

3424

/*

3424

/*

3425

* If we were idle for multiple load cycles, apply them.

3425

* If we were idle for multiple load cycles, apply them.

3426

*/

3426

*/

3427

if (ticks >= LOAD_FREQ) {

3427

if (ticks >= LOAD_FREQ) {

3428

n = ticks / LOAD_FREQ;

3428

n = ticks / LOAD_FREQ;

3429

3430

active = atomic_long_read(&calc_load_tasks);

3430

active = atomic_long_read(&calc_load_tasks);

3431

active = active > 0 ? active * FIXED_1 : 0;

3431

active = active > 0 ? active * FIXED_1 : 0;

3432

3433

avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);

3433

avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);

3434

avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);

3434

avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);

3435

avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);

3435

avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);

3436

3437

calc_load_update += n * LOAD_FREQ;

3437

calc_load_update += n * LOAD_FREQ;

3438

}

3438

}

3439

3440

/*

3440

/*

3441

* Its possible the remainder of the above division also crosses

3441

* Its possible the remainder of the above division also crosses

3442

* a LOAD_FREQ period, the regular check in calc_global_load()

3442

* a LOAD_FREQ period, the regular check in calc_global_load()

3443

* which comes after this will take care of that.

3443

* which comes after this will take care of that.

3444

*

3444

*

3445

* Consider us being 11 ticks before a cycle completion, and us

3445

* Consider us being 11 ticks before a cycle completion, and us

3446

* sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will

3446

* sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will

3447

* age us 4 cycles, and the test in calc_global_load() will

3447

* age us 4 cycles, and the test in calc_global_load() will

3448

* pick up the final one.

3448

* pick up the final one.

3449

*/

3449

*/

3450

}

3450

}

3451

#else

3451

#else

3452

/* Variant used without CONFIG_NO_HZ: there is no idle delta to park. */
static void calc_load_account_idle(struct rq *this_rq)
{
	/* intentionally empty */
}

3455

3456

/* Variant used without CONFIG_NO_HZ: no idle delta is ever pending. */
static inline long calc_load_fold_idle(void)
{
	return 0;
}

3460

3461

/* Variant used without CONFIG_NO_HZ: no missed cycles to catch up on. */
static void calc_global_nohz(unsigned long ticks)
{
	/* intentionally empty */
}

3464

#endif

3464

#endif

3465

3466

/**

3466

/**

3467

* get_avenrun - get the load average array

3467

* get_avenrun - get the load average array

3468

* @loads: pointer to dest load array

3468

* @loads: pointer to dest load array

3469

* @offset: offset to add

3469

* @offset: offset to add

3470

* @shift: shift count to shift the result left

3470

* @shift: shift count to shift the result left

3471

*

3471

*

3472

* These values are estimates at best, so no need for locking.

3472

* These values are estimates at best, so no need for locking.

3473

*/

3473

*/

3474

void get_avenrun(unsigned long *loads, unsigned long offset, int shift)

3474

void get_avenrun(unsigned long *loads, unsigned long offset, int shift)

3475

{

3475

{

3476

loads[0] = (avenrun[0] + offset) << shift;

3476

loads[0] = (avenrun[0] + offset) << shift;

3477

loads[1] = (avenrun[1] + offset) << shift;

3477

loads[1] = (avenrun[1] + offset) << shift;

3478

loads[2] = (avenrun[2] + offset) << shift;

3478

loads[2] = (avenrun[2] + offset) << shift;

3479

}

3479

}

3480

3481

/*

3481

/*

3482

* calc_load - update the avenrun load estimates 10 ticks after the

3482

* calc_load - update the avenrun load estimates 10 ticks after the

3483

* CPUs have updated calc_load_tasks.

3483

* CPUs have updated calc_load_tasks.

3484

*/

3484

*/

3485

void calc_global_load(unsigned long ticks)

3485

void calc_global_load(unsigned long ticks)

3486

{

3486

{

3487

long active;

3487

long active;

3488

3489

calc_global_nohz(ticks);

3489

calc_global_nohz(ticks);

3490

3491

if (time_before(jiffies, calc_load_update + 10))

3491

if (time_before(jiffies, calc_load_update + 10))

3492

return;

3492

return;

3493

3494

active = atomic_long_read(&calc_load_tasks);

3494

active = atomic_long_read(&calc_load_tasks);

3495

active = active > 0 ? active * FIXED_1 : 0;

3495

active = active > 0 ? active * FIXED_1 : 0;

3496

3497

avenrun[0] = calc_load(avenrun[0], EXP_1, active);

3497

avenrun[0] = calc_load(avenrun[0], EXP_1, active);

3498

avenrun[1] = calc_load(avenrun[1], EXP_5, active);

3498

avenrun[1] = calc_load(avenrun[1], EXP_5, active);

3499

avenrun[2] = calc_load(avenrun[2], EXP_15, active);

3499

avenrun[2] = calc_load(avenrun[2], EXP_15, active);

3500

3501

calc_load_update += LOAD_FREQ;

3501

calc_load_update += LOAD_FREQ;

3502

}

3502

}

3503

3504

/*

3504

/*

3505

* Called from update_cpu_load() to periodically update this CPU's

3505

* Called from update_cpu_load() to periodically update this CPU's

3506

* active count.

3506

* active count.

3507

*/

3507

*/

3508

static void calc_load_account_active(struct rq *this_rq)

3508

static void calc_load_account_active(struct rq *this_rq)

3509

{

3509

{

3510

long delta;

3510

long delta;

3511

3512

if (time_before(jiffies, this_rq->calc_load_update))

3512

if (time_before(jiffies, this_rq->calc_load_update))

3513

return;

3513

return;

3514

3515

delta = calc_load_fold_active(this_rq);

3515

delta = calc_load_fold_active(this_rq);

3516

delta += calc_load_fold_idle();

3516

delta += calc_load_fold_idle();

3517

if (delta)

3517

if (delta)

3518

atomic_long_add(delta, &calc_load_tasks);

3518

atomic_long_add(delta, &calc_load_tasks);

3519

3520

this_rq->calc_load_update += LOAD_FREQ;

3520

this_rq->calc_load_update += LOAD_FREQ;

3521

}

3521

}

3522

3523

/*

3523

/*

3524

* The exact cpuload at various idx values, calculated at every tick would be

3524

* The exact cpuload at various idx values, calculated at every tick would be

3525

* load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load

3525

* load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load

3526

*

3526

*

3527

* If a cpu misses updates for n-1 ticks (as it was idle) and update gets called

3527

* If a cpu misses updates for n-1 ticks (as it was idle) and update gets called

3528

* on nth tick when cpu may be busy, then we have:

3528

* on nth tick when cpu may be busy, then we have:

3529

* load = ((2^idx - 1) / 2^idx)^(n-1) * load

3529

* load = ((2^idx - 1) / 2^idx)^(n-1) * load

3530

* load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load

3530

* load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load

3531

*

3531

*

3532

* decay_load_missed() below does efficient calculation of

3532

* decay_load_missed() below does efficient calculation of

3533

* load = ((2^idx - 1) / 2^idx)^(n-1) * load

3533

* load = ((2^idx - 1) / 2^idx)^(n-1) * load

3534

* avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load

3534

* avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load

3535

*

3535

*

3536

* The calculation is approximated on a 128 point scale.

3536

* The calculation is approximated on a 128 point scale.

3537

* degrade_zero_ticks is the number of ticks after which load at any

3537

* degrade_zero_ticks is the number of ticks after which load at any

3538

* particular idx is approximated to be zero.

3538

* particular idx is approximated to be zero.

3539

* degrade_factor is a precomputed table, a row for each load idx.

3539

* degrade_factor is a precomputed table, a row for each load idx.

3540

* Each column corresponds to degradation factor for a power of two ticks,

3540

* Each column corresponds to degradation factor for a power of two ticks,

3541

* based on 128 point scale.

3541

* based on 128 point scale.

3542

* Example:

3542

* Example:

3543

* row 2, col 3 (=12) says that the degradation at load idx 2 after

3543

* row 2, col 3 (=12) says that the degradation at load idx 2 after

3544

* 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).

3544

* 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).

3545

*

3545

*

3546

* With this power of 2 load factors, we can degrade the load n times

3546

* With this power of 2 load factors, we can degrade the load n times

3547

* by looking at 1 bits in n and doing as many mult/shift instead of

3547

* by looking at 1 bits in n and doing as many mult/shift instead of

3548

* n mult/shifts needed by the exact degradation.

3548

* n mult/shifts needed by the exact degradation.

3549

*/

3549

*/

3550

#define DEGRADE_SHIFT 7

3550

#define DEGRADE_SHIFT 7

3551

static const unsigned char

3551

static const unsigned char

3552

degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};

3552

degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};

3553

static const unsigned char

3553

static const unsigned char

3554

degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {

3554

degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {

3555

{0, 0, 0, 0, 0, 0, 0, 0},

3555

{0, 0, 0, 0, 0, 0, 0, 0},

3556

{64, 32, 8, 0, 0, 0, 0, 0},

3556

{64, 32, 8, 0, 0, 0, 0, 0},

3557

{96, 72, 40, 12, 1, 0, 0},

3557

{96, 72, 40, 12, 1, 0, 0},

3558

{112, 98, 75, 43, 15, 1, 0},

3558

{112, 98, 75, 43, 15, 1, 0},

3559

{120, 112, 98, 76, 45, 16, 2} };

3559

{120, 112, 98, 76, 45, 16, 2} };

3560

3561

/*

3561

/*

3562

* Update cpu_load for any missed ticks, due to tickless idle. The backlog

3562

* Update cpu_load for any missed ticks, due to tickless idle. The backlog

3563

* would be when CPU is idle and so we just decay the old load without

3563

* would be when CPU is idle and so we just decay the old load without

3564

* adding any new load.

3564

* adding any new load.

3565

*/

3565

*/

3566

static unsigned long

3566

static unsigned long

3567

decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)

3567

decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)

3568

{

3568

{

3569

int j = 0;

3569

int j = 0;

3570

3571

if (!missed_updates)

3571

if (!missed_updates)

3572

return load;

3572

return load;

3573

3574

if (missed_updates >= degrade_zero_ticks[idx])

3574

if (missed_updates >= degrade_zero_ticks[idx])

3575

return 0;

3575

return 0;

3576

3577

if (idx == 1)

3577

if (idx == 1)

3578

return load >> missed_updates;

3578

return load >> missed_updates;

3579

3580

while (missed_updates) {

3580

while (missed_updates) {

3581

if (missed_updates % 2)

3581

if (missed_updates % 2)

3582

load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;

3582

load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;

3583

3584

missed_updates >>= 1;

3584

missed_updates >>= 1;

3585

j++;

3585

j++;

3586

}

3586

}

3587

return load;

3587

return load;

3588

}

3588

}

3589

3590

/*

3590

/*

3591

* Update rq->cpu_load[] statistics. This function is usually called every

3591

* Update rq->cpu_load[] statistics. This function is usually called every

3592

* scheduler tick (TICK_NSEC). With tickless idle this will not be called

3592

* scheduler tick (TICK_NSEC). With tickless idle this will not be called

3593

* every tick. We fix it up based on jiffies.

3593

* every tick. We fix it up based on jiffies.

3594

*/

3594

*/

3595

static void update_cpu_load(struct rq *this_rq)

3595

static void update_cpu_load(struct rq *this_rq)

3596

{

3596

{

3597

unsigned long this_load = this_rq->load.weight;

3597

unsigned long this_load = this_rq->load.weight;

3598

unsigned long curr_jiffies = jiffies;

3598

unsigned long curr_jiffies = jiffies;

3599

unsigned long pending_updates;

3599

unsigned long pending_updates;

3600

int i, scale;

3600

int i, scale;

3601

3602

this_rq->nr_load_updates++;

3602

this_rq->nr_load_updates++;

3603

3604

/* Avoid repeated calls on same jiffy, when moving in and out of idle */

3604

/* Avoid repeated calls on same jiffy, when moving in and out of idle */

3605

if (curr_jiffies == this_rq->last_load_update_tick)

3605

if (curr_jiffies == this_rq->last_load_update_tick)

3606

return;

3606

return;

3607

3608

pending_updates = curr_jiffies - this_rq->last_load_update_tick;

3608

pending_updates = curr_jiffies - this_rq->last_load_update_tick;

3609

this_rq->last_load_update_tick = curr_jiffies;

3609

this_rq->last_load_update_tick = curr_jiffies;

3610

3611

/* Update our load: */

3611

/* Update our load: */

3612

this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */

3612

this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */

3613

for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {

3613

for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {

3614

unsigned long old_load, new_load;

3614

unsigned long old_load, new_load;

3615

3616

/* scale is effectively 1 << i now, and >> i divides by scale */

3616

/* scale is effectively 1 << i now, and >> i divides by scale */

3617

3618

old_load = this_rq->cpu_load[i];

3618

old_load = this_rq->cpu_load[i];

3619

old_load = decay_load_missed(old_load, pending_updates - 1, i);

3619

old_load = decay_load_missed(old_load, pending_updates - 1, i);

3620

new_load = this_load;

3620

new_load = this_load;

3621

/*

3621

/*

3622

* Round up the averaging division if load is increasing. This

3622

* Round up the averaging division if load is increasing. This

3623

* prevents us from getting stuck on 9 if the load is 10, for

3623

* prevents us from getting stuck on 9 if the load is 10, for

3624

* example.

3624

* example.

3625

*/

3625

*/

3626

if (new_load > old_load)

3626

if (new_load > old_load)

3627

new_load += scale - 1;

3627

new_load += scale - 1;

3628

3629

this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;

3629

this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;

3630

}

3630

}

3631

3632

sched_avg_update(this_rq);

3632

sched_avg_update(this_rq);

3633

}

3633

}

3634

3635

/*
 * Refresh this runqueue's cpu_load[] statistics, then fold its active
 * count into the global load-average accounting.
 */
static void update_cpu_load_active(struct rq *this_rq)
{
	update_cpu_load(this_rq);
	calc_load_account_active(this_rq);
}

3641

3642

#ifdef CONFIG_SMP

3642

#ifdef CONFIG_SMP

3643

3644

/*

3644

/*

3645

* sched_exec - execve() is a valuable balancing opportunity, because at

3645

* sched_exec - execve() is a valuable balancing opportunity, because at

3646

* this point the task has the smallest effective memory and cache footprint.

3646

* this point the task has the smallest effective memory and cache footprint.

3647

*/

3647

*/

3648

void sched_exec(void)

3648

void sched_exec(void)

3649

{

3649

{

3650

struct task_struct *p = current;

3650

struct task_struct *p = current;

3651

unsigned long flags;

3651

unsigned long flags;

3652

int dest_cpu;

3652

int dest_cpu;

3653

3654

raw_spin_lock_irqsave(&p->pi_lock, flags);

3654

raw_spin_lock_irqsave(&p->pi_lock, flags);

3655

dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);

3655

dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);

3656

if (dest_cpu == smp_processor_id())

3656

if (dest_cpu == smp_processor_id())

3657

goto unlock;

3657

goto unlock;

3658

3659

if (likely(cpu_active(dest_cpu))) {

3659

if (likely(cpu_active(dest_cpu))) {

3660

struct migration_arg arg = { p, dest_cpu };

3660

struct migration_arg arg = { p, dest_cpu };

3661

3662

raw_spin_unlock_irqrestore(&p->pi_lock, flags);

3662

raw_spin_unlock_irqrestore(&p->pi_lock, flags);

3663

stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);

3663

stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);

3664

return;

3664

return;

3665

}

3665

}

3666

unlock:

3666

unlock:

3667

raw_spin_unlock_irqrestore(&p->pi_lock, flags);

3667

raw_spin_unlock_irqrestore(&p->pi_lock, flags);

3668

}

3668

}

3669

3670

#endif

3670

#endif

3671

3672

/* Per-CPU struct kernel_stat instance, exported as a per-CPU symbol. */
DEFINE_PER_CPU(struct kernel_stat, kstat);
EXPORT_PER_CPU_SYMBOL(kstat);

3675

3676

/*

3676

/*

3677

* Return any ns on the sched_clock that have not yet been accounted in

3677

* Return any ns on the sched_clock that have not yet been accounted in

3678

* @p in case that task is currently running.

3678

* @p in case that task is currently running.

3679

*

3679

*

3680

* Called with task_rq_lock() held on @rq.

3680

* Called with task_rq_lock() held on @rq.

3681

*/

3681

*/

3682

static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)

3682

static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)

3683

{

3683

{

3684

u64 ns = 0;

3684

u64 ns = 0;

3685

3686

if (task_current(rq, p)) {

3686

if (task_current(rq, p)) {

3687

update_rq_clock(rq);

3687

update_rq_clock(rq);

3688

ns = rq->clock_task - p->se.exec_start;

3688

ns = rq->clock_task - p->se.exec_start;

3689

if ((s64)ns < 0)

3689

if ((s64)ns < 0)

3690

ns = 0;

3690

ns = 0;

3691

}

3691

}

3692

3693

return ns;

3693

return ns;

3694

}

3694

}

3695

3696

unsigned long long task_delta_exec(struct task_struct *p)

3696

unsigned long long task_delta_exec(struct task_struct *p)

3697

{

3697

{

3698

unsigned long flags;

3698

unsigned long flags;

3699

struct rq *rq;

3699

struct rq *rq;

3700

u64 ns = 0;

3700

u64 ns = 0;

3701

3702

rq = task_rq_lock(p, &flags);

3702

rq = task_rq_lock(p, &flags);

3703

ns = do_task_delta_exec(p, rq);

3703

ns = do_task_delta_exec(p, rq);

3704

task_rq_unlock(rq, p, &flags);

3704

task_rq_unlock(rq, p, &flags);

3705

3706

return ns;

3706

return ns;

3707

}

3707

}

3708

3709

/*

3709

/*

3710

* Return accounted runtime for the task.

3710

* Return accounted runtime for the task.

3711

* In case the task is currently running, return the runtime plus current's

3711

* In case the task is currently running, return the runtime plus current's

3712

* pending runtime that have not been accounted yet.

3712

* pending runtime that have not been accounted yet.

3713

*/

3713

*/

3714

unsigned long long task_sched_runtime(struct task_struct *p)

3714

unsigned long long task_sched_runtime(struct task_struct *p)

3715

{

3715

{

3716

unsigned long flags;

3716

unsigned long flags;

3717

struct rq *rq;

3717

struct rq *rq;

3718

u64 ns = 0;

3718

u64 ns = 0;

3719

3720

rq = task_rq_lock(p, &flags);

3720

rq = task_rq_lock(p, &flags);

3721

ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);

3721

ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);

3722

task_rq_unlock(rq, p, &flags);

3722

task_rq_unlock(rq, p, &flags);

3723

3724

return ns;

3724

return ns;

3725

}

3725

}

3726

3727

/*

3727

/*

3728

* Return sum_exec_runtime for the thread group.

3728

* Return sum_exec_runtime for the thread group.

3729

* In case the task is currently running, return the sum plus current's

3729

* In case the task is currently running, return the sum plus current's

3730

* pending runtime that have not been accounted yet.

3730

* pending runtime that have not been accounted yet.

3731

*

3731

*

3732

* Note that the thread group might have other running tasks as well,

3732

* Note that the thread group might have other running tasks as well,

3733

* so the return value not includes other pending runtime that other

3733

* so the return value not includes other pending runtime that other

3734

* running tasks might have.

3734

* running tasks might have.

3735

*/

3735

*/

3736

unsigned long long thread_group_sched_runtime(struct task_struct *p)

3736

unsigned long long thread_group_sched_runtime(struct task_struct *p)

3737

{

3737

{

3738

struct task_cputime totals;

3738

struct task_cputime totals;

3739

unsigned long flags;

3739

unsigned long flags;

3740

struct rq *rq;

3740

struct rq *rq;

3741

u64 ns;

3741

u64 ns;

3742

3743

rq = task_rq_lock(p, &flags);

3743

rq = task_rq_lock(p, &flags);

3744

thread_group_cputime(p, &totals);

3744

thread_group_cputime(p, &totals);

3745

ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);

3745

ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);

3746

task_rq_unlock(rq, p, &flags);

3746

task_rq_unlock(rq, p, &flags);

3747

3748

return ns;

3748

return ns;

3749

}

3749

}

3750

3751

/*

3751

/*

3752

* Account user cpu time to a process.

3752

* Account user cpu time to a process.

3753

* @p: the process that the cpu time gets accounted to

3753

* @p: the process that the cpu time gets accounted to

3754

* @cputime: the cpu time spent in user space since the last update

3754

* @cputime: the cpu time spent in user space since the last update

3755

* @cputime_scaled: cputime scaled by cpu frequency

3755

* @cputime_scaled: cputime scaled by cpu frequency

3756

*/

3756

*/

3757

void account_user_time(struct task_struct *p, cputime_t cputime,

3757

void account_user_time(struct task_struct *p, cputime_t cputime,

3758

cputime_t cputime_scaled)

3758

cputime_t cputime_scaled)

3759

{

3759

{

3760

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

3760

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

3761

cputime64_t tmp;

3761

cputime64_t tmp;

3762

3763

/* Add user time to process. */

3763

/* Add user time to process. */

3764

p->utime = cputime_add(p->utime, cputime);

3764

p->utime = cputime_add(p->utime, cputime);

3765

p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);

3765

p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);

3766

account_group_user_time(p, cputime);

3766

account_group_user_time(p, cputime);

3767

3768

/* Add user time to cpustat. */

3768

/* Add user time to cpustat. */

3769

tmp = cputime_to_cputime64(cputime);

3769

tmp = cputime_to_cputime64(cputime);

3770

if (TASK_NICE(p) > 0)

3770

if (TASK_NICE(p) > 0)

3771

cpustat->nice = cputime64_add(cpustat->nice, tmp);

3771

cpustat->nice = cputime64_add(cpustat->nice, tmp);

3772

else

3772

else

3773

cpustat->user = cputime64_add(cpustat->user, tmp);

3773

cpustat->user = cputime64_add(cpustat->user, tmp);

3774

3775

cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);

3775

cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);

3776

/* Account for user time used */

3776

/* Account for user time used */

3777

acct_update_integrals(p);

3777

acct_update_integrals(p);

3778

}

3778

}

3779

3780

/*

3780

/*

3781

* Account guest cpu time to a process.

3781

* Account guest cpu time to a process.

3782

* @p: the process that the cpu time gets accounted to

3782

* @p: the process that the cpu time gets accounted to

3783

* @cputime: the cpu time spent in virtual machine since the last update

3783

* @cputime: the cpu time spent in virtual machine since the last update

3784

* @cputime_scaled: cputime scaled by cpu frequency

3784

* @cputime_scaled: cputime scaled by cpu frequency

3785

*/

3785

*/

3786

static void account_guest_time(struct task_struct *p, cputime_t cputime,

3786

static void account_guest_time(struct task_struct *p, cputime_t cputime,

3787

cputime_t cputime_scaled)

3787

cputime_t cputime_scaled)

3788

{

3788

{

3789

cputime64_t tmp;

3789

cputime64_t tmp;

3790

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

3790

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

3791

3792

tmp = cputime_to_cputime64(cputime);

3792

tmp = cputime_to_cputime64(cputime);

3793

3794

/* Add guest time to process. */

3794

/* Add guest time to process. */

3795

p->utime = cputime_add(p->utime, cputime);

3795

p->utime = cputime_add(p->utime, cputime);

3796

p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);

3796

p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);

3797

account_group_user_time(p, cputime);

3797

account_group_user_time(p, cputime);

3798

p->gtime = cputime_add(p->gtime, cputime);

3798

p->gtime = cputime_add(p->gtime, cputime);

3799

3800

/* Add guest time to cpustat. */

3800

/* Add guest time to cpustat. */

3801

if (TASK_NICE(p) > 0) {

3801

if (TASK_NICE(p) > 0) {

3802

cpustat->nice = cputime64_add(cpustat->nice, tmp);

3802

cpustat->nice = cputime64_add(cpustat->nice, tmp);

3803

cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);

3803

cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);

3804

} else {

3804

} else {

3805

cpustat->user = cputime64_add(cpustat->user, tmp);

3805

cpustat->user = cputime64_add(cpustat->user, tmp);

3806

cpustat->guest = cputime64_add(cpustat->guest, tmp);

3806

cpustat->guest = cputime64_add(cpustat->guest, tmp);

3807

}

3807

}

3808

}

3808

}

3809

3810

/*

3810

/*

3811

* Account system cpu time to a process and desired cpustat field

3811

* Account system cpu time to a process and desired cpustat field

3812

* @p: the process that the cpu time gets accounted to

3812

* @p: the process that the cpu time gets accounted to

3813

* @cputime: the cpu time spent in kernel space since the last update

3813

* @cputime: the cpu time spent in kernel space since the last update

3814

* @cputime_scaled: cputime scaled by cpu frequency

3814

* @cputime_scaled: cputime scaled by cpu frequency

3815

* @target_cputime64: pointer to cpustat field that has to be updated

3815

* @target_cputime64: pointer to cpustat field that has to be updated

3816

*/

3816

*/

3817

static inline

3817

static inline

3818

void __account_system_time(struct task_struct *p, cputime_t cputime,

3818

void __account_system_time(struct task_struct *p, cputime_t cputime,

3819

cputime_t cputime_scaled, cputime64_t *target_cputime64)

3819

cputime_t cputime_scaled, cputime64_t *target_cputime64)

3820

{

3820

{

3821

cputime64_t tmp = cputime_to_cputime64(cputime);

3821

cputime64_t tmp = cputime_to_cputime64(cputime);

3822

3823

/* Add system time to process. */

3823

/* Add system time to process. */

3824

p->stime = cputime_add(p->stime, cputime);

3824

p->stime = cputime_add(p->stime, cputime);

3825

p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);

3825

p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);

3826

account_group_system_time(p, cputime);

3826

account_group_system_time(p, cputime);

3827

3828

/* Add system time to cpustat. */

3828

/* Add system time to cpustat. */

3829

*target_cputime64 = cputime64_add(*target_cputime64, tmp);

3829

*target_cputime64 = cputime64_add(*target_cputime64, tmp);

3830

cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);

3830

cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);

3831

3832

/* Account for system time used */

3832

/* Account for system time used */

3833

acct_update_integrals(p);

3833

acct_update_integrals(p);

3834

}

3834

}

3835

3836

/*

3836

/*

3837

* Account system cpu time to a process.

3837

* Account system cpu time to a process.

3838

* @p: the process that the cpu time gets accounted to

3838

* @p: the process that the cpu time gets accounted to

3839

* @hardirq_offset: the offset to subtract from hardirq_count()

3839

* @hardirq_offset: the offset to subtract from hardirq_count()

3840

* @cputime: the cpu time spent in kernel space since the last update

3840

* @cputime: the cpu time spent in kernel space since the last update

3841

* @cputime_scaled: cputime scaled by cpu frequency

3841

* @cputime_scaled: cputime scaled by cpu frequency

3842

*/

3842

*/

3843

void account_system_time(struct task_struct *p, int hardirq_offset,

3843

void account_system_time(struct task_struct *p, int hardirq_offset,

3844

cputime_t cputime, cputime_t cputime_scaled)

3844

cputime_t cputime, cputime_t cputime_scaled)

3845

{

3845

{

3846

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

3846

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

3847

cputime64_t *target_cputime64;

3847

cputime64_t *target_cputime64;

3848

3849

if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {

3849

if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {

3850

account_guest_time(p, cputime, cputime_scaled);

3850

account_guest_time(p, cputime, cputime_scaled);

3851

return;

3851

return;

3852

}

3852

}

3853

3854

if (hardirq_count() - hardirq_offset)

3854

if (hardirq_count() - hardirq_offset)

3855

target_cputime64 = &cpustat->irq;

3855

target_cputime64 = &cpustat->irq;

3856

else if (in_serving_softirq())

3856

else if (in_serving_softirq())

3857

target_cputime64 = &cpustat->softirq;

3857

target_cputime64 = &cpustat->softirq;

3858

else

3858

else

3859

target_cputime64 = &cpustat->system;

3859

target_cputime64 = &cpustat->system;

3860

3861

__account_system_time(p, cputime, cputime_scaled, target_cputime64);

3861

__account_system_time(p, cputime, cputime_scaled, target_cputime64);

3862

}

3862

}

3863

3864

/*

3864

/*

3865

* Account for involuntary wait time.

3865

* Account for involuntary wait time.

3866

* @cputime: the cpu time spent in involuntary wait

3866

* @cputime: the cpu time spent in involuntary wait

3867

*/

3867

*/

3868

void account_steal_time(cputime_t cputime)

3868

void account_steal_time(cputime_t cputime)

3869

{

3869

{

3870

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

3870

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

3871

cputime64_t cputime64 = cputime_to_cputime64(cputime);

3871

cputime64_t cputime64 = cputime_to_cputime64(cputime);

3872

3873

cpustat->steal = cputime64_add(cpustat->steal, cputime64);

3873

cpustat->steal = cputime64_add(cpustat->steal, cputime64);

3874

}

3874

}

3875

3876

/*

3876

/*

3877

* Account for idle time.

3877

* Account for idle time.

3878

* @cputime: the cpu time spent in idle wait

3878

* @cputime: the cpu time spent in idle wait

3879

*/

3879

*/

3880

void account_idle_time(cputime_t cputime)

3880

void account_idle_time(cputime_t cputime)

3881

{

3881

{

3882

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

3882

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

3883

cputime64_t cputime64 = cputime_to_cputime64(cputime);

3883

cputime64_t cputime64 = cputime_to_cputime64(cputime);

3884

struct rq *rq = this_rq();

3884

struct rq *rq = this_rq();

3885

3886

if (atomic_read(&rq->nr_iowait) > 0)

3886

if (atomic_read(&rq->nr_iowait) > 0)

3887

cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);

3887

cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);

3888

else

3888

else

3889

cpustat->idle = cputime64_add(cpustat->idle, cputime64);

3889

cpustat->idle = cputime64_add(cpustat->idle, cputime64);

3890

}

3890

}

3891

3892

/*
 * Account any whole ticks of paravirt steal time that have accumulated
 * since the last call. Returns true when at least one tick was booked as
 * steal time, false otherwise (always false without CONFIG_PARAVIRT).
 */
static __always_inline bool steal_account_process_tick(void)
{
#ifdef CONFIG_PARAVIRT
	if (static_branch(&paravirt_steal_enabled)) {
		u64 stolen_ns;
		u64 stolen_ticks;

		/* Steal time not yet accounted on this runqueue. */
		stolen_ns = paravirt_steal_clock(smp_processor_id());
		stolen_ns -= this_rq()->prev_steal_time;

		/*
		 * Only whole ticks are consumed; prev_steal_time advances by
		 * exactly that amount so the remainder carries over.
		 */
		stolen_ticks = steal_ticks(stolen_ns);
		this_rq()->prev_steal_time += stolen_ticks * TICK_NSEC;

		account_steal_time(stolen_ticks);
		return stolen_ticks;
	}
#endif
	return false;
}

3910

3911

#ifndef CONFIG_VIRT_CPU_ACCOUNTING

3911

#ifndef CONFIG_VIRT_CPU_ACCOUNTING

3912

3913

#ifdef CONFIG_IRQ_TIME_ACCOUNTING

3913

#ifdef CONFIG_IRQ_TIME_ACCOUNTING

3914

/*

3914

/*

3915

* Account a tick to a process and cpustat

3915

* Account a tick to a process and cpustat

3916

* @p: the process that the cpu time gets accounted to

3916

* @p: the process that the cpu time gets accounted to

3917

* @user_tick: is the tick from userspace

3917

* @user_tick: is the tick from userspace

3918

* @rq: the pointer to rq

3918

* @rq: the pointer to rq

3919

*

3919

*

3920

* Tick demultiplexing follows the order

3920

* Tick demultiplexing follows the order

3921

* - pending hardirq update

3921

* - pending hardirq update

3922

* - pending softirq update

3922

* - pending softirq update

3923

* - user_time

3923

* - user_time

3924

* - idle_time

3924

* - idle_time

3925

* - system time

3925

* - system time

3926

* - check for guest_time

3926

* - check for guest_time

3927

* - else account as system_time

3927

* - else account as system_time

3928

*

3928

*

3929

* Check for hardirq is done both for system and user time as there is

3929

* Check for hardirq is done both for system and user time as there is

3930

* no timer going off while we are on hardirq and hence we may never get an

3930

* no timer going off while we are on hardirq and hence we may never get an

3931

* opportunity to update it solely in system time.

3931

* opportunity to update it solely in system time.

3932

* p->stime and friends are only updated on system time and not on irq

3932

* p->stime and friends are only updated on system time and not on irq

3933

* softirq as those do not count in task exec_runtime any more.

3933

* softirq as those do not count in task exec_runtime any more.

3934

*/

3934

*/

3935

static void irqtime_account_process_tick(struct task_struct *p, int user_tick,

3935

static void irqtime_account_process_tick(struct task_struct *p, int user_tick,

3936

struct rq *rq)

3936

struct rq *rq)

3937

{

3937

{

3938

cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);

3938

cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);

3939

cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);

3939

cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);

3940

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

3940

struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;

3941

3942

if (steal_account_process_tick())

3942

if (steal_account_process_tick())

3943

return;

3943

return;

3944

3945

if (irqtime_account_hi_update()) {

3945

if (irqtime_account_hi_update()) {

3946

cpustat->irq = cputime64_add(cpustat->irq, tmp);

3946

cpustat->irq = cputime64_add(cpustat->irq, tmp);

3947

} else if (irqtime_account_si_update()) {

3947

} else if (irqtime_account_si_update()) {

3948

cpustat->softirq = cputime64_add(cpustat->softirq, tmp);

3948

cpustat->softirq = cputime64_add(cpustat->softirq, tmp);

3949

} else if (this_cpu_ksoftirqd() == p) {

3949

} else if (this_cpu_ksoftirqd() == p) {

3950

/*

3950

/*

3951

* ksoftirqd time do not get accounted in cpu_softirq_time.

3951

* ksoftirqd time do not get accounted in cpu_softirq_time.

3952

* So, we have to handle it separately here.

3952

* So, we have to handle it separately here.

3953

* Also, p->stime needs to be updated for ksoftirqd.

3953

* Also, p->stime needs to be updated for ksoftirqd.

3954

*/

3954

*/

3955

__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,

3955

__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,

3956

&cpustat->softirq);

3956

&cpustat->softirq);

3957

} else if (user_tick) {

3957

} else if (user_tick) {

3958

account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);

3958

account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);

3959

} else if (p == rq->idle) {

3959

} else if (p == rq->idle) {

3960

account_idle_time(cputime_one_jiffy);

3960

account_idle_time(cputime_one_jiffy);

3961

} else if (p->flags & PF_VCPU) { /* System time or guest time */

3961

} else if (p->flags & PF_VCPU) { /* System time or guest time */

3962

account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);

3962

account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);

3963

} else {

3963

} else {

3964

__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,

3964

__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,

3965

&cpustat->system);

3965

&cpustat->system);

3966

}

3966

}

3967

}

3967

}

3968

3969

static void irqtime_account_idle_ticks(int ticks)

3969

static void irqtime_account_idle_ticks(int ticks)

3970

{

3970

{

3971

int i;

3971

int i;

3972

struct rq *rq = this_rq();

3972

struct rq *rq = this_rq();

3973

3974

for (i = 0; i < ticks; i++)

3974

for (i = 0; i < ticks; i++)

3975

irqtime_account_process_tick(current, 0, rq);

3975

irqtime_account_process_tick(current, 0, rq);

3976

}

3976

}

3977

#else /* CONFIG_IRQ_TIME_ACCOUNTING */

3977

#else /* CONFIG_IRQ_TIME_ACCOUNTING */

3978

/* Empty stub used when CONFIG_IRQ_TIME_ACCOUNTING is not set. */
static void irqtime_account_idle_ticks(int ticks)
{
}

3979

/* Empty stub used when CONFIG_IRQ_TIME_ACCOUNTING is not set. */
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
						struct rq *rq)
{
}

3981

#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

3981

#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

3982

3983

/*

3983

/*

3984

* Account a single tick of cpu time.

3984

* Account a single tick of cpu time.

3985

* @p: the process that the cpu time gets accounted to

3985

* @p: the process that the cpu time gets accounted to

3986

* @user_tick: indicates if the tick is a user or a system tick

3986

* @user_tick: indicates if the tick is a user or a system tick

3987

*/

3987

*/

3988

void account_process_tick(struct task_struct *p, int user_tick)

3988

void account_process_tick(struct task_struct *p, int user_tick)

3989

{

3989

{

3990

cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);

3990

cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);

3991

struct rq *rq = this_rq();

3991

struct rq *rq = this_rq();

3992

3993

if (sched_clock_irqtime) {

3993

if (sched_clock_irqtime) {

3994

irqtime_account_process_tick(p, user_tick, rq);

3994

irqtime_account_process_tick(p, user_tick, rq);

3995

return;

3995

return;

3996

}

3996

}

3997

3998

if (steal_account_process_tick())

3998

if (steal_account_process_tick())

3999

return;

3999

return;

4000

4001

if (user_tick)

4001

if (user_tick)

4002

account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);

4002

account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);

4003

else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))

4003

else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))

4004

account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,

4004

account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,

4005

one_jiffy_scaled);

4005

one_jiffy_scaled);

4006

else

4006

else

4007

account_idle_time(cputime_one_jiffy);

4007

account_idle_time(cputime_one_jiffy);

4008

}

4008

}

4009

4010

/*

4010

/*

4011

* Account multiple ticks of steal time.

4011

* Account multiple ticks of steal time.

4012

* @p: the process from which the cpu time has been stolen

4012

* @p: the process from which the cpu time has been stolen

4013

* @ticks: number of stolen ticks

4013

* @ticks: number of stolen ticks

4014

*/

4014

*/

4015

void account_steal_ticks(unsigned long ticks)

4015

void account_steal_ticks(unsigned long ticks)

4016

{

4016

{

4017

account_steal_time(jiffies_to_cputime(ticks));

4017

account_steal_time(jiffies_to_cputime(ticks));

4018

}

4018

}

4019

4020

/*

4020

/*

4021

* Account multiple ticks of idle time.

4021

* Account multiple ticks of idle time.

4022

* @ticks: number of stolen ticks

4022

* @ticks: number of stolen ticks

4023

*/

4023

*/

4024

void account_idle_ticks(unsigned long ticks)

4024

void account_idle_ticks(unsigned long ticks)

4025

{

4025

{

4026

4027

if (sched_clock_irqtime) {

4027

if (sched_clock_irqtime) {

4028

irqtime_account_idle_ticks(ticks);

4028

irqtime_account_idle_ticks(ticks);

4029

return;

4029

return;

4030

}

4030

}

4031

4032

account_idle_time(jiffies_to_cputime(ticks));

4032

account_idle_time(jiffies_to_cputime(ticks));

4033

}

4033

}

4034

4035

#endif

4035

#endif

4036

4037

/*

4037

/*

4038

* Use precise platform statistics if available:

4038

* Use precise platform statistics if available:

4039

*/

4039

*/

4040

#ifdef CONFIG_VIRT_CPU_ACCOUNTING

4040

#ifdef CONFIG_VIRT_CPU_ACCOUNTING

4041

/*
 * CONFIG_VIRT_CPU_ACCOUNTING variant: report the task's raw counters.
 * @p: task to query
 * @ut: receives p->utime
 * @st: receives p->stime
 */
void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	*ut = p->utime;
	*st = p->stime;
}

4046

4047

/*
 * CONFIG_VIRT_CPU_ACCOUNTING variant: report the utime/stime values
 * that thread_group_cputime() collects for @p, without adjustment.
 * @p: task whose thread group is queried
 * @ut: receives the group's utime
 * @st: receives the group's stime
 */
void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	struct task_cputime group;

	thread_group_cputime(p, &group);

	*ut = group.utime;
	*st = group.stime;
}

4056

#else

4056

#else

4057

4058

#ifndef nsecs_to_cputime

4058

#ifndef nsecs_to_cputime

4059

# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)

4059

# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)

4060

#endif

4060

#endif

4061

4062

/*
 * Report user and system time for a task, derived from the scheduler's
 * sum_exec_runtime instead of the raw tick counters.
 * @p: task to query
 * @ut: receives the adjusted user time
 * @st: receives the adjusted system time
 *
 * The runtime is split in the ratio utime : (utime + stime) of the tick
 * counters. Results are clamped against p->prev_utime / p->prev_stime so
 * that successive calls never report decreasing values.
 */
void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
	cputime_t tick_utime = p->utime;
	cputime_t tick_total = cputime_add(tick_utime, p->stime);
	cputime_t runtime;
	cputime_t scaled_utime;

	/*
	 * Use CFS's precise accounting:
	 */
	runtime = nsecs_to_cputime(p->se.sum_exec_runtime);

	if (tick_total) {
		/* NOTE(review): this u64 product is not checked for overflow. */
		u64 product = (u64)runtime * tick_utime;

		do_div(product, tick_total);
		scaled_utime = (cputime_t)product;
	} else {
		/* No ticks sampled yet: treat all runtime as user time. */
		scaled_utime = runtime;
	}

	/*
	 * Compare with previous values, to keep monotonicity:
	 */
	p->prev_utime = max(p->prev_utime, scaled_utime);
	p->prev_stime = max(p->prev_stime, cputime_sub(runtime, p->prev_utime));

	*ut = p->prev_utime;
	*st = p->prev_stime;
}

4089

4090

/*

4090

/*

4091

* Must be called with siglock held.

4091

* Must be called with siglock held.

4092

*/

4092

*/

4093

void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)

4093

void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)

4094

{

4094

{

4095

struct signal_struct *sig = p->signal;

4095

struct signal_struct *sig = p->signal;

4096

struct task_cputime cputime;

4096

struct task_cputime cputime;

4097

cputime_t rtime, utime, total;

4097

cputime_t rtime, utime, total;

4098

4099

thread_group_cputime(p, &cputime);

4099

thread_group_cputime(p, &cputime);

4100

4101

total = cputime_add(cputime.utime, cputime.stime);

4101

total = cputime_add(cputime.utime, cputime.stime);

4102

rtime = nsecs_to_cputime(cputime.sum_exec_runtime);

4102

rtime = nsecs_to_cputime(cputime.sum_exec_runtime);

4103

4104

if (total) {

4104

if (total) {

4105

u64 temp = rtime;

4105

u64 temp = rtime;

4106

4107

temp *= cputime.utime;

4107

temp *= cputime.utime;

4108

do_div(temp, total);

4108

do_div(temp, total);

4109

utime = (cputime_t)temp;

4109

utime = (cputime_t)temp;

4110

} else

4110

} else

4111

utime = rtime;

4111

utime = rtime;

4112

4113

sig->prev_utime = max(sig->prev_utime, utime);

4113

sig->prev_utime = max(sig->prev_utime, utime);

4114

sig->prev_stime = max(sig->prev_stime,

4114

sig->prev_stime = max(sig->prev_stime,

4115

cputime_sub(rtime, sig->prev_utime));

4115

cputime_sub(rtime, sig->prev_utime));

4116

4117

*ut = sig->prev_utime;

4117

*ut = sig->prev_utime;

4118

*st = sig->prev_stime;

4118

*st = sig->prev_stime;

4119

}

4119

}

4120

#endif

4120

#endif

4121

4122

/*

4122

/*

4123

* This function gets called by the timer code, with HZ frequency.

4123

* This function gets called by the timer code, with HZ frequency.

4124

* We call it with interrupts disabled.

4124

* We call it with interrupts disabled.

4125

*/

4125

*/

4126

void scheduler_tick(void)

4126

void scheduler_tick(void)

4127

{

4127

{

4128

int cpu = smp_processor_id();

4128

int cpu = smp_processor_id();

4129

struct rq *rq = cpu_rq(cpu);

4129

struct rq *rq = cpu_rq(cpu);

4130

struct task_struct *curr = rq->curr;

4130

struct task_struct *curr = rq->curr;

4131

4132

sched_clock_tick();

4132

sched_clock_tick();

4133

4134

raw_spin_lock(&rq->lock);

4134

raw_spin_lock(&rq->lock);

4135

update_rq_clock(rq);

4135

update_rq_clock(rq);

4136

update_cpu_load_active(rq);

4136

update_cpu_load_active(rq);

4137

curr->sched_class->task_tick(rq, curr, 0);

4137

curr->sched_class->task_tick(rq, curr, 0);

4138

raw_spin_unlock(&rq->lock);

4138

raw_spin_unlock(&rq->lock);

4139

4140

perf_event_task_tick();

4140

perf_event_task_tick();

4141

4142

#ifdef CONFIG_SMP

4142

#ifdef CONFIG_SMP

4143

rq->idle_at_tick = idle_cpu(cpu);

4143

rq->idle_at_tick = idle_cpu(cpu);

4144

trigger_load_balance(rq, cpu);

4144

trigger_load_balance(rq, cpu);

4145

#endif

4145

#endif

4146

}

4146

}

4147

4148

notrace unsigned long get_parent_ip(unsigned long addr)

4148

notrace unsigned long get_parent_ip(unsigned long addr)

4149

{

4149

{

4150

if (in_lock_functions(addr)) {

4150

if (in_lock_functions(addr)) {

4151

addr = CALLER_ADDR2;

4151

addr = CALLER_ADDR2;

4152

if (in_lock_functions(addr))

4152

if (in_lock_functions(addr))

4153

addr = CALLER_ADDR3;

4153

addr = CALLER_ADDR3;

4154

}

4154

}

4155

return addr;

4155

return addr;

4156

}

4156

}

4157

4158

#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \

4158

#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \

4159

defined(CONFIG_PREEMPT_TRACER))

4159

defined(CONFIG_PREEMPT_TRACER))

4160

4161

/*
 * Raise the preempt count by @val. With CONFIG_DEBUG_PREEMPT the count
 * is sanity-checked before and after the update. trace_preempt_off()
 * fires when the count equals @val after the addition.
 */
void __kprobes add_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/* A negative count means it already underflowed: warn and bail out. */
	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) {
		return;
	}
#endif

	preempt_count() += val;

#ifdef CONFIG_DEBUG_PREEMPT
	/* Warn while there is still some headroom left in the PREEMPT_MASK bits. */
	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
				PREEMPT_MASK - 10);
#endif

	if (preempt_count() == val) {
		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
	}
}
EXPORT_SYMBOL(add_preempt_count);

4182

4183

/*
 * Lower the preempt count by @val. With CONFIG_DEBUG_PREEMPT the
 * request is refused (with a warning) if it would underflow the count.
 * trace_preempt_on() fires when the count equals @val before the
 * subtraction.
 */
void __kprobes sub_preempt_count(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
	/* Removing more than is currently held would underflow the count. */
	if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) {
		return;
	}

	/* Is the spinlock portion underflowing? */
	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
			!(preempt_count() & PREEMPT_MASK))) {
		return;
	}
#endif

	/* The trace hook must run before the count is actually lowered. */
	if (preempt_count() == val) {
		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
	}
	preempt_count() -= val;
}
EXPORT_SYMBOL(sub_preempt_count);

4204

4205

#endif

4205

#endif

4206

4207

/*

4207

/*

4208

* Print scheduling while atomic bug:

4208

* Print scheduling while atomic bug:

4209

*/

4209

*/

4210

static noinline void __schedule_bug(struct task_struct *prev)

4210

static noinline void __schedule_bug(struct task_struct *prev)

4211

{

4211

{

4212

struct pt_regs *regs = get_irq_regs();

4212

struct pt_regs *regs = get_irq_regs();

4213

4214

printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",

4214

printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",

4215

prev->comm, prev->pid, preempt_count());

4215

prev->comm, prev->pid, preempt_count());

4216

4217

debug_show_held_locks(prev);

4217

debug_show_held_locks(prev);

4218

print_modules();

4218

print_modules();

4219

if (irqs_disabled())

4219

if (irqs_disabled())

4220

print_irqtrace_events(prev);

4220

print_irqtrace_events(prev);

4221

4222

if (regs)

4222

if (regs)

4223

show_regs(regs);

4223

show_regs(regs);

4224

else

4224

else

4225

dump_stack();

4225

dump_stack();

4226

}

4226

}

4227

4228

/*

4228

/*

4229

* Various schedule()-time debugging checks and statistics:

4229

* Various schedule()-time debugging checks and statistics:

4230

*/

4230

*/

4231

static inline void schedule_debug(struct task_struct *prev)

4231

static inline void schedule_debug(struct task_struct *prev)

4232

{

4232

{

4233

/*

4233

/*

4234

* Test if we are atomic. Since do_exit() needs to call into

4234

* Test if we are atomic. Since do_exit() needs to call into

4235

* schedule() atomically, we ignore that path for now.

4235

* schedule() atomically, we ignore that path for now.

4236

* Otherwise, whine if we are scheduling when we should not be.

4236

* Otherwise, whine if we are scheduling when we should not be.

4237

*/

4237

*/

4238

if (unlikely(in_atomic_preempt_off() && !prev->exit_state))

4238

if (unlikely(in_atomic_preempt_off() && !prev->exit_state))

4239

__schedule_bug(prev);

4239

__schedule_bug(prev);

4240

4241

profile_hit(SCHED_PROFILING, __builtin_return_address(0));

4241

profile_hit(SCHED_PROFILING, __builtin_return_address(0));

4242

4243

schedstat_inc(this_rq(), sched_count);

4243

schedstat_inc(this_rq(), sched_count);

4244

}

4244

}

4245

4246

static void put_prev_task(struct rq *rq, struct task_struct *prev)

4246

static void put_prev_task(struct rq *rq, struct task_struct *prev)

4247

{

4247

{

4248

if (prev->on_rq || rq->skip_clock_update < 0)

4248

if (prev->on_rq || rq->skip_clock_update < 0)

4249

update_rq_clock(rq);

4249

update_rq_clock(rq);

4250

prev->sched_class->put_prev_task(rq, prev);

4250

prev->sched_class->put_prev_task(rq, prev);

4251

}

4251

}

4252

4253

/*

4253

/*

4254

* Pick up the highest-prio task:

4254

* Pick up the highest-prio task:

4255

*/

4255

*/

4256

static inline struct task_struct *

4256

static inline struct task_struct *

4257

pick_next_task(struct rq *rq)

4257

pick_next_task(struct rq *rq)

4258

{

4258

{

4259

const struct sched_class *class;

4259

const struct sched_class *class;

4260

struct task_struct *p;

4260

struct task_struct *p;

4261

4262

/*

4262

/*

4263

* Optimization: we know that if all tasks are in

4263

* Optimization: we know that if all tasks are in

4264

* the fair class we can call that function directly:

4264

* the fair class we can call that function directly:

4265

*/

4265

*/

4266

if (likely(rq->nr_running == rq->cfs.nr_running)) {

4266

if (likely(rq->nr_running == rq->cfs.nr_running)) {

4267

p = fair_sched_class.pick_next_task(rq);

4267

p = fair_sched_class.pick_next_task(rq);

4268

if (likely(p))

4268

if (likely(p))

4269

return p;

4269

return p;

4270

}

4270

}

4271

4272

for_each_class(class) {

4272

for_each_class(class) {

4273

p = class->pick_next_task(rq);

4273

p = class->pick_next_task(rq);

4274

if (p)

4274

if (p)

4275

return p;

4275

return p;

4276

}

4276

}

4277

4278

BUG(); /* the idle class will always have a runnable task */

4278

BUG(); /* the idle class will always have a runnable task */

4279

}

4279

}

4280

4281

/*

4281

/*

4282

* __schedule() is the main scheduler function.

4282

* __schedule() is the main scheduler function.

4283

*/

4283

*/

4284

static void __sched __schedule(void)

4284

static void __sched __schedule(void)

4285

{

4285

{

4286

struct task_struct *prev, *next;

4286

struct task_struct *prev, *next;

4287

unsigned long *switch_count;

4287

unsigned long *switch_count;

4288

struct rq *rq;

4288

struct rq *rq;

4289

int cpu;

4289

int cpu;

4290

4291

need_resched:

4291

need_resched:

4292

preempt_disable();

4292

preempt_disable();

4293

cpu = smp_processor_id();

4293

cpu = smp_processor_id();

4294

rq = cpu_rq(cpu);

4294

rq = cpu_rq(cpu);

4295

rcu_note_context_switch(cpu);

4295

rcu_note_context_switch(cpu);

4296

prev = rq->curr;

4296

prev = rq->curr;

4297

4298

schedule_debug(prev);

4298

schedule_debug(prev);

4299

4300

if (sched_feat(HRTICK))

4300

if (sched_feat(HRTICK))

4301

hrtick_clear(rq);

4301

hrtick_clear(rq);

4302

4303

raw_spin_lock_irq(&rq->lock);

4303

raw_spin_lock_irq(&rq->lock);

4304

4305

switch_count = &prev->nivcsw;

4305

switch_count = &prev->nivcsw;

4306

if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {

4306

if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {

4307

if (unlikely(signal_pending_state(prev->state, prev))) {

4307

if (unlikely(signal_pending_state(prev->state, prev))) {

4308

prev->state = TASK_RUNNING;

4308

prev->state = TASK_RUNNING;

4309

} else {

4309

} else {

4310

deactivate_task(rq, prev, DEQUEUE_SLEEP);

4310

deactivate_task(rq, prev, DEQUEUE_SLEEP);

4311

prev->on_rq = 0;

4311

prev->on_rq = 0;

4312

4313

/*

4313

/*

4314

* If a worker went to sleep, notify and ask workqueue

4314

* If a worker went to sleep, notify and ask workqueue

4315

* whether it wants to wake up a task to maintain

4315

* whether it wants to wake up a task to maintain

4316

* concurrency.

4316

* concurrency.

4317

*/

4317

*/

4318

if (prev->flags & PF_WQ_WORKER) {

4318

if (prev->flags & PF_WQ_WORKER) {

4319

struct task_struct *to_wakeup;

4319

struct task_struct *to_wakeup;

4320

4321

to_wakeup = wq_worker_sleeping(prev, cpu);

4321

to_wakeup = wq_worker_sleeping(prev, cpu);

4322

if (to_wakeup)

4322

if (to_wakeup)

4323

try_to_wake_up_local(to_wakeup);

4323

try_to_wake_up_local(to_wakeup);

4324

}

4324

}

4325

}

4325

}

4326

switch_count = &prev->nvcsw;

4326

switch_count = &prev->nvcsw;

4327

}

4327

}

4328

4329

pre_schedule(rq, prev);

4329

pre_schedule(rq, prev);

4330

4331

if (unlikely(!rq->nr_running))

4331

if (unlikely(!rq->nr_running))

4332

idle_balance(cpu, rq);

4332

idle_balance(cpu, rq);

4333

4334

put_prev_task(rq, prev);

4334

put_prev_task(rq, prev);

4335

next = pick_next_task(rq);

4335

next = pick_next_task(rq);

4336

clear_tsk_need_resched(prev);

4336

clear_tsk_need_resched(prev);

4337

rq->skip_clock_update = 0;

4337

rq->skip_clock_update = 0;

4338

4339

if (likely(prev != next)) {

4339

if (likely(prev != next)) {

4340

rq->nr_switches++;

4340

rq->nr_switches++;

4341

rq->curr = next;

4341

rq->curr = next;

4342

++*switch_count;

4342

++*switch_count;

4343

4344

context_switch(rq, prev, next); /* unlocks the rq */

4344

context_switch(rq, prev, next); /* unlocks the rq */

4345

/*

4345

/*

4346

* The context switch have flipped the stack from under us

4346

* The context switch have flipped the stack from under us

4347

* and restored the local variables which were saved when

4347

* and restored the local variables which were saved when

4348

* this task called schedule() in the past. prev == current

4348

* this task called schedule() in the past. prev == current

4349

* is still correct, but it can be moved to another cpu/rq.

4349

* is still correct, but it can be moved to another cpu/rq.

4350

*/

4350

*/

4351

cpu = smp_processor_id();

4351

cpu = smp_processor_id();

4352

rq = cpu_rq(cpu);

4352

rq = cpu_rq(cpu);

4353

} else

4353

} else

4354

raw_spin_unlock_irq(&rq->lock);

4354

raw_spin_unlock_irq(&rq->lock);

4355

4356

post_schedule(rq);

4356

post_schedule(rq);

4357

4358

preempt_enable_no_resched();

4358

preempt_enable_no_resched();

4359

if (need_resched())

4359

if (need_resched())

4360

goto need_resched;

4360

goto need_resched;

4361

}

4361

}

4362

4363

static inline void sched_submit_work(struct task_struct *tsk)

4363

static inline void sched_submit_work(struct task_struct *tsk)

4364

{

4364

{

4365

if (!tsk->state)

4365

if (!tsk->state)

4366

return;

4366

return;

4367

/*

4367

/*

4368

* If we are going to sleep and we have plugged IO queued,

4368

* If we are going to sleep and we have plugged IO queued,

4369

* make sure to submit it to avoid deadlocks.

4369

* make sure to submit it to avoid deadlocks.

4370

*/

4370

*/

4371

if (blk_needs_flush_plug(tsk))

4371

if (blk_needs_flush_plug(tsk))

4372

blk_schedule_flush_plug(tsk);

4372

blk_schedule_flush_plug(tsk);

4373

}

4373

}

4374

4375

asmlinkage void schedule(void)

4375

asmlinkage void __sched schedule(void)

4376

{

4376

{

4377

struct task_struct *tsk = current;

4377

struct task_struct *tsk = current;

4378

4379

sched_submit_work(tsk);

4379

sched_submit_work(tsk);

4380

__schedule();

4380

__schedule();

4381

}

4381

}

4382

EXPORT_SYMBOL(schedule);

4382

EXPORT_SYMBOL(schedule);

4383

4384

#ifdef CONFIG_MUTEX_SPIN_ON_OWNER

4384

#ifdef CONFIG_MUTEX_SPIN_ON_OWNER

4385

4386

static inline bool owner_running(struct mutex *lock, struct task_struct *owner)

4386

static inline bool owner_running(struct mutex *lock, struct task_struct *owner)

4387

{

4387

{

4388

if (lock->owner != owner)

4388

if (lock->owner != owner)

4389

return false;

4389

return false;

4390

4391

/*

4391

/*

4392

* Ensure we emit the owner->on_cpu, dereference _after_ checking

4392

* Ensure we emit the owner->on_cpu, dereference _after_ checking

4393

* lock->owner still matches owner, if that fails, owner might

4393

* lock->owner still matches owner, if that fails, owner might

4394

* point to free()d memory, if it still matches, the rcu_read_lock()

4394

* point to free()d memory, if it still matches, the rcu_read_lock()

4395

* ensures the memory stays valid.

4395

* ensures the memory stays valid.

4396

*/

4396

*/

4397

barrier();

4397

barrier();

4398

4399

return owner->on_cpu;

4399

return owner->on_cpu;

4400

}

4400

}

4401

4402

/*

4402

/*

4403

* Look out! "owner" is an entirely speculative pointer

4403

* Look out! "owner" is an entirely speculative pointer

4404

* access and not reliable.

4404

* access and not reliable.

4405

*/

4405

*/

4406

int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)

4406

int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)

4407

{

4407

{

4408

if (!sched_feat(OWNER_SPIN))

4408

if (!sched_feat(OWNER_SPIN))

4409

return 0;

4409

return 0;

4410

4411

rcu_read_lock();

4411

rcu_read_lock();

4412

while (owner_running(lock, owner)) {

4412

while (owner_running(lock, owner)) {

4413

if (need_resched())

4413

if (need_resched())

4414

break;

4414

break;

4415

4416

arch_mutex_cpu_relax();

4416

arch_mutex_cpu_relax();

4417

}

4417

}

4418

rcu_read_unlock();

4418

rcu_read_unlock();

4419

4420

/*

4420

/*

4421

* We break out the loop above on need_resched() and when the

4421

* We break out the loop above on need_resched() and when the

4422

* owner changed, which is a sign for heavy contention. Return

4422

* owner changed, which is a sign for heavy contention. Return

4423

* success only when lock->owner is NULL.

4423

* success only when lock->owner is NULL.

4424

*/

4424

*/

4425

return lock->owner == NULL;

4425

return lock->owner == NULL;

4426

}

4426

}

4427

#endif

4427

#endif

4428

4429

#ifdef CONFIG_PREEMPT

4429

#ifdef CONFIG_PREEMPT

4430

/*

4430

/*

4431

* this is the entry point to schedule() from in-kernel preemption

4431

* this is the entry point to schedule() from in-kernel preemption

4432

* off of preempt_enable. Kernel preemptions off return from interrupt

4432

* off of preempt_enable. Kernel preemptions off return from interrupt

4433

* occur there and call schedule directly.

4433

* occur there and call schedule directly.

4434

*/

4434

*/

4435

asmlinkage void __sched notrace preempt_schedule(void)

4435

asmlinkage void __sched notrace preempt_schedule(void)

4436

{

4436

{

4437

struct thread_info *ti = current_thread_info();

4437

struct thread_info *ti = current_thread_info();

4438

4439

/*

4439

/*

4440

* If there is a non-zero preempt_count or interrupts are disabled,

4440

* If there is a non-zero preempt_count or interrupts are disabled,

4441

* we do not want to preempt the current task. Just return..

4441

* we do not want to preempt the current task. Just return..

4442

*/

4442

*/

4443

if (likely(ti->preempt_count || irqs_disabled()))

4443

if (likely(ti->preempt_count || irqs_disabled()))

4444

return;

4444

return;

4445

4446

do {

4446

do {

4447

add_preempt_count_notrace(PREEMPT_ACTIVE);

4447

add_preempt_count_notrace(PREEMPT_ACTIVE);

4448

__schedule();

4448

__schedule();

4449

sub_preempt_count_notrace(PREEMPT_ACTIVE);

4449

sub_preempt_count_notrace(PREEMPT_ACTIVE);

4450

4451

/*

4451

/*

4452

* Check again in case we missed a preemption opportunity

4452

* Check again in case we missed a preemption opportunity

4453

* between schedule and now.

4453

* between schedule and now.

4454

*/

4454

*/

4455

barrier();

4455

barrier();

4456

} while (need_resched());

4456

} while (need_resched());

4457

}

4457

}

4458

EXPORT_SYMBOL(preempt_schedule);

4458

EXPORT_SYMBOL(preempt_schedule);

4459

4460

/*

4460

/*

4461

* this is the entry point to schedule() from kernel preemption

4461

* this is the entry point to schedule() from kernel preemption

4462

* off of irq context.

4462

* off of irq context.

4463

* Note, that this is called and return with irqs disabled. This will

4463

* Note, that this is called and return with irqs disabled. This will

4464

* protect us against recursive calling from irq.

4464

* protect us against recursive calling from irq.

4465

*/

4465

*/

4466

asmlinkage void __sched preempt_schedule_irq(void)

4466

asmlinkage void __sched preempt_schedule_irq(void)

4467

{

4467

{

4468

struct thread_info *ti = current_thread_info();

4468

struct thread_info *ti = current_thread_info();

4469

4470

/* Catch callers which need to be fixed */

4470

/* Catch callers which need to be fixed */

4471

BUG_ON(ti->preempt_count || !irqs_disabled());

4471

BUG_ON(ti->preempt_count || !irqs_disabled());

4472

4473

do {

4473

do {

4474

add_preempt_count(PREEMPT_ACTIVE);

4474

add_preempt_count(PREEMPT_ACTIVE);

4475

local_irq_enable();

4475

local_irq_enable();

4476

__schedule();

4476

__schedule();

4477

local_irq_disable();

4477

local_irq_disable();

4478

sub_preempt_count(PREEMPT_ACTIVE);

4478

sub_preempt_count(PREEMPT_ACTIVE);

4479

4480

/*

4480

/*

4481

* Check again in case we missed a preemption opportunity

4481

* Check again in case we missed a preemption opportunity

4482

* between schedule and now.

4482

* between schedule and now.

4483

*/

4483

*/

4484

barrier();

4484

barrier();

4485

} while (need_resched());

4485

} while (need_resched());

4486

}

4486

}

4487

4488

#endif /* CONFIG_PREEMPT */

4488

#endif /* CONFIG_PREEMPT */

4489

4490

int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,

4490

int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,

4491

void *key)

4491

void *key)

4492

{

4492

{

4493

return try_to_wake_up(curr->private, mode, wake_flags);

4493

return try_to_wake_up(curr->private, mode, wake_flags);

4494

}

4494

}

4495

EXPORT_SYMBOL(default_wake_function);

4495

EXPORT_SYMBOL(default_wake_function);

4496

4497

/*

4497

/*

4498

* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just

4498

* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just

4499

* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve

4499

* wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve

4500

* number) then we wake all the non-exclusive tasks and one exclusive task.

4500

* number) then we wake all the non-exclusive tasks and one exclusive task.

4501

*

4501

*

4502

* There are circumstances in which we can try to wake a task which has already

4502

* There are circumstances in which we can try to wake a task which has already

4503

* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns

4503

* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns

4504

* zero in this (rare) case, and we handle it by continuing to scan the queue.

4504

* zero in this (rare) case, and we handle it by continuing to scan the queue.

4505

*/

4505

*/

4506

static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,

4506

static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,

4507

int nr_exclusive, int wake_flags, void *key)

4507

int nr_exclusive, int wake_flags, void *key)

4508

{

4508

{

4509

wait_queue_t *curr, *next;

4509

wait_queue_t *curr, *next;

4510

4511

list_for_each_entry_safe(curr, next, &q->task_list, task_list) {

4511

list_for_each_entry_safe(curr, next, &q->task_list, task_list) {

4512

unsigned flags = curr->flags;

4512

unsigned flags = curr->flags;

4513

4514

if (curr->func(curr, mode, wake_flags, key) &&

4514

if (curr->func(curr, mode, wake_flags, key) &&

4515

(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)

4515

(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)

4516

break;

4516

break;

4517

}

4517

}

4518

}

4518

}

4519

4520

/**

4520

/**

4521

* __wake_up - wake up threads blocked on a waitqueue.

4521

* __wake_up - wake up threads blocked on a waitqueue.

4522

* @q: the waitqueue

4522

* @q: the waitqueue

4523

* @mode: which threads

4523

* @mode: which threads

4524

* @nr_exclusive: how many wake-one or wake-many threads to wake up

4524

* @nr_exclusive: how many wake-one or wake-many threads to wake up

4525

* @key: is directly passed to the wakeup function

4525

* @key: is directly passed to the wakeup function

4526

*

4526

*

4527

* It may be assumed that this function implies a write memory barrier before

4527

* It may be assumed that this function implies a write memory barrier before

4528

* changing the task state if and only if any tasks are woken up.

4528

* changing the task state if and only if any tasks are woken up.

4529

*/

4529

*/

4530

void __wake_up(wait_queue_head_t *q, unsigned int mode,

4530

void __wake_up(wait_queue_head_t *q, unsigned int mode,

4531

int nr_exclusive, void *key)

4531

int nr_exclusive, void *key)

4532

{

4532

{

4533

unsigned long flags;

4533

unsigned long flags;

4534

4535

spin_lock_irqsave(&q->lock, flags);

4535

spin_lock_irqsave(&q->lock, flags);

4536

__wake_up_common(q, mode, nr_exclusive, 0, key);

4536

__wake_up_common(q, mode, nr_exclusive, 0, key);

4537

spin_unlock_irqrestore(&q->lock, flags);

4537

spin_unlock_irqrestore(&q->lock, flags);

4538

}

4538

}

4539

EXPORT_SYMBOL(__wake_up);

4539

EXPORT_SYMBOL(__wake_up);

4540

4541

/*

4541

/*

4542

* Same as __wake_up but called with the spinlock in wait_queue_head_t held.

4542

* Same as __wake_up but called with the spinlock in wait_queue_head_t held.

4543

*/

4543

*/

4544

void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)

4544

void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)

4545

{

4545

{

4546

__wake_up_common(q, mode, 1, 0, NULL);

4546

__wake_up_common(q, mode, 1, 0, NULL);

4547

}

4547

}

4548

EXPORT_SYMBOL_GPL(__wake_up_locked);

4548

EXPORT_SYMBOL_GPL(__wake_up_locked);

4549

4550

void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)

4550

void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)

4551

{

4551

{

4552

__wake_up_common(q, mode, 1, 0, key);

4552

__wake_up_common(q, mode, 1, 0, key);

4553

}

4553

}

4554

EXPORT_SYMBOL_GPL(__wake_up_locked_key);

4554

EXPORT_SYMBOL_GPL(__wake_up_locked_key);

4555

4556

/**

4556

/**

4557

* __wake_up_sync_key - wake up threads blocked on a waitqueue.

4557

* __wake_up_sync_key - wake up threads blocked on a waitqueue.

4558

* @q: the waitqueue

4558

* @q: the waitqueue

4559

* @mode: which threads

4559

* @mode: which threads

4560

* @nr_exclusive: how many wake-one or wake-many threads to wake up

4560

* @nr_exclusive: how many wake-one or wake-many threads to wake up

4561

* @key: opaque value to be passed to wakeup targets

4561

* @key: opaque value to be passed to wakeup targets

4562

*

4562

*

4563

* The sync wakeup differs that the waker knows that it will schedule

4563

* The sync wakeup differs that the waker knows that it will schedule

4564

* away soon, so while the target thread will be woken up, it will not

4564

* away soon, so while the target thread will be woken up, it will not

4565

* be migrated to another CPU - ie. the two threads are 'synchronized'

4565

* be migrated to another CPU - ie. the two threads are 'synchronized'

4566

* with each other. This can prevent needless bouncing between CPUs.

4566

* with each other. This can prevent needless bouncing between CPUs.

4567

*

4567

*

4568

* On UP it can prevent extra preemption.

4568

* On UP it can prevent extra preemption.

4569

*

4569

*

4570

* It may be assumed that this function implies a write memory barrier before

4570

* It may be assumed that this function implies a write memory barrier before

4571

* changing the task state if and only if any tasks are woken up.

4571

* changing the task state if and only if any tasks are woken up.

4572

*/

4572

*/

4573

void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,

4573

void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,

4574

int nr_exclusive, void *key)

4574

int nr_exclusive, void *key)

4575

{

4575

{

4576

unsigned long flags;

4576

unsigned long flags;

4577

int wake_flags = WF_SYNC;

4577

int wake_flags = WF_SYNC;

4578

4579

if (unlikely(!q))

4579

if (unlikely(!q))

4580

return;

4580

return;

4581

4582

if (unlikely(!nr_exclusive))

4582

if (unlikely(!nr_exclusive))

4583

wake_flags = 0;

4583

wake_flags = 0;

4584

4585

spin_lock_irqsave(&q->lock, flags);

4585

spin_lock_irqsave(&q->lock, flags);

4586

__wake_up_common(q, mode, nr_exclusive, wake_flags, key);

4586

__wake_up_common(q, mode, nr_exclusive, wake_flags, key);

4587

spin_unlock_irqrestore(&q->lock, flags);

4587

spin_unlock_irqrestore(&q->lock, flags);

4588

}

4588

}

4589

EXPORT_SYMBOL_GPL(__wake_up_sync_key);

4589

EXPORT_SYMBOL_GPL(__wake_up_sync_key);

4590

4591

/*

4591

/*

4592

* __wake_up_sync - see __wake_up_sync_key()

4592

* __wake_up_sync - see __wake_up_sync_key()

4593

*/

4593

*/

4594

void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)

4594

void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)

4595

{

4595

{

4596

__wake_up_sync_key(q, mode, nr_exclusive, NULL);

4596

__wake_up_sync_key(q, mode, nr_exclusive, NULL);

4597

}

4597

}

4598

EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */

4598

EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */

4599

4600

/**

4600

/**

4601

* complete: - signals a single thread waiting on this completion

4601

* complete: - signals a single thread waiting on this completion

4602

* @x: holds the state of this particular completion

4602

* @x: holds the state of this particular completion

4603

*

4603

*

4604

* This will wake up a single thread waiting on this completion. Threads will be

4604

* This will wake up a single thread waiting on this completion. Threads will be

4605

* awakened in the same order in which they were queued.

4605

* awakened in the same order in which they were queued.

4606

*

4606

*

4607

* See also complete_all(), wait_for_completion() and related routines.

4607

* See also complete_all(), wait_for_completion() and related routines.

4608

*

4608

*

4609

* It may be assumed that this function implies a write memory barrier before

4609

* It may be assumed that this function implies a write memory barrier before

4610

* changing the task state if and only if any tasks are woken up.

4610

* changing the task state if and only if any tasks are woken up.

4611

*/

4611

*/

4612

void complete(struct completion *x)

4612

void complete(struct completion *x)

4613

{

4613

{

4614

unsigned long flags;

4614

unsigned long flags;

4615

4616

spin_lock_irqsave(&x->wait.lock, flags);

4616

spin_lock_irqsave(&x->wait.lock, flags);

4617

x->done++;

4617

x->done++;

4618

__wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);

4618

__wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);

4619

spin_unlock_irqrestore(&x->wait.lock, flags);

4619

spin_unlock_irqrestore(&x->wait.lock, flags);

4620

}

4620

}

4621

EXPORT_SYMBOL(complete);

4621

EXPORT_SYMBOL(complete);

4622

4623

/**

4623

/**

4624

* complete_all: - signals all threads waiting on this completion

4624

* complete_all: - signals all threads waiting on this completion

4625

* @x: holds the state of this particular completion

4625

* @x: holds the state of this particular completion

4626

*

4626

*

4627

* This will wake up all threads waiting on this particular completion event.

4627

* This will wake up all threads waiting on this particular completion event.

4628

*

4628

*

4629

* It may be assumed that this function implies a write memory barrier before

4629

* It may be assumed that this function implies a write memory barrier before

4630

* changing the task state if and only if any tasks are woken up.

4630

* changing the task state if and only if any tasks are woken up.

4631

*/

4631

*/

4632

void complete_all(struct completion *x)

4632

void complete_all(struct completion *x)

4633

{

4633

{

4634

unsigned long flags;

4634

unsigned long flags;

4635

4636

spin_lock_irqsave(&x->wait.lock, flags);

4636

spin_lock_irqsave(&x->wait.lock, flags);

4637

x->done += UINT_MAX/2;

4637

x->done += UINT_MAX/2;

4638

__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);

4638

__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);

4639

spin_unlock_irqrestore(&x->wait.lock, flags);

4639

spin_unlock_irqrestore(&x->wait.lock, flags);

4640

}

4640

}

4641

EXPORT_SYMBOL(complete_all);

4641

EXPORT_SYMBOL(complete_all);

4642

4643

static inline long __sched

4643

static inline long __sched

4644

do_wait_for_common(struct completion *x, long timeout, int state)

4644

do_wait_for_common(struct completion *x, long timeout, int state)

4645

{

4645

{

4646

if (!x->done) {

4646

if (!x->done) {

4647

DECLARE_WAITQUEUE(wait, current);

4647

DECLARE_WAITQUEUE(wait, current);

4648

4649

__add_wait_queue_tail_exclusive(&x->wait, &wait);

4649

__add_wait_queue_tail_exclusive(&x->wait, &wait);

4650

do {

4650

do {

4651

if (signal_pending_state(state, current)) {

4651

if (signal_pending_state(state, current)) {

4652

timeout = -ERESTARTSYS;

4652

timeout = -ERESTARTSYS;

4653

break;

4653

break;

4654

}

4654

}

4655

__set_current_state(state);

4655

__set_current_state(state);

4656

spin_unlock_irq(&x->wait.lock);

4656

spin_unlock_irq(&x->wait.lock);

4657

timeout = schedule_timeout(timeout);

4657

timeout = schedule_timeout(timeout);

4658

spin_lock_irq(&x->wait.lock);

4658

spin_lock_irq(&x->wait.lock);

4659

} while (!x->done && timeout);

4659

} while (!x->done && timeout);

4660

__remove_wait_queue(&x->wait, &wait);

4660

__remove_wait_queue(&x->wait, &wait);

4661

if (!x->done)

4661

if (!x->done)

4662

return timeout;

4662

return timeout;

4663

}

4663

}

4664

x->done--;

4664

x->done--;

4665

return timeout ?: 1;

4665

return timeout ?: 1;

4666

}

4666

}

4667

4668

static long __sched

4668

static long __sched

4669

wait_for_common(struct completion *x, long timeout, int state)

4669

wait_for_common(struct completion *x, long timeout, int state)

4670

{

4670

{

4671

might_sleep();

4671

might_sleep();

4672

4673

spin_lock_irq(&x->wait.lock);

4673

spin_lock_irq(&x->wait.lock);

4674

timeout = do_wait_for_common(x, timeout, state);

4674

timeout = do_wait_for_common(x, timeout, state);

4675

spin_unlock_irq(&x->wait.lock);

4675

spin_unlock_irq(&x->wait.lock);

4676

return timeout;

4676

return timeout;

4677

}

4677

}

4678

4679

/**

4679

/**

4680

* wait_for_completion: - waits for completion of a task

4680

* wait_for_completion: - waits for completion of a task

4681

* @x: holds the state of this particular completion

4681

* @x: holds the state of this particular completion

4682

*

4682

*

4683

* This waits to be signaled for completion of a specific task. It is NOT

4683

* This waits to be signaled for completion of a specific task. It is NOT

4684

* interruptible and there is no timeout.

4684

* interruptible and there is no timeout.

4685

*

4685

*

4686

* See also similar routines (i.e. wait_for_completion_timeout()) with timeout

4686

* See also similar routines (i.e. wait_for_completion_timeout()) with timeout

4687

* and interrupt capability. Also see complete().

4687

* and interrupt capability. Also see complete().

4688

*/

4688

*/

4689

void __sched wait_for_completion(struct completion *x)

4689

void __sched wait_for_completion(struct completion *x)

4690

{

4690

{

4691

wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);

4691

wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);

4692

}

4692

}

4693

EXPORT_SYMBOL(wait_for_completion);

4693

EXPORT_SYMBOL(wait_for_completion);

4694

4695

/**

4695

/**

4696

* wait_for_completion_timeout: - waits for completion of a task (w/timeout)

4696

* wait_for_completion_timeout: - waits for completion of a task (w/timeout)

4697

* @x: holds the state of this particular completion

4697

* @x: holds the state of this particular completion

4698

* @timeout: timeout value in jiffies

4698

* @timeout: timeout value in jiffies

4699

*

4699

*

4700

* This waits for either a completion of a specific task to be signaled or for a

4700

* This waits for either a completion of a specific task to be signaled or for a

4701

* specified timeout to expire. The timeout is in jiffies. It is not

4701

* specified timeout to expire. The timeout is in jiffies. It is not

4702

* interruptible.

4702

* interruptible.

4703

*/

4703

*/

4704

unsigned long __sched

4704

unsigned long __sched

4705

wait_for_completion_timeout(struct completion *x, unsigned long timeout)

4705

wait_for_completion_timeout(struct completion *x, unsigned long timeout)

4706

{

4706

{

4707

return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);

4707

return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);

4708

}

4708

}

4709

EXPORT_SYMBOL(wait_for_completion_timeout);

4709

EXPORT_SYMBOL(wait_for_completion_timeout);

4710

4711

/**

4711

/**

4712

* wait_for_completion_interruptible: - waits for completion of a task (w/intr)

4712

* wait_for_completion_interruptible: - waits for completion of a task (w/intr)

4713

* @x: holds the state of this particular completion

4713

* @x: holds the state of this particular completion

4714

*

4714

*

4715

* This waits for completion of a specific task to be signaled. It is

4715

* This waits for completion of a specific task to be signaled. It is

4716

* interruptible.

4716

* interruptible.

4717

*/

4717

*/

4718

int __sched wait_for_completion_interruptible(struct completion *x)

4718

int __sched wait_for_completion_interruptible(struct completion *x)

4719

{

4719

{

4720

long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);

4720

long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);

4721

if (t == -ERESTARTSYS)

4721

if (t == -ERESTARTSYS)

4722

return t;

4722

return t;

4723

return 0;

4723

return 0;

4724

}

4724

}

4725

EXPORT_SYMBOL(wait_for_completion_interruptible);

4725

EXPORT_SYMBOL(wait_for_completion_interruptible);

4726

4727

/**

4727

/**

4728

* wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))

4728

* wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))

4729

* @x: holds the state of this particular completion

4729

* @x: holds the state of this particular completion

4730

* @timeout: timeout value in jiffies

4730

* @timeout: timeout value in jiffies

4731

*

4731

*

4732

* This waits for either a completion of a specific task to be signaled or for a

4732

* This waits for either a completion of a specific task to be signaled or for a

4733

* specified timeout to expire. It is interruptible. The timeout is in jiffies.

4733

* specified timeout to expire. It is interruptible. The timeout is in jiffies.

4734

*/

4734

*/

4735

long __sched

4735

long __sched

4736

wait_for_completion_interruptible_timeout(struct completion *x,

4736

wait_for_completion_interruptible_timeout(struct completion *x,

4737

unsigned long timeout)

4737

unsigned long timeout)

4738

{

4738

{

4739

return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);

4739

return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);

4740

}

4740

}

4741

EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);

4741

EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);

4742

4743

/**

4743

/**

4744

* wait_for_completion_killable: - waits for completion of a task (killable)

4744

* wait_for_completion_killable: - waits for completion of a task (killable)

4745

* @x: holds the state of this particular completion

4745

* @x: holds the state of this particular completion

4746

*

4746

*

4747

* This waits to be signaled for completion of a specific task. It can be

4747

* This waits to be signaled for completion of a specific task. It can be

4748

* interrupted by a kill signal.

4748

* interrupted by a kill signal.

4749

*/

4749

*/

4750

int __sched wait_for_completion_killable(struct completion *x)

4750

int __sched wait_for_completion_killable(struct completion *x)

4751

{

4751

{

4752

long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);

4752

long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);

4753

if (t == -ERESTARTSYS)

4753

if (t == -ERESTARTSYS)

4754

return t;

4754

return t;

4755

return 0;

4755

return 0;

4756

}

4756

}

4757

EXPORT_SYMBOL(wait_for_completion_killable);

4757

EXPORT_SYMBOL(wait_for_completion_killable);

4758

4759

/**

4759

/**

4760

* wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))

4760

* wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))

4761

* @x: holds the state of this particular completion

4761

* @x: holds the state of this particular completion

4762

* @timeout: timeout value in jiffies

4762

* @timeout: timeout value in jiffies

4763

*

4763

*

4764

* This waits for either a completion of a specific task to be

4764

* This waits for either a completion of a specific task to be

4765

* signaled or for a specified timeout to expire. It can be

4765

* signaled or for a specified timeout to expire. It can be

4766

* interrupted by a kill signal. The timeout is in jiffies.

4766

* interrupted by a kill signal. The timeout is in jiffies.

4767

*/

4767

*/

4768

long __sched

4768

long __sched

4769

wait_for_completion_killable_timeout(struct completion *x,

4769

wait_for_completion_killable_timeout(struct completion *x,

4770

unsigned long timeout)

4770

unsigned long timeout)

4771

{

4771

{

4772

return wait_for_common(x, timeout, TASK_KILLABLE);

4772

return wait_for_common(x, timeout, TASK_KILLABLE);

4773

}

4773

}

4774

EXPORT_SYMBOL(wait_for_completion_killable_timeout);

4774

EXPORT_SYMBOL(wait_for_completion_killable_timeout);

4775

4776

/**

4776

/**

4777

* try_wait_for_completion - try to decrement a completion without blocking

4777

* try_wait_for_completion - try to decrement a completion without blocking

4778

* @x: completion structure

4778

* @x: completion structure

4779

*

4779

*

4780

* Returns: 0 if a decrement cannot be done without blocking

4780

* Returns: 0 if a decrement cannot be done without blocking

4781

* 1 if a decrement succeeded.

4781

* 1 if a decrement succeeded.

4782

*

4782

*

4783

* If a completion is being used as a counting completion,

4783

* If a completion is being used as a counting completion,

4784

* attempt to decrement the counter without blocking. This

4784

* attempt to decrement the counter without blocking. This

4785

* enables us to avoid waiting if the resource the completion

4785

* enables us to avoid waiting if the resource the completion

4786

* is protecting is not available.

4786

* is protecting is not available.

4787

*/

4787

*/

4788

bool try_wait_for_completion(struct completion *x)

4788

bool try_wait_for_completion(struct completion *x)

4789

{

4789

{

4790

unsigned long flags;

4790

unsigned long flags;

4791

int ret = 1;

4791

int ret = 1;

4792

4793

spin_lock_irqsave(&x->wait.lock, flags);

4793

spin_lock_irqsave(&x->wait.lock, flags);

4794

if (!x->done)

4794

if (!x->done)

4795

ret = 0;

4795

ret = 0;

4796

else

4796

else

4797

x->done--;

4797

x->done--;

4798

spin_unlock_irqrestore(&x->wait.lock, flags);

4798

spin_unlock_irqrestore(&x->wait.lock, flags);

4799

return ret;

4799

return ret;

4800

}

4800

}

4801

EXPORT_SYMBOL(try_wait_for_completion);

4801

EXPORT_SYMBOL(try_wait_for_completion);

4802

4803

/**

4803

/**

4804

* completion_done - Test to see if a completion has any waiters

4804

* completion_done - Test to see if a completion has any waiters

4805

* @x: completion structure

4805

* @x: completion structure

4806

*

4806

*

4807

* Returns: 0 if there are waiters (wait_for_completion() in progress)

4807

* Returns: 0 if there are waiters (wait_for_completion() in progress)

4808

* 1 if there are no waiters.

4808

* 1 if there are no waiters.

4809

*

4809

*

4810

*/

4810

*/

4811

bool completion_done(struct completion *x)

4811

bool completion_done(struct completion *x)

4812

{

4812

{

4813

unsigned long flags;

4813

unsigned long flags;

4814

int ret = 1;

4814

int ret = 1;

4815

4816

spin_lock_irqsave(&x->wait.lock, flags);

4816

spin_lock_irqsave(&x->wait.lock, flags);

4817

if (!x->done)

4817

if (!x->done)

4818

ret = 0;

4818

ret = 0;

4819

spin_unlock_irqrestore(&x->wait.lock, flags);

4819

spin_unlock_irqrestore(&x->wait.lock, flags);

4820

return ret;

4820

return ret;

4821

}

4821

}

4822

EXPORT_SYMBOL(completion_done);

4822

EXPORT_SYMBOL(completion_done);

4823

4824

static long __sched

4824

static long __sched

4825

sleep_on_common(wait_queue_head_t *q, int state, long timeout)

4825

sleep_on_common(wait_queue_head_t *q, int state, long timeout)

4826

{

4826

{

4827

unsigned long flags;

4827

unsigned long flags;

4828

wait_queue_t wait;

4828

wait_queue_t wait;

4829

4830

init_waitqueue_entry(&wait, current);

4830

init_waitqueue_entry(&wait, current);

4831

4832

__set_current_state(state);

4832

__set_current_state(state);

4833

4834

spin_lock_irqsave(&q->lock, flags);

4834

spin_lock_irqsave(&q->lock, flags);

4835

__add_wait_queue(q, &wait);

4835

__add_wait_queue(q, &wait);

4836

spin_unlock(&q->lock);

4836

spin_unlock(&q->lock);

4837

timeout = schedule_timeout(timeout);

4837

timeout = schedule_timeout(timeout);

4838

spin_lock_irq(&q->lock);

4838

spin_lock_irq(&q->lock);

4839

__remove_wait_queue(q, &wait);

4839

__remove_wait_queue(q, &wait);

4840

spin_unlock_irqrestore(&q->lock, flags);

4840

spin_unlock_irqrestore(&q->lock, flags);

4841

4842

return timeout;

4842

return timeout;

4843

}

4843

}

4844

4845

/* Sleep on @q in TASK_INTERRUPTIBLE state with MAX_SCHEDULE_TIMEOUT. */
void __sched interruptible_sleep_on(wait_queue_head_t *q)
{
	/* The remaining-time result is deliberately not reported. */
	(void)sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(interruptible_sleep_on);

4850

4851

long __sched

4851

long __sched

4852

interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)

4852

interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)

4853

{

4853

{

4854

return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);

4854

return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);

4855

}

4855

}

4856

EXPORT_SYMBOL(interruptible_sleep_on_timeout);

4856

EXPORT_SYMBOL(interruptible_sleep_on_timeout);

4857

4858

/* Sleep on @q in TASK_UNINTERRUPTIBLE state with MAX_SCHEDULE_TIMEOUT. */
void __sched sleep_on(wait_queue_head_t *q)
{
	/* The remaining-time result is deliberately not reported. */
	(void)sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(sleep_on);

4863

4864

/*
 * Sleep on @q in TASK_UNINTERRUPTIBLE state for at most @timeout and hand
 * back the value produced by sleep_on_common().
 */
long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
	long remaining = sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);

	return remaining;
}
EXPORT_SYMBOL(sleep_on_timeout);

4869

4870

#ifdef CONFIG_RT_MUTEXES

/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task
 * @prio: prio value (kernel-internal form)
 *
 * Changes the 'effective' priority (p->prio) and, depending on whether
 * @prio is an RT priority, the scheduling class of @p.  Unlike
 * __setscheduler() it does not touch ->normal_prio.
 *
 * Used by the rt_mutex code to implement priority inheritance logic.
 */
void rt_mutex_setprio(struct task_struct *p, int prio)
{
	const struct sched_class *prev_class;
	struct rq *rq;
	int prev_prio, queued, is_curr;

	BUG_ON(prio < 0 || prio > MAX_PRIO);

	rq = __task_rq_lock(p);

	trace_sched_pi_setprio(p, prio);
	prev_prio = p->prio;
	prev_class = p->sched_class;
	queued = p->on_rq;
	is_curr = task_current(rq, p);

	/* Take the task off its old class before changing class and prio. */
	if (queued)
		dequeue_task(rq, p, 0);
	if (is_curr)
		p->sched_class->put_prev_task(rq, p);

	p->sched_class = rt_prio(prio) ? &rt_sched_class : &fair_sched_class;

	p->prio = prio;

	/* Put it back using the (possibly new) class. */
	if (is_curr)
		p->sched_class->set_curr_task(rq);
	if (queued) {
		/* ENQUEUE_HEAD is used when the numeric prio value went up. */
		int enq_flags = prev_prio < prio ? ENQUEUE_HEAD : 0;

		enqueue_task(rq, p, enq_flags);
	}

	check_class_changed(rq, p, prev_class, prev_prio);
	__task_rq_unlock(rq);
}

#endif

4919

4920

void set_user_nice(struct task_struct *p, long nice)

4920

void set_user_nice(struct task_struct *p, long nice)

4921

{

4921

{

4922

int old_prio, delta, on_rq;

4922

int old_prio, delta, on_rq;

4923

unsigned long flags;

4923

unsigned long flags;

4924

struct rq *rq;

4924

struct rq *rq;

4925

4926

if (TASK_NICE(p) == nice || nice < -20 || nice > 19)

4926

if (TASK_NICE(p) == nice || nice < -20 || nice > 19)

4927

return;

4927

return;

4928

/*

4928

/*

4929

* We have to be careful, if called from sys_setpriority(),

4929

* We have to be careful, if called from sys_setpriority(),

4930

* the task might be in the middle of scheduling on another CPU.

4930

* the task might be in the middle of scheduling on another CPU.

4931

*/

4931

*/

4932

rq = task_rq_lock(p, &flags);

4932

rq = task_rq_lock(p, &flags);

4933

/*

4933

/*

4934

* The RT priorities are set via sched_setscheduler(), but we still

4934

* The RT priorities are set via sched_setscheduler(), but we still

4935

* allow the 'normal' nice value to be set - but as expected

4935

* allow the 'normal' nice value to be set - but as expected

4936

* it wont have any effect on scheduling until the task is

4936

* it wont have any effect on scheduling until the task is

4937

* SCHED_FIFO/SCHED_RR:

4937

* SCHED_FIFO/SCHED_RR:

4938

*/

4938

*/

4939

if (task_has_rt_policy(p)) {

4939

if (task_has_rt_policy(p)) {

4940

p->static_prio = NICE_TO_PRIO(nice);

4940

p->static_prio = NICE_TO_PRIO(nice);

4941

goto out_unlock;

4941

goto out_unlock;

4942

}

4942

}

4943

on_rq = p->on_rq;

4943

on_rq = p->on_rq;

4944

if (on_rq)

4944

if (on_rq)

4945

dequeue_task(rq, p, 0);

4945

dequeue_task(rq, p, 0);

4946

4947

p->static_prio = NICE_TO_PRIO(nice);

4947

p->static_prio = NICE_TO_PRIO(nice);

4948

set_load_weight(p);

4948

set_load_weight(p);

4949

old_prio = p->prio;

4949

old_prio = p->prio;

4950

p->prio = effective_prio(p);

4950

p->prio = effective_prio(p);

4951

delta = p->prio - old_prio;

4951

delta = p->prio - old_prio;

4952

4953

if (on_rq) {

4953

if (on_rq) {

4954

enqueue_task(rq, p, 0);

4954

enqueue_task(rq, p, 0);

4955

/*

4955

/*

4956

* If the task increased its priority or is running and

4956

* If the task increased its priority or is running and

4957

* lowered its priority, then reschedule its CPU:

4957

* lowered its priority, then reschedule its CPU:

4958

*/

4958

*/

4959

if (delta < 0 || (delta > 0 && task_running(rq, p)))

4959

if (delta < 0 || (delta > 0 && task_running(rq, p)))

4960

resched_task(rq->curr);

4960

resched_task(rq->curr);

4961

}

4961

}

4962

out_unlock:

4962

out_unlock:

4963

task_rq_unlock(rq, p, &flags);

4963

task_rq_unlock(rq, p, &flags);

4964

}

4964

}

4965

EXPORT_SYMBOL(set_user_nice);

4965

EXPORT_SYMBOL(set_user_nice);

4966

4967

/*

4967

/*

4968

* can_nice - check if a task can reduce its nice value

4968

* can_nice - check if a task can reduce its nice value

4969

* @p: task

4969

* @p: task

4970

* @nice: nice value

4970

* @nice: nice value

4971

*/

4971

*/

4972

int can_nice(const struct task_struct *p, const int nice)

4972

int can_nice(const struct task_struct *p, const int nice)

4973

{

4973

{

4974

/* convert nice value [19,-20] to rlimit style value [1,40] */

4974

/* convert nice value [19,-20] to rlimit style value [1,40] */

4975

int nice_rlim = 20 - nice;

4975

int nice_rlim = 20 - nice;

4976

4977

return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||

4977

return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||

4978

capable(CAP_SYS_NICE));

4978

capable(CAP_SYS_NICE));

4979

}

4979

}

4980

4981

#ifdef __ARCH_WANT_SYS_NICE

/*
 * sys_nice - change the priority of the current process.
 * @increment: priority increment
 *
 * sys_setpriority is a more generic, but much slower function that
 * does similar things.
 *
 * Returns 0 on success, -EPERM when a negative @increment is not permitted
 * by can_nice(), or the error returned by security_task_setnice().
 */
SYSCALL_DEFINE1(nice, int, increment)
{
	long target, err;

	/*
	 * Setpriority might change our priority at the same moment.
	 * We don't have to worry. Conceptually one call occurs first
	 * and we have a single winner.
	 */
	if (increment < -40)
		increment = -40;
	else if (increment > 40)
		increment = 40;

	/* Clamp the resulting nice value to -20..19. */
	target = TASK_NICE(current) + increment;
	if (target < -20)
		target = -20;
	else if (target > 19)
		target = 19;

	if (increment < 0 && !can_nice(current, target))
		return -EPERM;

	err = security_task_setnice(current, target);
	if (err)
		return err;

	set_user_nice(current, target);
	return 0;
}

#endif

5022

5023

/**

5023

/**

5024

* task_prio - return the priority value of a given task.

5024

* task_prio - return the priority value of a given task.

5025

* @p: the task in question.

5025

* @p: the task in question.

5026

*

5026

*

5027

* This is the priority value as seen by users in /proc.

5027

* This is the priority value as seen by users in /proc.

5028

* RT tasks are offset by -200. Normal tasks are centered

5028

* RT tasks are offset by -200. Normal tasks are centered

5029

* around 0, value goes from -16 to +15.

5029

* around 0, value goes from -16 to +15.

5030

*/

5030

*/

5031

int task_prio(const struct task_struct *p)

5031

int task_prio(const struct task_struct *p)

5032

{

5032

{

5033

return p->prio - MAX_RT_PRIO;

5033

return p->prio - MAX_RT_PRIO;

5034

}

5034

}

5035

5036

/**
 * task_nice - return the nice value of a given task.
 * @p: the task in question.
 *
 * Function form of the TASK_NICE() macro.
 */
int task_nice(const struct task_struct *p)
{
	int nice = TASK_NICE(p);

	return nice;
}
EXPORT_SYMBOL(task_nice);

5045

5046

/**

5046

/**

5047

* idle_cpu - is a given cpu idle currently?

5047

* idle_cpu - is a given cpu idle currently?

5048

* @cpu: the processor in question.

5048

* @cpu: the processor in question.

5049

*/

5049

*/

5050

int idle_cpu(int cpu)

5050

int idle_cpu(int cpu)

5051

{

5051

{

5052

return cpu_curr(cpu) == cpu_rq(cpu)->idle;

5052

return cpu_curr(cpu) == cpu_rq(cpu)->idle;

5053

}

5053

}

5054

5055

/**

5055

/**

5056

* idle_task - return the idle task for a given cpu.

5056

* idle_task - return the idle task for a given cpu.

5057

* @cpu: the processor in question.

5057

* @cpu: the processor in question.

5058

*/

5058

*/

5059

struct task_struct *idle_task(int cpu)

5059

struct task_struct *idle_task(int cpu)

5060

{

5060

{

5061

return cpu_rq(cpu)->idle;

5061

return cpu_rq(cpu)->idle;

5062

}

5062

}

5063

5064

/**

5064

/**

5065

* find_process_by_pid - find a process with a matching PID value.

5065

* find_process_by_pid - find a process with a matching PID value.

5066

* @pid: the pid in question.

5066

* @pid: the pid in question.

5067

*/

5067

*/

5068

static struct task_struct *find_process_by_pid(pid_t pid)

5068

static struct task_struct *find_process_by_pid(pid_t pid)

5069

{

5069

{

5070

return pid ? find_task_by_vpid(pid) : current;

5070

return pid ? find_task_by_vpid(pid) : current;

5071

}

5071

}

5072

5073

/* Actually do priority change: must hold rq lock. */

5073

/* Actually do priority change: must hold rq lock. */

5074

static void

5074

static void

5075

__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)

5075

__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)

5076

{

5076

{

5077

p->policy = policy;

5077

p->policy = policy;

5078

p->rt_priority = prio;

5078

p->rt_priority = prio;

5079

p->normal_prio = normal_prio(p);

5079

p->normal_prio = normal_prio(p);

5080

/* we are holding p->pi_lock already */

5080

/* we are holding p->pi_lock already */

5081

p->prio = rt_mutex_getprio(p);

5081

p->prio = rt_mutex_getprio(p);

5082

if (rt_prio(p->prio))

5082

if (rt_prio(p->prio))

5083

p->sched_class = &rt_sched_class;

5083

p->sched_class = &rt_sched_class;

5084

else

5084

else

5085

p->sched_class = &fair_sched_class;

5085

p->sched_class = &fair_sched_class;

5086

set_load_weight(p);

5086

set_load_weight(p);

5087

}

5087

}

5088

5089

/*

5089

/*

5090

* check the target process has a UID that matches the current process's

5090

* check the target process has a UID that matches the current process's

5091

*/

5091

*/

5092

static bool check_same_owner(struct task_struct *p)

5092

static bool check_same_owner(struct task_struct *p)

5093

{

5093

{

5094

const struct cred *cred = current_cred(), *pcred;

5094

const struct cred *cred = current_cred(), *pcred;

5095

bool match;

5095

bool match;

5096

5097

rcu_read_lock();

5097

rcu_read_lock();

5098

pcred = __task_cred(p);

5098

pcred = __task_cred(p);

5099

if (cred->user->user_ns == pcred->user->user_ns)

5099

if (cred->user->user_ns == pcred->user->user_ns)

5100

match = (cred->euid == pcred->euid ||

5100

match = (cred->euid == pcred->euid ||

5101

cred->euid == pcred->uid);

5101

cred->euid == pcred->uid);

5102

else

5102

else

5103

match = false;

5103

match = false;

5104

rcu_read_unlock();

5104

rcu_read_unlock();

5105

return match;

5105

return match;

5106

}

5106

}

5107

5108

/*
 * __sched_setscheduler - validate and apply a policy / RT priority change.
 * @p:      task to modify
 * @policy: new policy, optionally OR'ed with SCHED_RESET_ON_FORK; a negative
 *          value means "keep @p's current policy"
 * @param:  carries the requested sched_priority
 * @user:   when true, the capability/rlimit/ownership checks and the
 *          security hook below are applied
 *
 * Returns 0 on success (including the case where nothing needs to change),
 * -EINVAL for an invalid policy or priority or when @p is its runqueue's
 * stop task, -EPERM when a permission check fails, or the error returned by
 * security_task_setscheduler().
 */
static int __sched_setscheduler(struct task_struct *p, int policy,
				const struct sched_param *param, bool user)
{
	int err, prev_prio, oldpolicy = -1, queued, is_curr;
	unsigned long flags;
	const struct sched_class *prev_class;
	struct rq *rq;
	int reset_on_fork;

	/* may grab non-irq protected spin_locks */
	BUG_ON(in_interrupt());
recheck:
	/* double check policy once rq lock held */
	if (policy < 0) {
		reset_on_fork = p->sched_reset_on_fork;
		policy = oldpolicy = p->policy;
	} else {
		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
		policy &= ~SCHED_RESET_ON_FORK;

		switch (policy) {
		case SCHED_FIFO:
		case SCHED_RR:
		case SCHED_NORMAL:
		case SCHED_BATCH:
		case SCHED_IDLE:
			break;
		default:
			return -EINVAL;
		}
	}

	/*
	 * Valid priorities for SCHED_FIFO and SCHED_RR are
	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
	 * SCHED_BATCH and SCHED_IDLE is 0.
	 */
	if (param->sched_priority < 0)
		return -EINVAL;
	if (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1)
		return -EINVAL;
	if (!p->mm && param->sched_priority > MAX_RT_PRIO-1)
		return -EINVAL;
	/* An RT policy needs a non-zero priority, and vice versa. */
	if (rt_policy(policy) != (param->sched_priority != 0))
		return -EINVAL;

	/*
	 * Allow unprivileged RT tasks to decrease priority:
	 */
	if (user && !capable(CAP_SYS_NICE)) {
		if (rt_policy(policy)) {
			unsigned long rlim_rtprio =
					task_rlimit(p, RLIMIT_RTPRIO);

			/* can't set/change the rt policy */
			if (policy != p->policy && !rlim_rtprio)
				return -EPERM;

			/* can't increase priority */
			if (param->sched_priority > p->rt_priority &&
			    param->sched_priority > rlim_rtprio)
				return -EPERM;
		}

		/*
		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
		 */
		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
			if (!can_nice(p, TASK_NICE(p)))
				return -EPERM;
		}

		/* can't change other user's priorities */
		if (!check_same_owner(p))
			return -EPERM;

		/* Normal users shall not reset the sched_reset_on_fork flag */
		if (p->sched_reset_on_fork && !reset_on_fork)
			return -EPERM;
	}

	if (user) {
		err = security_task_setscheduler(p);
		if (err)
			return err;
	}

	/*
	 * make sure no PI-waiters arrive (or leave) while we are
	 * changing the priority of the task:
	 *
	 * To be able to change p->policy safely, the appropriate
	 * runqueue lock must be held.
	 */
	rq = task_rq_lock(p, &flags);

	/*
	 * Changing the policy of the stop threads is a very bad idea
	 */
	if (p == rq->stop) {
		task_rq_unlock(rq, p, &flags);
		return -EINVAL;
	}

	/*
	 * If not changing anything there's no need to proceed further:
	 */
	if (unlikely(policy == p->policy && (!rt_policy(policy) ||
			param->sched_priority == p->rt_priority))) {

		__task_rq_unlock(rq);
		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
		return 0;
	}

#ifdef CONFIG_RT_GROUP_SCHED
	if (user) {
		/*
		 * Do not allow realtime tasks into groups that have no runtime
		 * assigned.
		 */
		if (rt_bandwidth_enabled() && rt_policy(policy) &&
				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
				!task_group_is_autogroup(task_group(p))) {
			task_rq_unlock(rq, p, &flags);
			return -EPERM;
		}
	}
#endif

	/* recheck policy now with rq lock held */
	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
		/* The policy changed while unlocked: start over from it. */
		policy = oldpolicy = -1;
		task_rq_unlock(rq, p, &flags);
		goto recheck;
	}

	queued = p->on_rq;
	is_curr = task_current(rq, p);
	if (queued)
		deactivate_task(rq, p, 0);
	if (is_curr)
		p->sched_class->put_prev_task(rq, p);

	p->sched_reset_on_fork = reset_on_fork;

	prev_prio = p->prio;
	prev_class = p->sched_class;
	__setscheduler(rq, p, policy, param->sched_priority);

	if (is_curr)
		p->sched_class->set_curr_task(rq);
	if (queued)
		activate_task(rq, p, 0);

	check_class_changed(rq, p, prev_class, prev_prio);
	task_rq_unlock(rq, p, &flags);

	rt_mutex_adjust_pi(p);

	return 0;
}

5262

5263

/**

5263

/**

5264

* sched_setscheduler - change the scheduling policy and/or RT priority of a thread.

5264

* sched_setscheduler - change the scheduling policy and/or RT priority of a thread.

5265

* @p: the task in question.

5265

* @p: the task in question.

5266

* @policy: new policy.

5266

* @policy: new policy.

5267

* @param: structure containing the new RT priority.

5267

* @param: structure containing the new RT priority.

5268

*

5268

*

5269

* NOTE that the task may be already dead.

5269

* NOTE that the task may be already dead.

5270

*/

5270

*/

5271

int sched_setscheduler(struct task_struct *p, int policy,

5271

int sched_setscheduler(struct task_struct *p, int policy,

5272

const struct sched_param *param)

5272

const struct sched_param *param)

5273

{

5273

{

5274

return __sched_setscheduler(p, policy, param, true);

5274

return __sched_setscheduler(p, policy, param, true);

5275

}

5275

}

5276

EXPORT_SYMBOL_GPL(sched_setscheduler);

5276

EXPORT_SYMBOL_GPL(sched_setscheduler);

5277

5278

/**

5278

/**

5279

* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.

5279

* sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.

5280

* @p: the task in question.

5280

* @p: the task in question.

5281

* @policy: new policy.

5281

* @policy: new policy.

5282

* @param: structure containing the new RT priority.

5282

* @param: structure containing the new RT priority.

5283

*

5283

*

5284

* Just like sched_setscheduler, only don't bother checking if the

5284

* Just like sched_setscheduler, only don't bother checking if the

5285

* current context has permission. For example, this is needed in

5285

* current context has permission. For example, this is needed in

5286

* stop_machine(): we create temporary high priority worker threads,

5286

* stop_machine(): we create temporary high priority worker threads,

5287

* but our caller might not have that capability.

5287

* but our caller might not have that capability.

5288

*/

5288

*/

5289

int sched_setscheduler_nocheck(struct task_struct *p, int policy,

5289

int sched_setscheduler_nocheck(struct task_struct *p, int policy,

5290

const struct sched_param *param)

5290

const struct sched_param *param)

5291

{

5291

{

5292

return __sched_setscheduler(p, policy, param, false);

5292

return __sched_setscheduler(p, policy, param, false);

5293

}

5293

}

5294

5295

static int

5295

static int

5296

do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)

5296

do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)

5297

{

5297

{

5298

struct sched_param lparam;

5298

struct sched_param lparam;

5299

struct task_struct *p;

5299

struct task_struct *p;

5300

int retval;

5300

int retval;

5301

5302

if (!param || pid < 0)

5302

if (!param || pid < 0)

5303

return -EINVAL;

5303

return -EINVAL;

5304

if (copy_from_user(&lparam, param, sizeof(struct sched_param)))

5304

if (copy_from_user(&lparam, param, sizeof(struct sched_param)))

5305

return -EFAULT;

5305

return -EFAULT;

5306

5307

rcu_read_lock();

5307

rcu_read_lock();

5308

retval = -ESRCH;

5308

retval = -ESRCH;

5309

p = find_process_by_pid(pid);

5309

p = find_process_by_pid(pid);

5310

if (p != NULL)

5310

if (p != NULL)

5311

retval = sched_setscheduler(p, policy, &lparam);

5311

retval = sched_setscheduler(p, policy, &lparam);

5312

rcu_read_unlock();

5312

rcu_read_unlock();

5313

5314

return retval;

5314

return retval;

5315

}

5315

}

5316

5317

/**

5317

/**

5318

* sys_sched_setscheduler - set/change the scheduler policy and RT priority

5318

* sys_sched_setscheduler - set/change the scheduler policy and RT priority

5319

* @pid: the pid in question.

5319

* @pid: the pid in question.

5320

* @policy: new policy.

5320

* @policy: new policy.

5321

* @param: structure containing the new RT priority.

5321

* @param: structure containing the new RT priority.

5322

*/

5322

*/

5323

SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,

5323

SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,

5324

struct sched_param __user *, param)

5324

struct sched_param __user *, param)

5325

{

5325

{

5326

/* negative values for policy are not valid */

5326

/* negative values for policy are not valid */

5327

if (policy < 0)

5327

if (policy < 0)

5328

return -EINVAL;

5328

return -EINVAL;

5329

5330

return do_sched_setscheduler(pid, policy, param);

5330

return do_sched_setscheduler(pid, policy, param);

5331

}

5331

}

5332

5333

/**

5333

/**

5334

* sys_sched_setparam - set/change the RT priority of a thread

5334

* sys_sched_setparam - set/change the RT priority of a thread

5335

* @pid: the pid in question.

5335

* @pid: the pid in question.

5336

* @param: structure containing the new RT priority.

5336

* @param: structure containing the new RT priority.

5337

*/

5337

*/

5338

SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)

5338

SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)

5339

{

5339

{

5340

return do_sched_setscheduler(pid, -1, param);

5340

return do_sched_setscheduler(pid, -1, param);

5341

}

5341

}

5342

5343

/**

5343

/**

5344

* sys_sched_getscheduler - get the policy (scheduling class) of a thread

5344

* sys_sched_getscheduler - get the policy (scheduling class) of a thread

5345

* @pid: the pid in question.

5345

* @pid: the pid in question.

5346

*/

5346

*/

5347

SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)

5347

SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)

5348

{

5348

{

5349

struct task_struct *p;

5349

struct task_struct *p;

5350

int retval;

5350

int retval;

5351

5352

if (pid < 0)

5352

if (pid < 0)

5353

return -EINVAL;

5353

return -EINVAL;

5354

5355

retval = -ESRCH;

5355

retval = -ESRCH;

5356

rcu_read_lock();

5356

rcu_read_lock();

5357

p = find_process_by_pid(pid);

5357

p = find_process_by_pid(pid);

5358

if (p) {

5358

if (p) {

5359

retval = security_task_getscheduler(p);

5359

retval = security_task_getscheduler(p);

5360

if (!retval)

5360

if (!retval)

5361

retval = p->policy

5361

retval = p->policy

5362

| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);

5362

| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);

5363

}

5363

}

5364

rcu_read_unlock();

5364

rcu_read_unlock();

5365

return retval;

5365

return retval;

5366

}

5366

}

5367

5368

/**

5368

/**

5369

* sys_sched_getparam - get the RT priority of a thread

5369

* sys_sched_getparam - get the RT priority of a thread

5370

* @pid: the pid in question.

5370

* @pid: the pid in question.

5371

* @param: structure containing the RT priority.

5371

* @param: structure containing the RT priority.

5372

*/

5372

*/

5373

SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)

5373

SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)

5374

{

5374

{

5375

struct sched_param lp;

5375

struct sched_param lp;

5376

struct task_struct *p;

5376

struct task_struct *p;

5377

int retval;

5377

int retval;

5378

5379

if (!param || pid < 0)

5379

if (!param || pid < 0)

5380

return -EINVAL;

5380

return -EINVAL;

5381

5382

rcu_read_lock();

5382

rcu_read_lock();

5383

p = find_process_by_pid(pid);

5383

p = find_process_by_pid(pid);

5384

retval = -ESRCH;

5384

retval = -ESRCH;

5385

if (!p)

5385

if (!p)

5386

goto out_unlock;

5386

goto out_unlock;

5387

5388

retval = security_task_getscheduler(p);

5388

retval = security_task_getscheduler(p);

5389

if (retval)

5389

if (retval)

5390

goto out_unlock;

5390

goto out_unlock;

5391

5392

lp.sched_priority = p->rt_priority;

5392

lp.sched_priority = p->rt_priority;

5393

rcu_read_unlock();

5393

rcu_read_unlock();

5394

5395

/*

5395

/*

5396

* This one might sleep, we cannot do it with a spinlock held ...

5396

* This one might sleep, we cannot do it with a spinlock held ...

5397

*/

5397

*/

5398

retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;

5398

retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;

5399

5400

return retval;

5400

return retval;

5401

5402

out_unlock:

5402

out_unlock:

5403

rcu_read_unlock();

5403

rcu_read_unlock();

5404

return retval;

5404

return retval;

5405

}

5405

}

5406

5407

/*
 * sched_setaffinity - restrict the task identified by @pid to the CPUs in
 * @in_mask that are also permitted by its cpuset.
 *
 * Returns 0 on success, -ESRCH when no task is found, -ENOMEM when a
 * temporary mask cannot be allocated, -EPERM when the caller neither
 * passes check_same_owner() nor has CAP_SYS_NICE for the task, or the
 * error from security_task_setscheduler() / set_cpus_allowed_ptr().
 */
long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
	cpumask_var_t cpuset_mask, wanted_mask;
	struct task_struct *task;
	int err;

	get_online_cpus();

	/* Look the task up under RCU and pin it so it cannot go away. */
	rcu_read_lock();
	task = find_process_by_pid(pid);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();

	if (!task) {
		put_online_cpus();
		return -ESRCH;
	}

	err = -ENOMEM;
	if (!alloc_cpumask_var(&cpuset_mask, GFP_KERNEL))
		goto put_task;
	if (!alloc_cpumask_var(&wanted_mask, GFP_KERNEL))
		goto free_cpuset_mask;

	err = -EPERM;
	if (!check_same_owner(task) && !task_ns_capable(task, CAP_SYS_NICE))
		goto free_wanted_mask;

	err = security_task_setscheduler(task);
	if (err)
		goto free_wanted_mask;

	cpuset_cpus_allowed(task, cpuset_mask);
	cpumask_and(wanted_mask, in_mask, cpuset_mask);

	for (;;) {
		err = set_cpus_allowed_ptr(task, wanted_mask);
		if (err)
			break;

		/*
		 * Re-read the cpuset's mask: if what we just applied is no
		 * longer a subset of it, we must have raced with a
		 * concurrent cpuset update. Fall back to the cpuset's
		 * cpus_allowed and apply again.
		 */
		cpuset_cpus_allowed(task, cpuset_mask);
		if (cpumask_subset(wanted_mask, cpuset_mask))
			break;

		cpumask_copy(wanted_mask, cpuset_mask);
	}

free_wanted_mask:
	free_cpumask_var(wanted_mask);
free_cpuset_mask:
	free_cpumask_var(cpuset_mask);
put_task:
	put_task_struct(task);
	put_online_cpus();
	return err;
}

5469

5470

/*
 * Copy a cpu mask of @len bytes from user space into @new_mask.
 *
 * A user buffer shorter than cpumask_size() leaves the remainder of
 * @new_mask cleared; a longer one is truncated to cpumask_size() bytes.
 * Returns 0 on success or -EFAULT when the copy fails.
 */
static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
			     struct cpumask *new_mask)
{
	const size_t kernel_len = cpumask_size();

	if (len > kernel_len)
		len = kernel_len;
	else if (len < kernel_len)
		cpumask_clear(new_mask);

	if (copy_from_user(new_mask, user_mask_ptr, len))
		return -EFAULT;

	return 0;
}

5480

5481

/**

5481

/**

5482

* sys_sched_setaffinity - set the cpu affinity of a process

5482

* sys_sched_setaffinity - set the cpu affinity of a process

5483

* @pid: pid of the process

5483

* @pid: pid of the process

5484

* @len: length in bytes of the bitmask pointed to by user_mask_ptr

5484

* @len: length in bytes of the bitmask pointed to by user_mask_ptr

5485

* @user_mask_ptr: user-space pointer to the new cpu mask

5485

* @user_mask_ptr: user-space pointer to the new cpu mask

5486

*/

5486

*/

5487

SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,

5487

SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,

5488

unsigned long __user *, user_mask_ptr)

5488

unsigned long __user *, user_mask_ptr)

5489

{

5489

{

5490

cpumask_var_t new_mask;

5490

cpumask_var_t new_mask;

5491

int retval;

5491

int retval;

5492

5493

if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))

5493

if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))

5494

return -ENOMEM;

5494

return -ENOMEM;

5495

5496

retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);

5496

retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);

5497

if (retval == 0)

5497

if (retval == 0)

5498

retval = sched_setaffinity(pid, new_mask);

5498

retval = sched_setaffinity(pid, new_mask);

5499

free_cpumask_var(new_mask);

5499

free_cpumask_var(new_mask);

5500

return retval;

5500

return retval;

5501

}

5501

}

5502

5503

/*
 * sched_getaffinity - store in @mask the CPUs that are both in the
 * cpus_allowed mask of the task identified by @pid and currently online.
 *
 * Returns 0 on success, -ESRCH when no task is found, or the error from
 * security_task_getscheduler(). @mask is only written on success.
 */
long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
	struct task_struct *task;
	unsigned long irq_flags;
	int err = -ESRCH;

	get_online_cpus();
	rcu_read_lock();

	task = find_process_by_pid(pid);
	if (task)
		err = security_task_getscheduler(task);

	if (!err) {
		/* cpus_allowed is read with the task's pi_lock held. */
		raw_spin_lock_irqsave(&task->pi_lock, irq_flags);
		cpumask_and(mask, &task->cpus_allowed, cpu_online_mask);
		raw_spin_unlock_irqrestore(&task->pi_lock, irq_flags);
	}

	rcu_read_unlock();
	put_online_cpus();

	return err;
}

5531

5532

/**

5532

/**

5533

* sys_sched_getaffinity - get the cpu affinity of a process

5533

* sys_sched_getaffinity - get the cpu affinity of a process

5534

* @pid: pid of the process

5534

* @pid: pid of the process

5535

* @len: length in bytes of the bitmask pointed to by user_mask_ptr

5535

* @len: length in bytes of the bitmask pointed to by user_mask_ptr

5536

* @user_mask_ptr: user-space pointer to hold the current cpu mask

5536

* @user_mask_ptr: user-space pointer to hold the current cpu mask

5537

*/

5537

*/

5538

SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,

5538

SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,

5539

unsigned long __user *, user_mask_ptr)

5539

unsigned long __user *, user_mask_ptr)

5540

{

5540

{

5541

int ret;

5541

int ret;

5542

cpumask_var_t mask;

5542

cpumask_var_t mask;

5543

5544

if ((len * BITS_PER_BYTE) < nr_cpu_ids)

5544

if ((len * BITS_PER_BYTE) < nr_cpu_ids)

5545

return -EINVAL;

5545

return -EINVAL;

5546

if (len & (sizeof(unsigned long)-1))

5546

if (len & (sizeof(unsigned long)-1))

5547

return -EINVAL;

5547

return -EINVAL;

5548

5549

if (!alloc_cpumask_var(&mask, GFP_KERNEL))

5549

if (!alloc_cpumask_var(&mask, GFP_KERNEL))

5550

return -ENOMEM;

5550

return -ENOMEM;

5551

5552

ret = sched_getaffinity(pid, mask);

5552

ret = sched_getaffinity(pid, mask);

5553

if (ret == 0) {

5553

if (ret == 0) {

5554

size_t retlen = min_t(size_t, len, cpumask_size());

5554

size_t retlen = min_t(size_t, len, cpumask_size());

5555

5556

if (copy_to_user(user_mask_ptr, mask, retlen))

5556

if (copy_to_user(user_mask_ptr, mask, retlen))

5557

ret = -EFAULT;

5557

ret = -EFAULT;

5558

else

5558

else

5559

ret = retlen;

5559

ret = retlen;

5560

}

5560

}

5561

free_cpumask_var(mask);

5561

free_cpumask_var(mask);

5562

5563

return ret;

5563

return ret;

5564

}

5564

}

5565

5566

/**

5566

/**

5567

* sys_sched_yield - yield the current processor to other threads.

5567

* sys_sched_yield - yield the current processor to other threads.

5568

*

5568

*

5569

* This function yields the current CPU to other tasks. If there are no

5569

* This function yields the current CPU to other tasks. If there are no

5570

* other threads running on this CPU then this function will return.

5570

* other threads running on this CPU then this function will return.

5571

*/

5571

*/

5572

SYSCALL_DEFINE0(sched_yield)

5572

SYSCALL_DEFINE0(sched_yield)

5573

{

5573

{

5574

struct rq *rq = this_rq_lock();

5574

struct rq *rq = this_rq_lock();

5575

5576

schedstat_inc(rq, yld_count);

5576

schedstat_inc(rq, yld_count);

5577

current->sched_class->yield_task(rq);

5577

current->sched_class->yield_task(rq);

5578

5579

/*

5579

/*

5580

* Since we are going to call schedule() anyway, there's

5580

* Since we are going to call schedule() anyway, there's

5581

* no need to preempt or enable interrupts:

5581

* no need to preempt or enable interrupts:

5582

*/

5582

*/

5583

__release(rq->lock);

5583

__release(rq->lock);

5584

spin_release(&rq->lock.dep_map, 1, _THIS_IP_);

5584

spin_release(&rq->lock.dep_map, 1, _THIS_IP_);

5585

do_raw_spin_unlock(&rq->lock);

5585

do_raw_spin_unlock(&rq->lock);

5586

preempt_enable_no_resched();

5586

preempt_enable_no_resched();

5587

5588

schedule();

5588

schedule();

5589

5590

return 0;

5590

return 0;

5591

}

5591

}

5592

5593

static inline int should_resched(void)

5593

static inline int should_resched(void)

5594

{

5594

{

5595

return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);

5595

return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);

5596

}

5596

}

5597

5598

static void __cond_resched(void)

5598

static void __cond_resched(void)

5599

{

5599

{

5600

add_preempt_count(PREEMPT_ACTIVE);

5600

add_preempt_count(PREEMPT_ACTIVE);

5601

__schedule();

5601

__schedule();

5602

sub_preempt_count(PREEMPT_ACTIVE);

5602

sub_preempt_count(PREEMPT_ACTIVE);

5603

}

5603

}

5604

5605

/*
 * Reschedule if should_resched() says so.
 * Returns 1 when __cond_resched() was called, 0 otherwise.
 */
int __sched _cond_resched(void)
{
	if (!should_resched())
		return 0;

	__cond_resched();
	return 1;
}
EXPORT_SYMBOL(_cond_resched);

5614

5615

/*

5615

/*

5616

* __cond_resched_lock() - if a reschedule is pending, drop the given lock,

5616

* __cond_resched_lock() - if a reschedule is pending, drop the given lock,

5617

* call schedule, and on return reacquire the lock.

5617

* call schedule, and on return reacquire the lock.

5618

*

5618

*

5619

* This works OK both with and without CONFIG_PREEMPT. We do strange low-level

5619

* This works OK both with and without CONFIG_PREEMPT. We do strange low-level

5620

* operations here to prevent schedule() from being called twice (once via

5620

* operations here to prevent schedule() from being called twice (once via

5621

* spin_unlock(), once by hand).

5621

* spin_unlock(), once by hand).

5622

*/

5622

*/

5623

int __cond_resched_lock(spinlock_t *lock)

5623

int __cond_resched_lock(spinlock_t *lock)

5624

{

5624

{

5625

int resched = should_resched();

5625

int resched = should_resched();

5626

int ret = 0;

5626

int ret = 0;

5627

5628

lockdep_assert_held(lock);

5628

lockdep_assert_held(lock);

5629

5630

if (spin_needbreak(lock) || resched) {

5630

if (spin_needbreak(lock) || resched) {

5631

spin_unlock(lock);

5631

spin_unlock(lock);

5632

if (resched)

5632

if (resched)

5633

__cond_resched();

5633

__cond_resched();

5634

else

5634

else

5635

cpu_relax();

5635

cpu_relax();

5636

ret = 1;

5636

ret = 1;

5637

spin_lock(lock);

5637

spin_lock(lock);

5638

}

5638

}

5639

return ret;

5639

return ret;

5640

}

5640

}

5641

EXPORT_SYMBOL(__cond_resched_lock);

5641

EXPORT_SYMBOL(__cond_resched_lock);

5642

5643

/*
 * Variant of _cond_resched() for callers that are in softirq context
 * (anything else trips the BUG_ON). Bottom halves are enabled around the
 * reschedule and disabled again before returning.
 *
 * Returns 1 when a reschedule was performed, 0 otherwise.
 */
int __sched __cond_resched_softirq(void)
{
	BUG_ON(!in_softirq());

	if (!should_resched())
		return 0;

	local_bh_enable();
	__cond_resched();
	local_bh_disable();

	return 1;
}
EXPORT_SYMBOL(__cond_resched_softirq);

5656

5657

/**

5657

/**

5658

* yield - yield the current processor to other threads.

5658

* yield - yield the current processor to other threads.

5659

*

5659

*

5660

* This is a shortcut for kernel-space yielding - it marks the

5660

* This is a shortcut for kernel-space yielding - it marks the

5661

* thread runnable and calls sys_sched_yield().

5661

* thread runnable and calls sys_sched_yield().

5662

*/

5662

*/

5663

void __sched yield(void)

5663

void __sched yield(void)

5664

{

5664

{

5665

set_current_state(TASK_RUNNING);

5665

set_current_state(TASK_RUNNING);

5666

sys_sched_yield();

5666

sys_sched_yield();

5667

}

5667

}

5668

EXPORT_SYMBOL(yield);

5668

EXPORT_SYMBOL(yield);

5669

5670

/**

5670

/**

5671

* yield_to - yield the current processor to another thread in

5671

* yield_to - yield the current processor to another thread in

5672

* your thread group, or accelerate that thread toward the

5672

* your thread group, or accelerate that thread toward the

5673

* processor it's on.

5673

* processor it's on.

5674

* @p: target task

5674

* @p: target task

5675

* @preempt: whether task preemption is allowed or not

5675

* @preempt: whether task preemption is allowed or not

5676

*

5676

*

5677

* It's the caller's job to ensure that the target task struct

5677

* It's the caller's job to ensure that the target task struct

5678

* can't go away on us before we can do any checks.

5678

* can't go away on us before we can do any checks.

5679

*

5679

*

5680

* Returns true if we indeed boosted the target task.

5680

* Returns true if we indeed boosted the target task.

5681

*/

5681

*/

5682

bool __sched yield_to(struct task_struct *p, bool preempt)

5682

bool __sched yield_to(struct task_struct *p, bool preempt)

5683

{

5683

{

5684

struct task_struct *curr = current;

5684

struct task_struct *curr = current;

5685

struct rq *rq, *p_rq;

5685

struct rq *rq, *p_rq;

5686

unsigned long flags;

5686

unsigned long flags;

5687

bool yielded = 0;

5687

bool yielded = 0;

5688

5689

local_irq_save(flags);

5689

local_irq_save(flags);

5690

rq = this_rq();

5690

rq = this_rq();

5691

5692

again:

5692

again:

5693

p_rq = task_rq(p);

5693

p_rq = task_rq(p);

5694

double_rq_lock(rq, p_rq);

5694

double_rq_lock(rq, p_rq);

5695

while (task_rq(p) != p_rq) {

5695

while (task_rq(p) != p_rq) {

5696

double_rq_unlock(rq, p_rq);

5696

double_rq_unlock(rq, p_rq);

5697

goto again;

5697

goto again;

5698

}

5698

}

5699

5700

if (!curr->sched_class->yield_to_task)

5700

if (!curr->sched_class->yield_to_task)

5701

goto out;

5701

goto out;

5702

5703

if (curr->sched_class != p->sched_class)

5703

if (curr->sched_class != p->sched_class)

5704

goto out;

5704

goto out;

5705

5706

if (task_running(p_rq, p) || p->state)

5706

if (task_running(p_rq, p) || p->state)

5707

goto out;

5707

goto out;

5708

5709

yielded = curr->sched_class->yield_to_task(rq, p, preempt);

5709

yielded = curr->sched_class->yield_to_task(rq, p, preempt);

5710

if (yielded) {

5710

if (yielded) {

5711

schedstat_inc(rq, yld_count);

5711

schedstat_inc(rq, yld_count);

5712

/*

5712

/*

5713

* Make p's CPU reschedule; pick_next_entity takes care of

5713

* Make p's CPU reschedule; pick_next_entity takes care of

5714

* fairness.

5714

* fairness.

5715

*/

5715

*/

5716

if (preempt && rq != p_rq)

5716

if (preempt && rq != p_rq)

5717

resched_task(p_rq->curr);

5717

resched_task(p_rq->curr);

5718

}

5718

}

5719

5720

out:

5720

out:

5721

double_rq_unlock(rq, p_rq);

5721

double_rq_unlock(rq, p_rq);

5722

local_irq_restore(flags);

5722

local_irq_restore(flags);

5723

5724

if (yielded)

5724

if (yielded)

5725

schedule();

5725

schedule();

5726

5727

return yielded;

5727

return yielded;

5728

}

5728

}

5729

EXPORT_SYMBOL_GPL(yield_to);

5729

EXPORT_SYMBOL_GPL(yield_to);

5730

5731

/*

5731

/*

5732

* This task is about to go to sleep on IO. Increment rq->nr_iowait so

5732

* This task is about to go to sleep on IO. Increment rq->nr_iowait so

5733

* that process accounting knows that this is a task in IO wait state.

5733

* that process accounting knows that this is a task in IO wait state.

5734

*/

5734

*/

5735

void __sched io_schedule(void)

5735

void __sched io_schedule(void)

5736

{

5736

{

5737

struct rq *rq = raw_rq();

5737

struct rq *rq = raw_rq();

5738

5739

delayacct_blkio_start();

5739

delayacct_blkio_start();

5740

atomic_inc(&rq->nr_iowait);

5740

atomic_inc(&rq->nr_iowait);

5741

blk_flush_plug(current);

5741

blk_flush_plug(current);

5742

current->in_iowait = 1;

5742

current->in_iowait = 1;

5743

schedule();

5743

schedule();

5744

current->in_iowait = 0;

5744

current->in_iowait = 0;

5745

atomic_dec(&rq->nr_iowait);

5745

atomic_dec(&rq->nr_iowait);

5746

delayacct_blkio_end();

5746

delayacct_blkio_end();

5747

}

5747

}

5748

EXPORT_SYMBOL(io_schedule);

5748

EXPORT_SYMBOL(io_schedule);

5749

5750

/*
 * Same bookkeeping as io_schedule(), but sleeps via schedule_timeout().
 *
 * Returns whatever schedule_timeout(@timeout) returned.
 */
long __sched io_schedule_timeout(long timeout)
{
	atomic_t *iowaiters = &raw_rq()->nr_iowait;
	long remaining;

	delayacct_blkio_start();
	atomic_inc(iowaiters);
	blk_flush_plug(current);

	current->in_iowait = 1;
	remaining = schedule_timeout(timeout);
	current->in_iowait = 0;

	atomic_dec(iowaiters);
	delayacct_blkio_end();

	return remaining;
}

5765

5766

/**

5766

/**

5767

* sys_sched_get_priority_max - return maximum RT priority.

5767

* sys_sched_get_priority_max - return maximum RT priority.

5768

* @policy: scheduling class.

5768

* @policy: scheduling class.

5769

*

5769

*

5770

* this syscall returns the maximum rt_priority that can be used

5770

* this syscall returns the maximum rt_priority that can be used

5771

* by a given scheduling class.

5771

* by a given scheduling class.

5772

*/

5772

*/

5773

SYSCALL_DEFINE1(sched_get_priority_max, int, policy)

5773

SYSCALL_DEFINE1(sched_get_priority_max, int, policy)

5774

{

5774

{

5775

int ret = -EINVAL;

5775

int ret = -EINVAL;

5776

5777

switch (policy) {

5777

switch (policy) {

5778

case SCHED_FIFO:

5778

case SCHED_FIFO:

5779

case SCHED_RR:

5779

case SCHED_RR:

5780

ret = MAX_USER_RT_PRIO-1;

5780

ret = MAX_USER_RT_PRIO-1;

5781

break;

5781

break;

5782

case SCHED_NORMAL:

5782

case SCHED_NORMAL:

5783

case SCHED_BATCH:

5783

case SCHED_BATCH:

5784

case SCHED_IDLE:

5784

case SCHED_IDLE:

5785

ret = 0;

5785

ret = 0;

5786

break;

5786

break;

5787

}

5787

}

5788

return ret;

5788

return ret;

5789

}

5789

}

5790

5791

/**

5791

/**

5792

* sys_sched_get_priority_min - return minimum RT priority.

5792

* sys_sched_get_priority_min - return minimum RT priority.

5793

* @policy: scheduling class.

5793

* @policy: scheduling class.

5794

*

5794

*

5795

* this syscall returns the minimum rt_priority that can be used

5795

* this syscall returns the minimum rt_priority that can be used

5796

* by a given scheduling class.

5796

* by a given scheduling class.

5797

*/

5797

*/

5798

SYSCALL_DEFINE1(sched_get_priority_min, int, policy)

5798

SYSCALL_DEFINE1(sched_get_priority_min, int, policy)

5799

{

5799

{

5800

int ret = -EINVAL;

5800

int ret = -EINVAL;

5801

5802

switch (policy) {

5802

switch (policy) {

5803

case SCHED_FIFO:

5803

case SCHED_FIFO:

5804

case SCHED_RR:

5804

case SCHED_RR:

5805

ret = 1;

5805

ret = 1;

5806

break;

5806

break;

5807

case SCHED_NORMAL:

5807

case SCHED_NORMAL:

5808

case SCHED_BATCH:

5808

case SCHED_BATCH:

5809

case SCHED_IDLE:

5809

case SCHED_IDLE:

5810

ret = 0;

5810

ret = 0;

5811

}

5811

}

5812

return ret;

5812

return ret;

5813

}

5813

}

5814

5815

/**

5815

/**

5816

* sys_sched_rr_get_interval - return the default timeslice of a process.

5816

* sys_sched_rr_get_interval - return the default timeslice of a process.

5817

* @pid: pid of the process.

5817

* @pid: pid of the process.

5818

* @interval: userspace pointer to the timeslice value.

5818

* @interval: userspace pointer to the timeslice value.

5819

*

5819

*

5820

* this syscall writes the default timeslice value of a given process

5820

* this syscall writes the default timeslice value of a given process

5821

* into the user-space timespec buffer. A value of '0' means infinity.

5821

* into the user-space timespec buffer. A value of '0' means infinity.

5822

*/

5822

*/

5823

SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,

5823

SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,

5824

struct timespec __user *, interval)

5824

struct timespec __user *, interval)

5825

{

5825

{

5826

struct task_struct *p;

5826

struct task_struct *p;

5827

unsigned int time_slice;

5827

unsigned int time_slice;

5828

unsigned long flags;

5828

unsigned long flags;

5829

struct rq *rq;

5829

struct rq *rq;

5830

int retval;

5830

int retval;

5831

struct timespec t;

5831

struct timespec t;

5832

5833

if (pid < 0)

5833

if (pid < 0)

5834

return -EINVAL;

5834

return -EINVAL;

5835

5836

retval = -ESRCH;

5836

retval = -ESRCH;

5837

rcu_read_lock();

5837

rcu_read_lock();

5838

p = find_process_by_pid(pid);

5838

p = find_process_by_pid(pid);

5839

if (!p)

5839

if (!p)

5840

goto out_unlock;

5840

goto out_unlock;

5841

5842

retval = security_task_getscheduler(p);

5842

retval = security_task_getscheduler(p);

5843

if (retval)

5843

if (retval)

5844

goto out_unlock;

5844

goto out_unlock;

5845

5846

rq = task_rq_lock(p, &flags);

5846

rq = task_rq_lock(p, &flags);

5847

time_slice = p->sched_class->get_rr_interval(rq, p);

5847

time_slice = p->sched_class->get_rr_interval(rq, p);

5848

task_rq_unlock(rq, p, &flags);

5848

task_rq_unlock(rq, p, &flags);

5849

5850

rcu_read_unlock();

5850

rcu_read_unlock();

5851

jiffies_to_timespec(time_slice, &t);

5851

jiffies_to_timespec(time_slice, &t);

5852

retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;

5852

retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;

5853

return retval;

5853

return retval;

5854

5855

out_unlock:

5855

out_unlock:

5856

rcu_read_unlock();

5856

rcu_read_unlock();

5857

return retval;

5857

return retval;

5858

}

5858

}

5859

5860

/* Task state letters; indexed by sched_show_task() to print one per task. */
static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;

5861

5862

void sched_show_task(struct task_struct *p)

5862

void sched_show_task(struct task_struct *p)

5863

{

5863

{

5864

unsigned long free = 0;

5864

unsigned long free = 0;

5865

unsigned state;

5865

unsigned state;

5866

5867

state = p->state ? __ffs(p->state) + 1 : 0;

5867

state = p->state ? __ffs(p->state) + 1 : 0;

5868

printk(KERN_INFO "%-15.15s %c", p->comm,

5868

printk(KERN_INFO "%-15.15s %c", p->comm,

5869

state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');

5869

state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');

5870

#if BITS_PER_LONG == 32

5870

#if BITS_PER_LONG == 32

5871

if (state == TASK_RUNNING)

5871

if (state == TASK_RUNNING)

5872

printk(KERN_CONT " running ");

5872

printk(KERN_CONT " running ");

5873

else

5873

else

5874

printk(KERN_CONT " %08lx ", thread_saved_pc(p));

5874

printk(KERN_CONT " %08lx ", thread_saved_pc(p));

5875

#else

5875

#else

5876

if (state == TASK_RUNNING)

5876

if (state == TASK_RUNNING)

5877

printk(KERN_CONT " running task ");

5877

printk(KERN_CONT " running task ");

5878

else

5878

else

5879

printk(KERN_CONT " %016lx ", thread_saved_pc(p));

5879

printk(KERN_CONT " %016lx ", thread_saved_pc(p));

5880

#endif

5880

#endif

5881

#ifdef CONFIG_DEBUG_STACK_USAGE

5881

#ifdef CONFIG_DEBUG_STACK_USAGE

5882

free = stack_not_used(p);

5882

free = stack_not_used(p);

5883

#endif

5883

#endif

5884

printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,

5884

printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,

5885

task_pid_nr(p), task_pid_nr(p->real_parent),

5885

task_pid_nr(p), task_pid_nr(p->real_parent),

5886

(unsigned long)task_thread_info(p)->flags);

5886

(unsigned long)task_thread_info(p)->flags);

5887

5888

show_stack(p, NULL);

5888

show_stack(p, NULL);

5889

}

5889

}

5890

5891

void show_state_filter(unsigned long state_filter)

5891

void show_state_filter(unsigned long state_filter)

5892

{

5892

{

5893

struct task_struct *g, *p;

5893

struct task_struct *g, *p;

5894

5895

#if BITS_PER_LONG == 32

5895

#if BITS_PER_LONG == 32

5896

printk(KERN_INFO

5896

printk(KERN_INFO

5897

" task PC stack pid father\n");

5897

" task PC stack pid father\n");

5898

#else

5898

#else

5899

printk(KERN_INFO

5899

printk(KERN_INFO

5900

" task PC stack pid father\n");

5900

" task PC stack pid father\n");

5901

#endif

5901

#endif

5902

read_lock(&tasklist_lock);

5902

read_lock(&tasklist_lock);

5903

do_each_thread(g, p) {

5903

do_each_thread(g, p) {

5904

/*

5904

/*

5905

* reset the NMI-timeout, listing all files on a slow

5905

* reset the NMI-timeout, listing all files on a slow

5906

* console might take a lot of time:

5906

* console might take a lot of time:

5907

*/

5907

*/

5908

touch_nmi_watchdog();

5908

touch_nmi_watchdog();

5909

if (!state_filter || (p->state & state_filter))

5909

if (!state_filter || (p->state & state_filter))

5910

sched_show_task(p);

5910

sched_show_task(p);

5911

} while_each_thread(g, p);

5911

} while_each_thread(g, p);

5912

5913

touch_all_softlockup_watchdogs();

5913

touch_all_softlockup_watchdogs();

5914

5915

#ifdef CONFIG_SCHED_DEBUG

5915

#ifdef CONFIG_SCHED_DEBUG

5916

sysrq_sched_debug_show();

5916

sysrq_sched_debug_show();

5917

#endif

5917

#endif

5918

read_unlock(&tasklist_lock);

5918

read_unlock(&tasklist_lock);

5919

/*

5919

/*

5920

* Only show locks if all tasks are dumped:

5920

* Only show locks if all tasks are dumped:

5921

*/

5921

*/

5922

if (!state_filter)

5922

if (!state_filter)

5923

debug_show_all_locks();

5923

debug_show_all_locks();

5924

}

5924

}

5925

5926

/* Switch @idle over to the idle scheduling class. */
void __cpuinit init_idle_bootup_task(struct task_struct *idle)
{
	idle->sched_class = &idle_sched_class;
}

5930

5931

/**

5931

/**

5932

* init_idle - set up an idle thread for a given CPU

5932

* init_idle - set up an idle thread for a given CPU

5933

* @idle: task in question

5933

* @idle: task in question

5934

* @cpu: cpu the idle task belongs to

5934

* @cpu: cpu the idle task belongs to

5935

*

5935

*

5936

* NOTE: this function does not set the idle thread's NEED_RESCHED

5936

* NOTE: this function does not set the idle thread's NEED_RESCHED

5937

* flag, to make booting more robust.

5937

* flag, to make booting more robust.

5938

*/

5938

*/

5939

void __cpuinit init_idle(struct task_struct *idle, int cpu)

5939

void __cpuinit init_idle(struct task_struct *idle, int cpu)

5940

{

5940

{

5941

struct rq *rq = cpu_rq(cpu);

5941

struct rq *rq = cpu_rq(cpu);

5942

unsigned long flags;

5942

unsigned long flags;

5943

5944

raw_spin_lock_irqsave(&rq->lock, flags);

5944

raw_spin_lock_irqsave(&rq->lock, flags);

5945

5946

__sched_fork(idle);

5946

__sched_fork(idle);

5947

idle->state = TASK_RUNNING;

5947

idle->state = TASK_RUNNING;

5948

idle->se.exec_start = sched_clock();

5948

idle->se.exec_start = sched_clock();

5949

5950

do_set_cpus_allowed(idle, cpumask_of(cpu));

5950

do_set_cpus_allowed(idle, cpumask_of(cpu));

5951

/*

5951

/*

5952

* We're having a chicken and egg problem, even though we are

5952

* We're having a chicken and egg problem, even though we are

5953

* holding rq->lock, the cpu isn't yet set to this cpu so the

5953

* holding rq->lock, the cpu isn't yet set to this cpu so the

5954

* lockdep check in task_group() will fail.

5954

* lockdep check in task_group() will fail.

5955

*

5955

*

5956

* Similar case to sched_fork(). / Alternatively we could

5956

* Similar case to sched_fork(). / Alternatively we could

5957

* use task_rq_lock() here and obtain the other rq->lock.

5957

* use task_rq_lock() here and obtain the other rq->lock.

5958

*

5958

*

5959

* Silence PROVE_RCU

5959

* Silence PROVE_RCU

5960

*/

5960

*/

5961

rcu_read_lock();

5961

rcu_read_lock();

5962

__set_task_cpu(idle, cpu);

5962

__set_task_cpu(idle, cpu);

5963

rcu_read_unlock();

5963

rcu_read_unlock();

5964

5965

rq->curr = rq->idle = idle;

5965

rq->curr = rq->idle = idle;

5966

#if defined(CONFIG_SMP)

5966

#if defined(CONFIG_SMP)

5967

idle->on_cpu = 1;

5967

idle->on_cpu = 1;

5968

#endif

5968

#endif

5969

raw_spin_unlock_irqrestore(&rq->lock, flags);

5969

raw_spin_unlock_irqrestore(&rq->lock, flags);

5970

5971

/* Set the preempt count _outside_ the spinlocks! */

5971

/* Set the preempt count _outside_ the spinlocks! */

5972

task_thread_info(idle)->preempt_count = 0;

5972

task_thread_info(idle)->preempt_count = 0;

5973

5974

/*

5974

/*

5975

* The idle tasks have their own, simple scheduling class:

5975

* The idle tasks have their own, simple scheduling class:

5976

*/

5976

*/

5977

idle->sched_class = &idle_sched_class;

5977

idle->sched_class = &idle_sched_class;

5978

ftrace_graph_init_idle_task(idle, cpu);

5978

ftrace_graph_init_idle_task(idle, cpu);

5979

}

5979

}

5980

5981

/*

5981

/*

5982

* In a system that switches off the HZ timer nohz_cpu_mask

5982

* In a system that switches off the HZ timer nohz_cpu_mask

5983

* indicates which cpus entered this state. This is used

5983

* indicates which cpus entered this state. This is used

5984

* in the rcu update to wait only for active cpus. For system

5984

* in the rcu update to wait only for active cpus. For system

5985

* which do not switch off the HZ timer nohz_cpu_mask should

5985

* which do not switch off the HZ timer nohz_cpu_mask should

5986

* always be CPU_BITS_NONE.

5986

* always be CPU_BITS_NONE.

5987

*/

5987

*/

5988

cpumask_var_t nohz_cpu_mask;

5988

cpumask_var_t nohz_cpu_mask;

5989

5990

/*

5990

/*

5991

* Increase the granularity value when there are more CPUs,

5991

* Increase the granularity value when there are more CPUs,

5992

* because with more CPUs the 'effective latency' as visible

5992

* because with more CPUs the 'effective latency' as visible

5993

* to users decreases. But the relationship is not linear,

5993

* to users decreases. But the relationship is not linear,

5994

* so pick a second-best guess by going with the log2 of the

5994

* so pick a second-best guess by going with the log2 of the

5995

* number of CPUs.

5995

* number of CPUs.

5996

*

5996

*

5997

* This idea comes from the SD scheduler of Con Kolivas:

5997

* This idea comes from the SD scheduler of Con Kolivas:

5998

*/

5998

*/

5999

static int get_update_sysctl_factor(void)

5999

static int get_update_sysctl_factor(void)

6000

{

6000

{

6001

unsigned int cpus = min_t(int, num_online_cpus(), 8);

6001

unsigned int cpus = min_t(int, num_online_cpus(), 8);

6002

unsigned int factor;

6002

unsigned int factor;

6003

6004

switch (sysctl_sched_tunable_scaling) {

6004

switch (sysctl_sched_tunable_scaling) {

6005

case SCHED_TUNABLESCALING_NONE:

6005

case SCHED_TUNABLESCALING_NONE:

6006

factor = 1;

6006

factor = 1;

6007

break;

6007

break;

6008

case SCHED_TUNABLESCALING_LINEAR:

6008

case SCHED_TUNABLESCALING_LINEAR:

6009

factor = cpus;

6009

factor = cpus;

6010

break;

6010

break;

6011

case SCHED_TUNABLESCALING_LOG:

6011

case SCHED_TUNABLESCALING_LOG:

6012

default:

6012

default:

6013

factor = 1 + ilog2(cpus);

6013

factor = 1 + ilog2(cpus);

6014

break;

6014

break;

6015

}

6015

}

6016

6017

return factor;

6017

return factor;

6018

}

6018

}

6019

6020

static void update_sysctl(void)

6020

static void update_sysctl(void)

6021

{

6021

{

6022

unsigned int factor = get_update_sysctl_factor();

6022

unsigned int factor = get_update_sysctl_factor();

6023

6024

#define SET_SYSCTL(name) \

6024

#define SET_SYSCTL(name) \

6025

(sysctl_##name = (factor) * normalized_sysctl_##name)

6025

(sysctl_##name = (factor) * normalized_sysctl_##name)

6026

SET_SYSCTL(sched_min_granularity);

6026

SET_SYSCTL(sched_min_granularity);

6027

SET_SYSCTL(sched_latency);

6027

SET_SYSCTL(sched_latency);

6028

SET_SYSCTL(sched_wakeup_granularity);

6028

SET_SYSCTL(sched_wakeup_granularity);

6029

#undef SET_SYSCTL

6029

#undef SET_SYSCTL

6030

}

6030

}

6031

6032

/* Set the initial granularity tunables; a thin wrapper over update_sysctl(). */
static inline void sched_init_granularity(void)
{
	update_sysctl();
}

6036

6037

#ifdef CONFIG_SMP

6037

#ifdef CONFIG_SMP

6038

void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)

6038

void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)

6039

{

6039

{

6040

if (p->sched_class && p->sched_class->set_cpus_allowed)

6040

if (p->sched_class && p->sched_class->set_cpus_allowed)

6041

p->sched_class->set_cpus_allowed(p, new_mask);

6041

p->sched_class->set_cpus_allowed(p, new_mask);

6042

else {

6042

else {

6043

cpumask_copy(&p->cpus_allowed, new_mask);

6043

cpumask_copy(&p->cpus_allowed, new_mask);

6044

p->rt.nr_cpus_allowed = cpumask_weight(new_mask);

6044

p->rt.nr_cpus_allowed = cpumask_weight(new_mask);

6045

}

6045

}

6046

}

6046

}

6047

6048

/*

6048

/*

6049

* This is how migration works:

6049

* This is how migration works:

6050

*

6050

*

6051

* 1) we invoke migration_cpu_stop() on the target CPU using

6051

* 1) we invoke migration_cpu_stop() on the target CPU using

6052

* stop_one_cpu().

6052

* stop_one_cpu().

6053

* 2) stopper starts to run (implicitly forcing the migrated thread

6053

* 2) stopper starts to run (implicitly forcing the migrated thread

6054

* off the CPU)

6054

* off the CPU)

6055

* 3) it checks whether the migrated task is still in the wrong runqueue.

6055

* 3) it checks whether the migrated task is still in the wrong runqueue.

6056

* 4) if it's in the wrong runqueue then the migration thread removes

6056

* 4) if it's in the wrong runqueue then the migration thread removes

6057

* it and puts it into the right queue.

6057

* it and puts it into the right queue.

6058

* 5) stopper completes and stop_one_cpu() returns and the migration

6058

* 5) stopper completes and stop_one_cpu() returns and the migration

6059

* is done.

6059

* is done.

6060

*/

6060

*/

6061

6062

/*

6062

/*

6063

* Change a given task's CPU affinity. Migrate the thread to a

6063

* Change a given task's CPU affinity. Migrate the thread to a

6064

* proper CPU and schedule it away if the CPU it's executing on

6064

* proper CPU and schedule it away if the CPU it's executing on

6065

* is removed from the allowed bitmask.

6065

* is removed from the allowed bitmask.

6066

*

6066

*

6067

* NOTE: the caller must have a valid reference to the task, the

6067

* NOTE: the caller must have a valid reference to the task, the

6068

* task must not exit() & deallocate itself prematurely. The

6068

* task must not exit() & deallocate itself prematurely. The

6069

* call is not atomic; no spinlocks may be held.

6069

* call is not atomic; no spinlocks may be held.

6070

*/

6070

*/

6071

int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)

6071

int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)

6072

{

6072

{

6073

unsigned long flags;

6073

unsigned long flags;

6074

struct rq *rq;

6074

struct rq *rq;

6075

unsigned int dest_cpu;

6075

unsigned int dest_cpu;

6076

int ret = 0;

6076

int ret = 0;

6077

6078

rq = task_rq_lock(p, &flags);

6078

rq = task_rq_lock(p, &flags);

6079

6080

if (cpumask_equal(&p->cpus_allowed, new_mask))

6080

if (cpumask_equal(&p->cpus_allowed, new_mask))

6081

goto out;

6081

goto out;

6082

6083

if (!cpumask_intersects(new_mask, cpu_active_mask)) {

6083

if (!cpumask_intersects(new_mask, cpu_active_mask)) {

6084

ret = -EINVAL;

6084

ret = -EINVAL;

6085

goto out;

6085

goto out;

6086

}

6086

}

6087

6088

if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {

6088

if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {

6089

ret = -EINVAL;

6089

ret = -EINVAL;

6090

goto out;

6090

goto out;

6091

}

6091

}

6092

6093

do_set_cpus_allowed(p, new_mask);

6093

do_set_cpus_allowed(p, new_mask);

6094

6095

/* Can the task run on the task's current CPU? If so, we're done */

6095

/* Can the task run on the task's current CPU? If so, we're done */

6096

if (cpumask_test_cpu(task_cpu(p), new_mask))

6096

if (cpumask_test_cpu(task_cpu(p), new_mask))

6097

goto out;

6097

goto out;

6098

6099

dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);

6099

dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);

6100

if (p->on_rq) {

6100

if (p->on_rq) {

6101

struct migration_arg arg = { p, dest_cpu };

6101

struct migration_arg arg = { p, dest_cpu };

6102

/* Need help from migration thread: drop lock and wait. */

6102

/* Need help from migration thread: drop lock and wait. */

6103

task_rq_unlock(rq, p, &flags);

6103

task_rq_unlock(rq, p, &flags);

6104

stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);

6104

stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);

6105

tlb_migrate_finish(p->mm);

6105

tlb_migrate_finish(p->mm);

6106

return 0;

6106

return 0;

6107

}

6107

}

6108

out:

6108

out:

6109

task_rq_unlock(rq, p, &flags);

6109

task_rq_unlock(rq, p, &flags);

6110

6111

return ret;

6111

return ret;

6112

}

6112

}

6113

EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);

6113

EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);

6114

6115

/*

6115

/*

6116

* Move (not current) task off this cpu, onto dest cpu. We're doing

6116

* Move (not current) task off this cpu, onto dest cpu. We're doing

6117

* this because either it can't run here any more (set_cpus_allowed()

6117

* this because either it can't run here any more (set_cpus_allowed()

6118

* away from this CPU, or CPU going down), or because we're

6118

* away from this CPU, or CPU going down), or because we're

6119

* attempting to rebalance this task on exec (sched_exec).

6119

* attempting to rebalance this task on exec (sched_exec).

6120

*

6120

*

6121

* So we race with normal scheduler movements, but that's OK, as long

6121

* So we race with normal scheduler movements, but that's OK, as long

6122

* as the task is no longer on this CPU.

6122

* as the task is no longer on this CPU.

6123

*

6123

*

6124

* Returns non-zero if task was successfully migrated.

6124

* Returns non-zero if task was successfully migrated.

6125

*/

6125

*/

6126

static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)

6126

static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)

6127

{

6127

{

6128

struct rq *rq_dest, *rq_src;

6128

struct rq *rq_dest, *rq_src;

6129

int ret = 0;

6129

int ret = 0;

6130

6131

if (unlikely(!cpu_active(dest_cpu)))

6131

if (unlikely(!cpu_active(dest_cpu)))

6132

return ret;

6132

return ret;

6133

6134

rq_src = cpu_rq(src_cpu);

6134

rq_src = cpu_rq(src_cpu);

6135

rq_dest = cpu_rq(dest_cpu);

6135

rq_dest = cpu_rq(dest_cpu);

6136

6137

raw_spin_lock(&p->pi_lock);

6137

raw_spin_lock(&p->pi_lock);

6138

double_rq_lock(rq_src, rq_dest);

6138

double_rq_lock(rq_src, rq_dest);

6139

/* Already moved. */

6139

/* Already moved. */

6140

if (task_cpu(p) != src_cpu)

6140

if (task_cpu(p) != src_cpu)

6141

goto done;

6141

goto done;

6142

/* Affinity changed (again). */

6142

/* Affinity changed (again). */

6143

if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))

6143

if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))

6144

goto fail;

6144

goto fail;

6145

6146

/*

6146

/*

6147

* If we're not on a rq, the next wake-up will ensure we're

6147

* If we're not on a rq, the next wake-up will ensure we're

6148

* placed properly.

6148

* placed properly.

6149

*/

6149

*/

6150

if (p->on_rq) {

6150

if (p->on_rq) {

6151

deactivate_task(rq_src, p, 0);

6151

deactivate_task(rq_src, p, 0);

6152

set_task_cpu(p, dest_cpu);

6152

set_task_cpu(p, dest_cpu);

6153

activate_task(rq_dest, p, 0);

6153

activate_task(rq_dest, p, 0);

6154

check_preempt_curr(rq_dest, p, 0);

6154

check_preempt_curr(rq_dest, p, 0);

6155

}

6155

}

6156

done:

6156

done:

6157

ret = 1;

6157

ret = 1;

6158

fail:

6158

fail:

6159

double_rq_unlock(rq_src, rq_dest);

6159

double_rq_unlock(rq_src, rq_dest);

6160

raw_spin_unlock(&p->pi_lock);

6160

raw_spin_unlock(&p->pi_lock);

6161

return ret;

6161

return ret;

6162

}

6162

}

6163

6164

/*

6164

/*

6165

* migration_cpu_stop - this will be executed by a highprio stopper thread

6165

* migration_cpu_stop - this will be executed by a highprio stopper thread

6166

* and performs thread migration by bumping thread off CPU then

6166

* and performs thread migration by bumping thread off CPU then

6167

* 'pushing' onto another runqueue.

6167

* 'pushing' onto another runqueue.

6168

*/

6168

*/

6169

static int migration_cpu_stop(void *data)

6169

static int migration_cpu_stop(void *data)

6170

{

6170

{

6171

struct migration_arg *arg = data;

6171

struct migration_arg *arg = data;

6172

6173

/*

6173

/*

6174

* The original target cpu might have gone down and we might

6174

* The original target cpu might have gone down and we might

6175

* be on another cpu but it doesn't matter.

6175

* be on another cpu but it doesn't matter.

6176

*/

6176

*/

6177

local_irq_disable();

6177

local_irq_disable();

6178

__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);

6178

__migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);

6179

local_irq_enable();

6179

local_irq_enable();

6180

return 0;

6180

return 0;

6181

}

6181

}

6182

6183

#ifdef CONFIG_HOTPLUG_CPU

6183

#ifdef CONFIG_HOTPLUG_CPU

6184

6185

/*

6185

/*

6186

* Ensures that the idle task is using init_mm right before its cpu goes

6186

* Ensures that the idle task is using init_mm right before its cpu goes

6187

* offline.

6187

* offline.

6188

*/

6188

*/

6189

void idle_task_exit(void)

6189

void idle_task_exit(void)

6190

{

6190

{

6191

struct mm_struct *mm = current->active_mm;

6191

struct mm_struct *mm = current->active_mm;

6192

6193

BUG_ON(cpu_online(smp_processor_id()));

6193

BUG_ON(cpu_online(smp_processor_id()));

6194

6195

if (mm != &init_mm)

6195

if (mm != &init_mm)

6196

switch_mm(mm, &init_mm, current);

6196

switch_mm(mm, &init_mm, current);

6197

mmdrop(mm);

6197

mmdrop(mm);

6198

}

6198

}

6199

6200

/*

6200

/*

6201

* While a dead CPU has no uninterruptible tasks queued at this point,

6201

* While a dead CPU has no uninterruptible tasks queued at this point,

6202

* it might still have a nonzero ->nr_uninterruptible counter, because

6202

* it might still have a nonzero ->nr_uninterruptible counter, because

6203

* for performance reasons the counter is not stricly tracking tasks to

6203

* for performance reasons the counter is not stricly tracking tasks to

6204

* their home CPUs. So we just add the counter to another CPU's counter,

6204

* their home CPUs. So we just add the counter to another CPU's counter,

6205

* to keep the global sum constant after CPU-down:

6205

* to keep the global sum constant after CPU-down:

6206

*/

6206

*/

6207

static void migrate_nr_uninterruptible(struct rq *rq_src)

6207

static void migrate_nr_uninterruptible(struct rq *rq_src)

6208

{

6208

{

6209

struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));

6209

struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));

6210

6211

rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;

6211

rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;

6212

rq_src->nr_uninterruptible = 0;

6212

rq_src->nr_uninterruptible = 0;

6213

}

6213

}

6214

6215

/*

6215

/*

6216

* remove the tasks which were accounted by rq from calc_load_tasks.

6216

* remove the tasks which were accounted by rq from calc_load_tasks.

6217

*/

6217

*/

6218

static void calc_global_load_remove(struct rq *rq)

6218

static void calc_global_load_remove(struct rq *rq)

6219

{

6219

{

6220

atomic_long_sub(rq->calc_load_active, &calc_load_tasks);

6220

atomic_long_sub(rq->calc_load_active, &calc_load_tasks);

6221

rq->calc_load_active = 0;

6221

rq->calc_load_active = 0;

6222

}

6222

}

6223

6224

/*

6224

/*

6225

* Migrate all tasks from the rq, sleeping tasks will be migrated by

6225

* Migrate all tasks from the rq, sleeping tasks will be migrated by

6226

* try_to_wake_up()->select_task_rq().

6226

* try_to_wake_up()->select_task_rq().

6227

*

6227

*

6228

* Called with rq->lock held even though we'er in stop_machine() and

6228

* Called with rq->lock held even though we'er in stop_machine() and

6229

* there's no concurrency possible, we hold the required locks anyway

6229

* there's no concurrency possible, we hold the required locks anyway

6230

* because of lock validation efforts.

6230

* because of lock validation efforts.

6231

*/

6231

*/

6232

static void migrate_tasks(unsigned int dead_cpu)

6232

static void migrate_tasks(unsigned int dead_cpu)

6233

{

6233

{

6234

struct rq *rq = cpu_rq(dead_cpu);

6234

struct rq *rq = cpu_rq(dead_cpu);

6235

struct task_struct *next, *stop = rq->stop;

6235

struct task_struct *next, *stop = rq->stop;

6236

int dest_cpu;

6236

int dest_cpu;

6237

6238

/*

6238

/*

6239

* Fudge the rq selection such that the below task selection loop

6239

* Fudge the rq selection such that the below task selection loop

6240

* doesn't get stuck on the currently eligible stop task.

6240

* doesn't get stuck on the currently eligible stop task.

6241

*

6241

*

6242

* We're currently inside stop_machine() and the rq is either stuck

6242

* We're currently inside stop_machine() and the rq is either stuck

6243

* in the stop_machine_cpu_stop() loop, or we're executing this code,

6243

* in the stop_machine_cpu_stop() loop, or we're executing this code,

6244

* either way we should never end up calling schedule() until we're

6244

* either way we should never end up calling schedule() until we're

6245

* done here.

6245

* done here.

6246

*/

6246

*/

6247

rq->stop = NULL;

6247

rq->stop = NULL;

6248

6249

for ( ; ; ) {

6249

for ( ; ; ) {

6250

/*

6250

/*

6251

* There's this thread running, bail when that's the only

6251

* There's this thread running, bail when that's the only

6252

* remaining thread.

6252

* remaining thread.

6253

*/

6253

*/

6254

if (rq->nr_running == 1)

6254

if (rq->nr_running == 1)

6255

break;

6255

break;

6256

6257

next = pick_next_task(rq);

6257

next = pick_next_task(rq);

6258

BUG_ON(!next);

6258

BUG_ON(!next);

6259

next->sched_class->put_prev_task(rq, next);

6259

next->sched_class->put_prev_task(rq, next);

6260

6261

/* Find suitable destination for @next, with force if needed. */

6261

/* Find suitable destination for @next, with force if needed. */

6262

dest_cpu = select_fallback_rq(dead_cpu, next);

6262

dest_cpu = select_fallback_rq(dead_cpu, next);

6263

raw_spin_unlock(&rq->lock);

6263

raw_spin_unlock(&rq->lock);

6264

6265

__migrate_task(next, dead_cpu, dest_cpu);

6265

__migrate_task(next, dead_cpu, dest_cpu);

6266

6267

raw_spin_lock(&rq->lock);

6267

raw_spin_lock(&rq->lock);

6268

}

6268

}

6269

6270

rq->stop = stop;

6270

rq->stop = stop;

6271

}

6271

}

6272

6273

#endif /* CONFIG_HOTPLUG_CPU */

6273

#endif /* CONFIG_HOTPLUG_CPU */

6274

6275

#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)

6275

#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)

6276

6277

/*
 * "sched_domain" directory entry.  Its .child pointer is filled in at
 * runtime by register_sched_domain_sysctl().
 */
static struct ctl_table sd_ctl_dir[] = {
	{
		.procname	= "sched_domain",
		.mode		= 0555,
	},
	{}	/* terminator */
};

6284

6285

static struct ctl_table sd_ctl_root[] = {

6285

static struct ctl_table sd_ctl_root[] = {

6286

{

6286

{

6287

.procname = "kernel",

6287

.procname = "kernel",

6288

.mode = 0555,

6288

.mode = 0555,

6289

.child = sd_ctl_dir,

6289

.child = sd_ctl_dir,

6290

},

6290

},

6291

{}

6291

{}

6292

};

6292

};

6293

6294

static struct ctl_table *sd_alloc_ctl_entry(int n)

6294

static struct ctl_table *sd_alloc_ctl_entry(int n)

6295

{

6295

{

6296

struct ctl_table *entry =

6296

struct ctl_table *entry =

6297

kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);

6297

kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);

6298

6299

return entry;

6299

return entry;

6300

}

6300

}

6301

6302

static void sd_free_ctl_entry(struct ctl_table **tablep)

6302

static void sd_free_ctl_entry(struct ctl_table **tablep)

6303

{

6303

{

6304

struct ctl_table *entry;

6304

struct ctl_table *entry;

6305

6306

/*

6306

/*

6307

* In the intermediate directories, both the child directory and

6307

* In the intermediate directories, both the child directory and

6308

* procname are dynamically allocated and could fail but the mode

6308

* procname are dynamically allocated and could fail but the mode

6309

* will always be set. In the lowest directory the names are

6309

* will always be set. In the lowest directory the names are

6310

* static strings and all have proc handlers.

6310

* static strings and all have proc handlers.

6311

*/

6311

*/

6312

for (entry = *tablep; entry->mode; entry++) {

6312

for (entry = *tablep; entry->mode; entry++) {

6313

if (entry->child)

6313

if (entry->child)

6314

sd_free_ctl_entry(&entry->child);

6314

sd_free_ctl_entry(&entry->child);

6315

if (entry->proc_handler == NULL)

6315

if (entry->proc_handler == NULL)

6316

kfree(entry->procname);

6316

kfree(entry->procname);

6317

}

6317

}

6318

6319

kfree(*tablep);

6319

kfree(*tablep);

6320

*tablep = NULL;

6320

*tablep = NULL;

6321

}

6321

}

6322

6323

static void

6323

static void

6324

set_table_entry(struct ctl_table *entry,

6324

set_table_entry(struct ctl_table *entry,

6325

const char *procname, void *data, int maxlen,

6325

const char *procname, void *data, int maxlen,

6326

mode_t mode, proc_handler *proc_handler)

6326

mode_t mode, proc_handler *proc_handler)

6327

{

6327

{

6328

entry->procname = procname;

6328

entry->procname = procname;

6329

entry->data = data;

6329

entry->data = data;

6330

entry->maxlen = maxlen;

6330

entry->maxlen = maxlen;

6331

entry->mode = mode;

6331

entry->mode = mode;

6332

entry->proc_handler = proc_handler;

6332

entry->proc_handler = proc_handler;

6333

}

6333

}

6334

6335

static struct ctl_table *

6335

static struct ctl_table *

6336

sd_alloc_ctl_domain_table(struct sched_domain *sd)

6336

sd_alloc_ctl_domain_table(struct sched_domain *sd)

6337

{

6337

{

6338

struct ctl_table *table = sd_alloc_ctl_entry(13);

6338

struct ctl_table *table = sd_alloc_ctl_entry(13);

6339

6340

if (table == NULL)

6340

if (table == NULL)

6341

return NULL;

6341

return NULL;

6342

6343

set_table_entry(&table[0], "min_interval", &sd->min_interval,

6343

set_table_entry(&table[0], "min_interval", &sd->min_interval,

6344

sizeof(long), 0644, proc_doulongvec_minmax);

6344

sizeof(long), 0644, proc_doulongvec_minmax);

6345

set_table_entry(&table[1], "max_interval", &sd->max_interval,

6345

set_table_entry(&table[1], "max_interval", &sd->max_interval,

6346

sizeof(long), 0644, proc_doulongvec_minmax);

6346

sizeof(long), 0644, proc_doulongvec_minmax);

6347

set_table_entry(&table[2], "busy_idx", &sd->busy_idx,

6347

set_table_entry(&table[2], "busy_idx", &sd->busy_idx,

6348

sizeof(int), 0644, proc_dointvec_minmax);

6348

sizeof(int), 0644, proc_dointvec_minmax);

6349

set_table_entry(&table[3], "idle_idx", &sd->idle_idx,

6349

set_table_entry(&table[3], "idle_idx", &sd->idle_idx,

6350

sizeof(int), 0644, proc_dointvec_minmax);

6350

sizeof(int), 0644, proc_dointvec_minmax);

6351

set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,

6351

set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,

6352

sizeof(int), 0644, proc_dointvec_minmax);

6352

sizeof(int), 0644, proc_dointvec_minmax);

6353

set_table_entry(&table[5], "wake_idx", &sd->wake_idx,

6353

set_table_entry(&table[5], "wake_idx", &sd->wake_idx,

6354

sizeof(int), 0644, proc_dointvec_minmax);

6354

sizeof(int), 0644, proc_dointvec_minmax);

6355

set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,

6355

set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,

6356

sizeof(int), 0644, proc_dointvec_minmax);

6356

sizeof(int), 0644, proc_dointvec_minmax);

6357

set_table_entry(&table[7], "busy_factor", &sd->busy_factor,

6357

set_table_entry(&table[7], "busy_factor", &sd->busy_factor,

6358

sizeof(int), 0644, proc_dointvec_minmax);

6358

sizeof(int), 0644, proc_dointvec_minmax);

6359

set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,

6359

set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,

6360

sizeof(int), 0644, proc_dointvec_minmax);

6360

sizeof(int), 0644, proc_dointvec_minmax);

6361

set_table_entry(&table[9], "cache_nice_tries",

6361

set_table_entry(&table[9], "cache_nice_tries",

6362

&sd->cache_nice_tries,

6362

&sd->cache_nice_tries,

6363

sizeof(int), 0644, proc_dointvec_minmax);

6363

sizeof(int), 0644, proc_dointvec_minmax);

6364

set_table_entry(&table[10], "flags", &sd->flags,

6364

set_table_entry(&table[10], "flags", &sd->flags,

6365

sizeof(int), 0644, proc_dointvec_minmax);

6365

sizeof(int), 0644, proc_dointvec_minmax);

6366

set_table_entry(&table[11], "name", sd->name,

6366

set_table_entry(&table[11], "name", sd->name,

6367

CORENAME_MAX_SIZE, 0444, proc_dostring);

6367

CORENAME_MAX_SIZE, 0444, proc_dostring);

6368

/* &table[12] is terminator */

6368

/* &table[12] is terminator */

6369

6370

return table;

6370

return table;

6371

}

6371

}

6372

6373

static ctl_table *sd_alloc_ctl_cpu_table(int cpu)

6373

static ctl_table *sd_alloc_ctl_cpu_table(int cpu)

6374

{

6374

{

6375

struct ctl_table *entry, *table;

6375

struct ctl_table *entry, *table;

6376

struct sched_domain *sd;

6376

struct sched_domain *sd;

6377

int domain_num = 0, i;

6377

int domain_num = 0, i;

6378

char buf[32];

6378

char buf[32];

6379

6380

for_each_domain(cpu, sd)

6380

for_each_domain(cpu, sd)

6381

domain_num++;

6381

domain_num++;

6382

entry = table = sd_alloc_ctl_entry(domain_num + 1);

6382

entry = table = sd_alloc_ctl_entry(domain_num + 1);

6383

if (table == NULL)

6383

if (table == NULL)

6384

return NULL;

6384

return NULL;

6385

6386

i = 0;

6386

i = 0;

6387

for_each_domain(cpu, sd) {

6387

for_each_domain(cpu, sd) {

6388

snprintf(buf, 32, "domain%d", i);

6388

snprintf(buf, 32, "domain%d", i);

6389

entry->procname = kstrdup(buf, GFP_KERNEL);

6389

entry->procname = kstrdup(buf, GFP_KERNEL);

6390

entry->mode = 0555;

6390

entry->mode = 0555;

6391

entry->child = sd_alloc_ctl_domain_table(sd);

6391

entry->child = sd_alloc_ctl_domain_table(sd);

6392

entry++;

6392

entry++;

6393

i++;

6393

i++;

6394

}

6394

}

6395

return table;

6395

return table;

6396

}

6396

}

6397

6398

static struct ctl_table_header *sd_sysctl_header;

6398

static struct ctl_table_header *sd_sysctl_header;

6399

static void register_sched_domain_sysctl(void)

6399

static void register_sched_domain_sysctl(void)

6400

{

6400

{

6401

int i, cpu_num = num_possible_cpus();

6401

int i, cpu_num = num_possible_cpus();

6402

struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);

6402

struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);

6403

char buf[32];

6403

char buf[32];

6404

6405

WARN_ON(sd_ctl_dir[0].child);

6405

WARN_ON(sd_ctl_dir[0].child);

6406

sd_ctl_dir[0].child = entry;

6406

sd_ctl_dir[0].child = entry;

6407

6408

if (entry == NULL)

6408

if (entry == NULL)

6409

return;

6409

return;

6410

6411

for_each_possible_cpu(i) {

6411

for_each_possible_cpu(i) {

6412

snprintf(buf, 32, "cpu%d", i);

6412

snprintf(buf, 32, "cpu%d", i);

6413

entry->procname = kstrdup(buf, GFP_KERNEL);

6413

entry->procname = kstrdup(buf, GFP_KERNEL);

6414

entry->mode = 0555;

6414

entry->mode = 0555;

6415

entry->child = sd_alloc_ctl_cpu_table(i);

6415

entry->child = sd_alloc_ctl_cpu_table(i);

6416

entry++;

6416

entry++;

6417

}

6417

}

6418

6419

WARN_ON(sd_sysctl_header);

6419

WARN_ON(sd_sysctl_header);

6420

sd_sysctl_header = register_sysctl_table(sd_ctl_root);

6420

sd_sysctl_header = register_sysctl_table(sd_ctl_root);

6421

}

6421

}

6422

6423

/* may be called multiple times per register */

6423

/* may be called multiple times per register */

6424

static void unregister_sched_domain_sysctl(void)

6424

static void unregister_sched_domain_sysctl(void)

6425

{

6425

{

6426

if (sd_sysctl_header)

6426

if (sd_sysctl_header)

6427

unregister_sysctl_table(sd_sysctl_header);

6427

unregister_sysctl_table(sd_sysctl_header);

6428

sd_sysctl_header = NULL;

6428

sd_sysctl_header = NULL;

6429

if (sd_ctl_dir[0].child)

6429

if (sd_ctl_dir[0].child)

6430

sd_free_ctl_entry(&sd_ctl_dir[0].child);

6430

sd_free_ctl_entry(&sd_ctl_dir[0].child);

6431

}

6431

}

6432

#else

6432

#else

6433

/* No-op stubs for the #else branch of the sysctl config check. */
static void register_sched_domain_sysctl(void)
{
}

static void unregister_sched_domain_sysctl(void)
{
}

6439

#endif

6439

#endif

6440

6441

static void set_rq_online(struct rq *rq)

6441

static void set_rq_online(struct rq *rq)

6442

{

6442

{

6443

if (!rq->online) {

6443

if (!rq->online) {

6444

const struct sched_class *class;

6444

const struct sched_class *class;

6445

6446

cpumask_set_cpu(rq->cpu, rq->rd->online);

6446

cpumask_set_cpu(rq->cpu, rq->rd->online);

6447

rq->online = 1;

6447

rq->online = 1;

6448

6449

for_each_class(class) {

6449

for_each_class(class) {

6450

if (class->rq_online)

6450

if (class->rq_online)

6451

class->rq_online(rq);

6451

class->rq_online(rq);

6452

}

6452

}

6453

}

6453

}

6454

}

6454

}

6455

6456

static void set_rq_offline(struct rq *rq)

6456

static void set_rq_offline(struct rq *rq)

6457

{

6457

{

6458

if (rq->online) {

6458

if (rq->online) {

6459

const struct sched_class *class;

6459

const struct sched_class *class;

6460

6461

for_each_class(class) {

6461

for_each_class(class) {

6462

if (class->rq_offline)

6462

if (class->rq_offline)

6463

class->rq_offline(rq);

6463

class->rq_offline(rq);

6464

}

6464

}

6465

6466

cpumask_clear_cpu(rq->cpu, rq->rd->online);

6466

cpumask_clear_cpu(rq->cpu, rq->rd->online);

6467

rq->online = 0;

6467

rq->online = 0;

6468

}

6468

}

6469

}

6469

}

6470

6471

/*

6471

/*

6472

* migration_call - callback that gets triggered when a CPU is added.

6472

* migration_call - callback that gets triggered when a CPU is added.

6473

* Here we can start up the necessary migration thread for the new CPU.

6473

* Here we can start up the necessary migration thread for the new CPU.

6474

*/

6474

*/

6475

static int __cpuinit

6475

static int __cpuinit

6476

migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)

6476

migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)

6477

{

6477

{

6478

int cpu = (long)hcpu;

6478

int cpu = (long)hcpu;

6479

unsigned long flags;

6479

unsigned long flags;

6480

struct rq *rq = cpu_rq(cpu);

6480

struct rq *rq = cpu_rq(cpu);

6481

6482

switch (action & ~CPU_TASKS_FROZEN) {

6482

switch (action & ~CPU_TASKS_FROZEN) {

6483

6484

case CPU_UP_PREPARE:

6484

case CPU_UP_PREPARE:

6485

rq->calc_load_update = calc_load_update;

6485

rq->calc_load_update = calc_load_update;

6486

break;

6486

break;

6487

6488

case CPU_ONLINE:

6488

case CPU_ONLINE:

6489

/* Update our root-domain */

6489

/* Update our root-domain */

6490

raw_spin_lock_irqsave(&rq->lock, flags);

6490

raw_spin_lock_irqsave(&rq->lock, flags);

6491

if (rq->rd) {

6491

if (rq->rd) {

6492

BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));

6492

BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));

6493

6494

set_rq_online(rq);

6494

set_rq_online(rq);

6495

}

6495

}

6496

raw_spin_unlock_irqrestore(&rq->lock, flags);

6496

raw_spin_unlock_irqrestore(&rq->lock, flags);

6497

break;

6497

break;

6498

6499

#ifdef CONFIG_HOTPLUG_CPU

6499

#ifdef CONFIG_HOTPLUG_CPU

6500

case CPU_DYING:

6500

case CPU_DYING:

6501

sched_ttwu_pending();

6501

sched_ttwu_pending();

6502

/* Update our root-domain */

6502

/* Update our root-domain */

6503

raw_spin_lock_irqsave(&rq->lock, flags);

6503

raw_spin_lock_irqsave(&rq->lock, flags);

6504

if (rq->rd) {

6504

if (rq->rd) {

6505

BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));

6505

BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));

6506

set_rq_offline(rq);

6506

set_rq_offline(rq);

6507

}

6507

}

6508

migrate_tasks(cpu);

6508

migrate_tasks(cpu);

6509

BUG_ON(rq->nr_running != 1); /* the migration thread */

6509

BUG_ON(rq->nr_running != 1); /* the migration thread */

6510

raw_spin_unlock_irqrestore(&rq->lock, flags);

6510

raw_spin_unlock_irqrestore(&rq->lock, flags);

6511

6512

migrate_nr_uninterruptible(rq);

6512

migrate_nr_uninterruptible(rq);

6513

calc_global_load_remove(rq);

6513

calc_global_load_remove(rq);

6514

break;

6514

break;

6515

#endif

6515

#endif

6516

}

6516

}

6517

6518

update_max_interval();

6518

update_max_interval();

6519

6520

return NOTIFY_OK;

6520

return NOTIFY_OK;

6521

}

6521

}

6522

6523

/*

6523

/*

6524

* Register at high priority so that task migration (migrate_all_tasks)

6524

* Register at high priority so that task migration (migrate_all_tasks)

6525

* happens before everything else. This has to be lower priority than

6525

* happens before everything else. This has to be lower priority than

6526

* the notifier in the perf_event subsystem, though.

6526

* the notifier in the perf_event subsystem, though.

6527

*/

6527

*/

6528

static struct notifier_block __cpuinitdata migration_notifier = {

6528

static struct notifier_block __cpuinitdata migration_notifier = {

6529

.notifier_call = migration_call,

6529

.notifier_call = migration_call,

6530

.priority = CPU_PRI_MIGRATION,

6530

.priority = CPU_PRI_MIGRATION,

6531

};

6531

};

6532

6533

/*
 * Hotplug notifier: mark a CPU active once it is online, or again after a
 * failed attempt to take it down.  The CPU_TASKS_FROZEN bit of @action is
 * ignored.  Returns NOTIFY_OK for the two handled events and NOTIFY_DONE
 * for everything else.
 */
static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
				      unsigned long action, void *hcpu)
{
	unsigned long event = action & ~CPU_TASKS_FROZEN;

	if (event != CPU_ONLINE && event != CPU_DOWN_FAILED)
		return NOTIFY_DONE;

	set_cpu_active((long)hcpu, true);
	return NOTIFY_OK;
}

6545

6546

/*
 * Hotplug notifier: mark a CPU inactive when it is about to go down
 * (CPU_DOWN_PREPARE).  The CPU_TASKS_FROZEN bit of @action is ignored.
 * Returns NOTIFY_OK for the handled event and NOTIFY_DONE otherwise.
 */
static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
					unsigned long action, void *hcpu)
{
	unsigned long event = action & ~CPU_TASKS_FROZEN;

	if (event != CPU_DOWN_PREPARE)
		return NOTIFY_DONE;

	set_cpu_active((long)hcpu, false);
	return NOTIFY_OK;
}

6557

6558

/*
 * Early initcall: set up scheduler hotplug handling.
 *
 * The CPU running this initcall never goes through the notifier chain for
 * its own bring-up, so CPU_UP_PREPARE and CPU_ONLINE are replayed for it by
 * hand before the migration notifier is registered.  The cpu-active and
 * cpu-inactive notifiers are registered last.  Always returns 0.
 */
static int __init migration_init(void)
{
	void *boot_cpu = (void *)(long)smp_processor_id();
	int ret;

	/* Initialize migration for the boot CPU */
	ret = migration_call(&migration_notifier, CPU_UP_PREPARE, boot_cpu);
	BUG_ON(ret == NOTIFY_BAD);
	migration_call(&migration_notifier, CPU_ONLINE, boot_cpu);
	register_cpu_notifier(&migration_notifier);

	/* Register cpu active notifiers */
	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);

	return 0;
}
early_initcall(migration_init);

6576

#endif

6576

#endif

6577

6578

#ifdef CONFIG_SMP

6578

#ifdef CONFIG_SMP

6579

6580

/* Scratch cpumask; protected by sched_domains_mutex. */
static cpumask_var_t sched_domains_tmpmask;

6581

6582

#ifdef CONFIG_SCHED_DEBUG

6582

#ifdef CONFIG_SCHED_DEBUG

6583

6584

static __read_mostly int sched_domain_debug_enabled;

6584

static __read_mostly int sched_domain_debug_enabled;

6585

6586

static int __init sched_domain_debug_setup(char *str)

6586

static int __init sched_domain_debug_setup(char *str)

6587

{

6587

{

6588

sched_domain_debug_enabled = 1;

6588

sched_domain_debug_enabled = 1;

6589

6590

return 0;

6590

return 0;

6591

}

6591

}

6592

early_param("sched_debug", sched_domain_debug_setup);

6592

early_param("sched_debug", sched_domain_debug_setup);

6593

6594

static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,

6594

static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,

6595

struct cpumask *groupmask)

6595

struct cpumask *groupmask)

6596

{

6596

{

6597

struct sched_group *group = sd->groups;

6597

struct sched_group *group = sd->groups;

6598

char str[256];

6598

char str[256];

6599

6600

cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));

6600

cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));

6601

cpumask_clear(groupmask);

6601

cpumask_clear(groupmask);

6602

6603

printk(KERN_DEBUG "%*s domain %d: ", level, "", level);

6603

printk(KERN_DEBUG "%*s domain %d: ", level, "", level);

6604

6605

if (!(sd->flags & SD_LOAD_BALANCE)) {

6605

if (!(sd->flags & SD_LOAD_BALANCE)) {

6606

printk("does not load-balance\n");

6606

printk("does not load-balance\n");

6607

if (sd->parent)

6607

if (sd->parent)

6608

printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"

6608

printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"

6609

" has parent");

6609

" has parent");

6610

return -1;

6610

return -1;

6611

}

6611

}

6612

6613

printk(KERN_CONT "span %s level %s\n", str, sd->name);

6613

printk(KERN_CONT "span %s level %s\n", str, sd->name);

6614

6615

if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {

6615

if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {

6616

printk(KERN_ERR "ERROR: domain->span does not contain "

6616

printk(KERN_ERR "ERROR: domain->span does not contain "

6617

"CPU%d\n", cpu);

6617

"CPU%d\n", cpu);

6618

}

6618

}

6619

if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {

6619

if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {

6620

printk(KERN_ERR "ERROR: domain->groups does not contain"

6620

printk(KERN_ERR "ERROR: domain->groups does not contain"

6621

" CPU%d\n", cpu);

6621

" CPU%d\n", cpu);

6622

}

6622

}

6623

6624

printk(KERN_DEBUG "%*s groups:", level + 1, "");

6624

printk(KERN_DEBUG "%*s groups:", level + 1, "");

6625

do {

6625

do {

6626

if (!group) {

6626

if (!group) {

6627

printk("\n");

6627

printk("\n");

6628

printk(KERN_ERR "ERROR: group is NULL\n");

6628

printk(KERN_ERR "ERROR: group is NULL\n");

6629

break;

6629

break;

6630

}

6630

}

6631

6632

if (!group->sgp->power) {

6632

if (!group->sgp->power) {

6633

printk(KERN_CONT "\n");

6633

printk(KERN_CONT "\n");

6634

printk(KERN_ERR "ERROR: domain->cpu_power not "

6634

printk(KERN_ERR "ERROR: domain->cpu_power not "

6635

"set\n");

6635

"set\n");

6636

break;

6636

break;

6637

}

6637

}

6638

6639

if (!cpumask_weight(sched_group_cpus(group))) {

6639

if (!cpumask_weight(sched_group_cpus(group))) {

6640

printk(KERN_CONT "\n");

6640

printk(KERN_CONT "\n");

6641

printk(KERN_ERR "ERROR: empty group\n");

6641

printk(KERN_ERR "ERROR: empty group\n");

6642

break;

6642

break;

6643

}

6643

}

6644

6645

if (cpumask_intersects(groupmask, sched_group_cpus(group))) {

6645

if (cpumask_intersects(groupmask, sched_group_cpus(group))) {

6646

printk(KERN_CONT "\n");

6646

printk(KERN_CONT "\n");

6647

printk(KERN_ERR "ERROR: repeated CPUs\n");

6647

printk(KERN_ERR "ERROR: repeated CPUs\n");

6648

break;

6648

break;

6649

}

6649

}

6650

6651

cpumask_or(groupmask, groupmask, sched_group_cpus(group));

6651

cpumask_or(groupmask, groupmask, sched_group_cpus(group));

6652

6653

cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));

6653

cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));

6654

6655

printk(KERN_CONT " %s", str);

6655

printk(KERN_CONT " %s", str);

6656

if (group->sgp->power != SCHED_POWER_SCALE) {

6656

if (group->sgp->power != SCHED_POWER_SCALE) {

6657

printk(KERN_CONT " (cpu_power = %d)",

6657

printk(KERN_CONT " (cpu_power = %d)",

6658

group->sgp->power);

6658

group->sgp->power);

6659

}

6659

}

6660

6661

group = group->next;

6661

group = group->next;

6662

} while (group != sd->groups);

6662

} while (group != sd->groups);

6663

printk(KERN_CONT "\n");

6663

printk(KERN_CONT "\n");

6664

6665

if (!cpumask_equal(sched_domain_span(sd), groupmask))

6665

if (!cpumask_equal(sched_domain_span(sd), groupmask))

6666

printk(KERN_ERR "ERROR: groups don't span domain->span\n");

6666

printk(KERN_ERR "ERROR: groups don't span domain->span\n");

6667

6668

if (sd->parent &&

6668

if (sd->parent &&

6669

!cpumask_subset(groupmask, sched_domain_span(sd->parent)))

6669

!cpumask_subset(groupmask, sched_domain_span(sd->parent)))

6670

printk(KERN_ERR "ERROR: parent span is not a superset "

6670

printk(KERN_ERR "ERROR: parent span is not a superset "

6671

"of domain->span\n");

6671

"of domain->span\n");

6672

return 0;

6672

return 0;

6673

}

6673

}

6674

6675

static void sched_domain_debug(struct sched_domain *sd, int cpu)

6675

static void sched_domain_debug(struct sched_domain *sd, int cpu)

6676

{

6676

{

6677

int level = 0;

6677

int level = 0;

6678

6679

if (!sched_domain_debug_enabled)

6679

if (!sched_domain_debug_enabled)

6680

return;

6680

return;

6681

6682

if (!sd) {

6682

if (!sd) {

6683

printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);

6683

printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);

6684

return;

6684

return;

6685

}

6685

}

6686

6687

printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);

6687

printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);

6688

6689

for (;;) {

6689

for (;;) {

6690

if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))

6690

if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))

6691

break;

6691

break;

6692

level++;

6692

level++;

6693

sd = sd->parent;

6693

sd = sd->parent;

6694

if (!sd)

6694

if (!sd)

6695

break;

6695

break;

6696

}

6696

}

6697

}

6697

}

6698

#else /* !CONFIG_SCHED_DEBUG */

6698

#else /* !CONFIG_SCHED_DEBUG */

6699

# define sched_domain_debug(sd, cpu) do { } while (0)

6699

# define sched_domain_debug(sd, cpu) do { } while (0)

6700

#endif /* CONFIG_SCHED_DEBUG */

6700

#endif /* CONFIG_SCHED_DEBUG */

6701

6702

static int sd_degenerate(struct sched_domain *sd)

6702

static int sd_degenerate(struct sched_domain *sd)

6703

{

6703

{

6704

if (cpumask_weight(sched_domain_span(sd)) == 1)

6704

if (cpumask_weight(sched_domain_span(sd)) == 1)

6705

return 1;

6705

return 1;

6706

6707

/* Following flags need at least 2 groups */

6707

/* Following flags need at least 2 groups */

6708

if (sd->flags & (SD_LOAD_BALANCE |

6708

if (sd->flags & (SD_LOAD_BALANCE |

6709

SD_BALANCE_NEWIDLE |

6709

SD_BALANCE_NEWIDLE |

6710

SD_BALANCE_FORK |

6710

SD_BALANCE_FORK |

6711

SD_BALANCE_EXEC |

6711

SD_BALANCE_EXEC |

6712

SD_SHARE_CPUPOWER |

6712

SD_SHARE_CPUPOWER |

6713

SD_SHARE_PKG_RESOURCES)) {

6713

SD_SHARE_PKG_RESOURCES)) {

6714

if (sd->groups != sd->groups->next)

6714

if (sd->groups != sd->groups->next)

6715

return 0;

6715

return 0;

6716

}

6716

}

6717

6718

/* Following flags don't use groups */

6718

/* Following flags don't use groups */

6719

if (sd->flags & (SD_WAKE_AFFINE))

6719

if (sd->flags & (SD_WAKE_AFFINE))

6720

return 0;

6720

return 0;

6721

6722

return 1;

6722

return 1;

6723

}

6723

}

6724

6725

static int

6725

static int

6726

sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)

6726

sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)

6727

{

6727

{

6728

unsigned long cflags = sd->flags, pflags = parent->flags;

6728

unsigned long cflags = sd->flags, pflags = parent->flags;

6729

6730

if (sd_degenerate(parent))

6730

if (sd_degenerate(parent))

6731

return 1;

6731

return 1;

6732

6733

if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))

6733

if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))

6734

return 0;

6734

return 0;

6735

6736

/* Flags needing groups don't count if only 1 group in parent */

6736

/* Flags needing groups don't count if only 1 group in parent */

6737

if (parent->groups == parent->groups->next) {

6737

if (parent->groups == parent->groups->next) {

6738

pflags &= ~(SD_LOAD_BALANCE |

6738

pflags &= ~(SD_LOAD_BALANCE |

6739

SD_BALANCE_NEWIDLE |

6739

SD_BALANCE_NEWIDLE |

6740

SD_BALANCE_FORK |

6740

SD_BALANCE_FORK |

6741

SD_BALANCE_EXEC |

6741

SD_BALANCE_EXEC |

6742

SD_SHARE_CPUPOWER |

6742

SD_SHARE_CPUPOWER |

6743

SD_SHARE_PKG_RESOURCES);

6743

SD_SHARE_PKG_RESOURCES);

6744

if (nr_node_ids == 1)

6744

if (nr_node_ids == 1)

6745

pflags &= ~SD_SERIALIZE;

6745

pflags &= ~SD_SERIALIZE;

6746

}

6746

}

6747

if (~cflags & pflags)

6747

if (~cflags & pflags)

6748

return 0;

6748

return 0;

6749

6750

return 1;

6750

return 1;

6751

}

6751

}

6752

6753

static void free_rootdomain(struct rcu_head *rcu)

6753

static void free_rootdomain(struct rcu_head *rcu)

6754

{

6754

{

6755

struct root_domain *rd = container_of(rcu, struct root_domain, rcu);

6755

struct root_domain *rd = container_of(rcu, struct root_domain, rcu);

6756

6757

cpupri_cleanup(&rd->cpupri);

6757

cpupri_cleanup(&rd->cpupri);

6758

free_cpumask_var(rd->rto_mask);

6758

free_cpumask_var(rd->rto_mask);

6759

free_cpumask_var(rd->online);

6759

free_cpumask_var(rd->online);

6760

free_cpumask_var(rd->span);

6760

free_cpumask_var(rd->span);

6761

kfree(rd);

6761

kfree(rd);

6762

}

6762

}

6763

6764

static void rq_attach_root(struct rq *rq, struct root_domain *rd)

6764

static void rq_attach_root(struct rq *rq, struct root_domain *rd)

6765

{

6765

{

6766

struct root_domain *old_rd = NULL;

6766

struct root_domain *old_rd = NULL;

6767

unsigned long flags;

6767

unsigned long flags;

6768

6769

raw_spin_lock_irqsave(&rq->lock, flags);

6769

raw_spin_lock_irqsave(&rq->lock, flags);

6770

6771

if (rq->rd) {

6771

if (rq->rd) {

6772

old_rd = rq->rd;

6772

old_rd = rq->rd;

6773

6774

if (cpumask_test_cpu(rq->cpu, old_rd->online))

6774

if (cpumask_test_cpu(rq->cpu, old_rd->online))

6775

set_rq_offline(rq);

6775

set_rq_offline(rq);

6776

6777

cpumask_clear_cpu(rq->cpu, old_rd->span);

6777

cpumask_clear_cpu(rq->cpu, old_rd->span);

6778

6779

/*

6779

/*

6780

* If we dont want to free the old_rt yet then

6780

* If we dont want to free the old_rt yet then

6781

* set old_rd to NULL to skip the freeing later

6781

* set old_rd to NULL to skip the freeing later

6782

* in this function:

6782

* in this function:

6783

*/

6783

*/

6784

if (!atomic_dec_and_test(&old_rd->refcount))

6784

if (!atomic_dec_and_test(&old_rd->refcount))

6785

old_rd = NULL;

6785

old_rd = NULL;

6786

}

6786

}

6787

6788

atomic_inc(&rd->refcount);

6788

atomic_inc(&rd->refcount);

6789

rq->rd = rd;

6789

rq->rd = rd;

6790

6791

cpumask_set_cpu(rq->cpu, rd->span);

6791

cpumask_set_cpu(rq->cpu, rd->span);

6792

if (cpumask_test_cpu(rq->cpu, cpu_active_mask))

6792

if (cpumask_test_cpu(rq->cpu, cpu_active_mask))

6793

set_rq_online(rq);

6793

set_rq_online(rq);

6794

6795

raw_spin_unlock_irqrestore(&rq->lock, flags);

6795

raw_spin_unlock_irqrestore(&rq->lock, flags);

6796

6797

if (old_rd)

6797

if (old_rd)

6798

call_rcu_sched(&old_rd->rcu, free_rootdomain);

6798

call_rcu_sched(&old_rd->rcu, free_rootdomain);

6799

}

6799

}

6800

6801

static int init_rootdomain(struct root_domain *rd)

6801

static int init_rootdomain(struct root_domain *rd)

6802

{

6802

{

6803

memset(rd, 0, sizeof(*rd));

6803

memset(rd, 0, sizeof(*rd));

6804

6805

if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))

6805

if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))

6806

goto out;

6806

goto out;

6807

if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))

6807

if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))

6808

goto free_span;

6808

goto free_span;

6809

if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))

6809

if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))

6810

goto free_online;

6810

goto free_online;

6811

6812

if (cpupri_init(&rd->cpupri) != 0)

6812

if (cpupri_init(&rd->cpupri) != 0)

6813

goto free_rto_mask;

6813

goto free_rto_mask;

6814

return 0;

6814

return 0;

6815

6816

free_rto_mask:

6816

free_rto_mask:

6817

free_cpumask_var(rd->rto_mask);

6817

free_cpumask_var(rd->rto_mask);

6818

free_online:

6818

free_online:

6819

free_cpumask_var(rd->online);

6819

free_cpumask_var(rd->online);

6820

free_span:

6820

free_span:

6821

free_cpumask_var(rd->span);

6821

free_cpumask_var(rd->span);

6822

out:

6822

out:

6823

return -ENOMEM;

6823

return -ENOMEM;

6824

}

6824

}

6825

6826

static void init_defrootdomain(void)

6826

static void init_defrootdomain(void)

6827

{

6827

{

6828

init_rootdomain(&def_root_domain);

6828

init_rootdomain(&def_root_domain);

6829

6830

atomic_set(&def_root_domain.refcount, 1);

6830

atomic_set(&def_root_domain.refcount, 1);

6831

}

6831

}

6832

6833

static struct root_domain *alloc_rootdomain(void)

6833

static struct root_domain *alloc_rootdomain(void)

6834

{

6834

{

6835

struct root_domain *rd;

6835

struct root_domain *rd;

6836

6837

rd = kmalloc(sizeof(*rd), GFP_KERNEL);

6837

rd = kmalloc(sizeof(*rd), GFP_KERNEL);

6838

if (!rd)

6838

if (!rd)

6839

return NULL;

6839

return NULL;

6840

6841

if (init_rootdomain(rd) != 0) {

6841

if (init_rootdomain(rd) != 0) {

6842

kfree(rd);

6842

kfree(rd);

6843

return NULL;

6843

return NULL;

6844

}

6844

}

6845

6846

return rd;

6846

return rd;

6847

}

6847

}

6848

6849

static void free_sched_groups(struct sched_group *sg, int free_sgp)

6849

static void free_sched_groups(struct sched_group *sg, int free_sgp)

6850

{

6850

{

6851

struct sched_group *tmp, *first;

6851

struct sched_group *tmp, *first;

6852

6853

if (!sg)

6853

if (!sg)

6854

return;

6854

return;

6855

6856

first = sg;

6856

first = sg;

6857

do {

6857

do {

6858

tmp = sg->next;

6858

tmp = sg->next;

6859

6860

if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))

6860

if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))

6861

kfree(sg->sgp);

6861

kfree(sg->sgp);

6862

6863

kfree(sg);

6863

kfree(sg);

6864

sg = tmp;

6864

sg = tmp;

6865

} while (sg != first);

6865

} while (sg != first);

6866

}

6866

}

6867

6868

static void free_sched_domain(struct rcu_head *rcu)

6868

static void free_sched_domain(struct rcu_head *rcu)

6869

{

6869

{

6870

struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);

6870

struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);

6871

6872

/*

6872

/*

6873

* If its an overlapping domain it has private groups, iterate and

6873

* If its an overlapping domain it has private groups, iterate and

6874

* nuke them all.

6874

* nuke them all.

6875

*/

6875

*/

6876

if (sd->flags & SD_OVERLAP) {

6876

if (sd->flags & SD_OVERLAP) {

6877

free_sched_groups(sd->groups, 1);

6877

free_sched_groups(sd->groups, 1);

6878

} else if (atomic_dec_and_test(&sd->groups->ref)) {

6878

} else if (atomic_dec_and_test(&sd->groups->ref)) {

6879

kfree(sd->groups->sgp);

6879

kfree(sd->groups->sgp);

6880

kfree(sd->groups);

6880

kfree(sd->groups);

6881

}

6881

}

6882

kfree(sd);

6882

kfree(sd);

6883

}

6883

}

6884

6885

static void destroy_sched_domain(struct sched_domain *sd, int cpu)

6885

static void destroy_sched_domain(struct sched_domain *sd, int cpu)

6886

{

6886

{

6887

call_rcu(&sd->rcu, free_sched_domain);

6887

call_rcu(&sd->rcu, free_sched_domain);

6888

}

6888

}

6889

6890

static void destroy_sched_domains(struct sched_domain *sd, int cpu)

6890

static void destroy_sched_domains(struct sched_domain *sd, int cpu)

6891

{

6891

{

6892

for (; sd; sd = sd->parent)

6892

for (; sd; sd = sd->parent)

6893

destroy_sched_domain(sd, cpu);

6893

destroy_sched_domain(sd, cpu);

6894

}

6894

}

6895

6896

/*

6896

/*

6897

* Attach the domain 'sd' to 'cpu' as its base domain. Callers must

6897

* Attach the domain 'sd' to 'cpu' as its base domain. Callers must

6898

* hold the hotplug lock.

6898

* hold the hotplug lock.

6899

*/

6899

*/

6900

static void

6900

static void

6901

cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)

6901

cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)

6902

{

6902

{

6903

struct rq *rq = cpu_rq(cpu);

6903

struct rq *rq = cpu_rq(cpu);

6904

struct sched_domain *tmp;

6904

struct sched_domain *tmp;

6905

6906

/* Remove the sched domains which do not contribute to scheduling. */

6906

/* Remove the sched domains which do not contribute to scheduling. */

6907

for (tmp = sd; tmp; ) {

6907

for (tmp = sd; tmp; ) {

6908

struct sched_domain *parent = tmp->parent;

6908

struct sched_domain *parent = tmp->parent;

6909

if (!parent)

6909

if (!parent)

6910

break;

6910

break;

6911

6912

if (sd_parent_degenerate(tmp, parent)) {

6912

if (sd_parent_degenerate(tmp, parent)) {

6913

tmp->parent = parent->parent;

6913

tmp->parent = parent->parent;

6914

if (parent->parent)

6914

if (parent->parent)

6915

parent->parent->child = tmp;

6915

parent->parent->child = tmp;

6916

destroy_sched_domain(parent, cpu);

6916

destroy_sched_domain(parent, cpu);

6917

} else

6917

} else

6918

tmp = tmp->parent;

6918

tmp = tmp->parent;

6919

}

6919

}

6920

6921

if (sd && sd_degenerate(sd)) {

6921

if (sd && sd_degenerate(sd)) {

6922

tmp = sd;

6922

tmp = sd;

6923

sd = sd->parent;

6923

sd = sd->parent;

6924

destroy_sched_domain(tmp, cpu);

6924

destroy_sched_domain(tmp, cpu);

6925

if (sd)

6925

if (sd)

6926

sd->child = NULL;

6926

sd->child = NULL;

6927

}

6927

}

6928

6929

sched_domain_debug(sd, cpu);

6929

sched_domain_debug(sd, cpu);

6930

6931

rq_attach_root(rq, rd);

6931

rq_attach_root(rq, rd);

6932

tmp = rq->sd;

6932

tmp = rq->sd;

6933

rcu_assign_pointer(rq->sd, sd);

6933

rcu_assign_pointer(rq->sd, sd);

6934

destroy_sched_domains(tmp, cpu);

6934

destroy_sched_domains(tmp, cpu);

6935

}

6935

}

6936

6937

/* cpus with isolated domains */

6937

/* cpus with isolated domains */

6938

static cpumask_var_t cpu_isolated_map;

6938

static cpumask_var_t cpu_isolated_map;

6939

6940

/* Setup the mask of cpus configured for isolated domains */

6940

/* Setup the mask of cpus configured for isolated domains */

6941

static int __init isolated_cpu_setup(char *str)

6941

static int __init isolated_cpu_setup(char *str)

6942

{

6942

{

6943

alloc_bootmem_cpumask_var(&cpu_isolated_map);

6943

alloc_bootmem_cpumask_var(&cpu_isolated_map);

6944

cpulist_parse(str, cpu_isolated_map);

6944

cpulist_parse(str, cpu_isolated_map);

6945

return 1;

6945

return 1;

6946

}

6946

}

6947

6948

__setup("isolcpus=", isolated_cpu_setup);

6948

__setup("isolcpus=", isolated_cpu_setup);

6949

6950

#define SD_NODES_PER_DOMAIN 16

6950

#define SD_NODES_PER_DOMAIN 16

6951

6952

#ifdef CONFIG_NUMA

6952

#ifdef CONFIG_NUMA

6953

6954

/**

6954

/**

6955

* find_next_best_node - find the next node to include in a sched_domain

6955

* find_next_best_node - find the next node to include in a sched_domain

6956

* @node: node whose sched_domain we're building

6956

* @node: node whose sched_domain we're building

6957

* @used_nodes: nodes already in the sched_domain

6957

* @used_nodes: nodes already in the sched_domain

6958

*

6958

*

6959

* Find the next node to include in a given scheduling domain. Simply

6959

* Find the next node to include in a given scheduling domain. Simply

6960

* finds the closest node not already in the @used_nodes map.

6960

* finds the closest node not already in the @used_nodes map.

6961

*

6961

*

6962

* Should use nodemask_t.

6962

* Should use nodemask_t.

6963

*/

6963

*/

6964

static int find_next_best_node(int node, nodemask_t *used_nodes)

6964

static int find_next_best_node(int node, nodemask_t *used_nodes)

6965

{

6965

{

6966

int i, n, val, min_val, best_node = -1;

6966

int i, n, val, min_val, best_node = -1;

6967

6968

min_val = INT_MAX;

6968

min_val = INT_MAX;

6969

6970

for (i = 0; i < nr_node_ids; i++) {

6970

for (i = 0; i < nr_node_ids; i++) {

6971

/* Start at @node */

6971

/* Start at @node */

6972

n = (node + i) % nr_node_ids;

6972

n = (node + i) % nr_node_ids;

6973

6974

if (!nr_cpus_node(n))

6974

if (!nr_cpus_node(n))

6975

continue;

6975

continue;

6976

6977

/* Skip already used nodes */

6977

/* Skip already used nodes */

6978

if (node_isset(n, *used_nodes))

6978

if (node_isset(n, *used_nodes))

6979

continue;

6979

continue;

6980

6981

/* Simple min distance search */

6981

/* Simple min distance search */

6982

val = node_distance(node, n);

6982

val = node_distance(node, n);

6983

6984

if (val < min_val) {

6984

if (val < min_val) {

6985

min_val = val;

6985

min_val = val;

6986

best_node = n;

6986

best_node = n;

6987

}

6987

}

6988

}

6988

}

6989

6990

if (best_node != -1)

6990

if (best_node != -1)

6991

node_set(best_node, *used_nodes);

6991

node_set(best_node, *used_nodes);

6992

return best_node;

6992

return best_node;

6993

}

6993

}

6994

6995

/**

6995

/**

6996

* sched_domain_node_span - get a cpumask for a node's sched_domain

6996

* sched_domain_node_span - get a cpumask for a node's sched_domain

6997

* @node: node whose cpumask we're constructing

6997

* @node: node whose cpumask we're constructing

6998

* @span: resulting cpumask

6998

* @span: resulting cpumask

6999

*

6999

*

7000

* Given a node, construct a good cpumask for its sched_domain to span. It

7000

* Given a node, construct a good cpumask for its sched_domain to span. It

7001

* should be one that prevents unnecessary balancing, but also spreads tasks

7001

* should be one that prevents unnecessary balancing, but also spreads tasks

7002

* out optimally.

7002

* out optimally.

7003

*/

7003

*/

7004

static void sched_domain_node_span(int node, struct cpumask *span)

7004

static void sched_domain_node_span(int node, struct cpumask *span)

7005

{

7005

{

7006

nodemask_t used_nodes;

7006

nodemask_t used_nodes;

7007

int i;

7007

int i;

7008

7009

cpumask_clear(span);

7009

cpumask_clear(span);

7010

nodes_clear(used_nodes);

7010

nodes_clear(used_nodes);

7011

7012

cpumask_or(span, span, cpumask_of_node(node));

7012

cpumask_or(span, span, cpumask_of_node(node));

7013

node_set(node, used_nodes);

7013

node_set(node, used_nodes);

7014

7015

for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {

7015

for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {

7016

int next_node = find_next_best_node(node, &used_nodes);

7016

int next_node = find_next_best_node(node, &used_nodes);

7017

if (next_node < 0)

7017

if (next_node < 0)

7018

break;

7018

break;

7019

cpumask_or(span, span, cpumask_of_node(next_node));

7019

cpumask_or(span, span, cpumask_of_node(next_node));

7020

}

7020

}

7021

}

7021

}

7022

7023

static const struct cpumask *cpu_node_mask(int cpu)

7023

static const struct cpumask *cpu_node_mask(int cpu)

7024

{

7024

{

7025

lockdep_assert_held(&sched_domains_mutex);

7025

lockdep_assert_held(&sched_domains_mutex);

7026

7027

sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);

7027

sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);

7028

7029

return sched_domains_tmpmask;

7029

return sched_domains_tmpmask;

7030

}

7030

}

7031

7032

static const struct cpumask *cpu_allnodes_mask(int cpu)

7032

static const struct cpumask *cpu_allnodes_mask(int cpu)

7033

{

7033

{

7034

return cpu_possible_mask;

7034

return cpu_possible_mask;

7035

}

7035

}

7036

#endif /* CONFIG_NUMA */

7036

#endif /* CONFIG_NUMA */

7037

7038

/* Topology mask callback: all CPUs on the same node as @cpu. */
static const struct cpumask *cpu_cpu_mask(int cpu)
{
	int node = cpu_to_node(cpu);

	return cpumask_of_node(node);
}

7042

7043

/* Power-savings settings for SMT and multi-core balancing; both start at 0. */
int sched_smt_power_savings = 0;
int sched_mc_power_savings = 0;

7044

7045

struct sd_data {

7045

struct sd_data {

7046

struct sched_domain **__percpu sd;

7046

struct sched_domain **__percpu sd;

7047

struct sched_group **__percpu sg;

7047

struct sched_group **__percpu sg;

7048

struct sched_group_power **__percpu sgp;

7048

struct sched_group_power **__percpu sgp;

7049

};

7049

};

7050

7051

struct s_data {

7051

struct s_data {

7052

struct sched_domain ** __percpu sd;

7052

struct sched_domain ** __percpu sd;

7053

struct root_domain *rd;

7053

struct root_domain *rd;

7054

};

7054

};

7055

7056

/*
 * How far __visit_domain_allocation_hell() got, i.e. where
 * __free_domain_allocs() has to start unwinding.  The order matters:
 * the free routine falls through from each state into the ones below it.
 */
enum s_alloc {
	sa_rootdomain,	/* everything allocated, root domain included */
	sa_sd,		/* d->sd allocated, root domain allocation failed */
	sa_sd_storage,	/* at most the per-level storage is allocated */
	sa_none,	/* nothing to free */
};

7062

7063

struct sched_domain_topology_level;

7063

struct sched_domain_topology_level;

7064

7065

typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);

7065

typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);

7066

typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);

7066

typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);

7067

7068

#define SDTL_OVERLAP 0x01

7068

#define SDTL_OVERLAP 0x01

7069

7070

struct sched_domain_topology_level {

7070

struct sched_domain_topology_level {

7071

sched_domain_init_f init;

7071

sched_domain_init_f init;

7072

sched_domain_mask_f mask;

7072

sched_domain_mask_f mask;

7073

int flags;

7073

int flags;

7074

struct sd_data data;

7074

struct sd_data data;

7075

};

7075

};

7076

7077

static int

7077

static int

7078

build_overlap_sched_groups(struct sched_domain *sd, int cpu)

7078

build_overlap_sched_groups(struct sched_domain *sd, int cpu)

7079

{

7079

{

7080

struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;

7080

struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;

7081

const struct cpumask *span = sched_domain_span(sd);

7081

const struct cpumask *span = sched_domain_span(sd);

7082

struct cpumask *covered = sched_domains_tmpmask;

7082

struct cpumask *covered = sched_domains_tmpmask;

7083

struct sd_data *sdd = sd->private;

7083

struct sd_data *sdd = sd->private;

7084

struct sched_domain *child;

7084

struct sched_domain *child;

7085

int i;

7085

int i;

7086

7087

cpumask_clear(covered);

7087

cpumask_clear(covered);

7088

7089

for_each_cpu(i, span) {

7089

for_each_cpu(i, span) {

7090

struct cpumask *sg_span;

7090

struct cpumask *sg_span;

7091

7092

if (cpumask_test_cpu(i, covered))

7092

if (cpumask_test_cpu(i, covered))

7093

continue;

7093

continue;

7094

7095

sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),

7095

sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),

7096

GFP_KERNEL, cpu_to_node(i));

7096

GFP_KERNEL, cpu_to_node(i));

7097

7098

if (!sg)

7098

if (!sg)

7099

goto fail;

7099

goto fail;

7100

7101

sg_span = sched_group_cpus(sg);

7101

sg_span = sched_group_cpus(sg);

7102

7103

child = *per_cpu_ptr(sdd->sd, i);

7103

child = *per_cpu_ptr(sdd->sd, i);

7104

if (child->child) {

7104

if (child->child) {

7105

child = child->child;

7105

child = child->child;

7106

cpumask_copy(sg_span, sched_domain_span(child));

7106

cpumask_copy(sg_span, sched_domain_span(child));

7107

} else

7107

} else

7108

cpumask_set_cpu(i, sg_span);

7108

cpumask_set_cpu(i, sg_span);

7109

7110

cpumask_or(covered, covered, sg_span);

7110

cpumask_or(covered, covered, sg_span);

7111

7112

sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));

7112

sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));

7113

atomic_inc(&sg->sgp->ref);

7113

atomic_inc(&sg->sgp->ref);

7114

7115

if (cpumask_test_cpu(cpu, sg_span))

7115

if (cpumask_test_cpu(cpu, sg_span))

7116

groups = sg;

7116

groups = sg;

7117

7118

if (!first)

7118

if (!first)

7119

first = sg;

7119

first = sg;

7120

if (last)

7120

if (last)

7121

last->next = sg;

7121

last->next = sg;

7122

last = sg;

7122

last = sg;

7123

last->next = first;

7123

last->next = first;

7124

}

7124

}

7125

sd->groups = groups;

7125

sd->groups = groups;

7126

7127

return 0;

7127

return 0;

7128

7129

fail:

7129

fail:

7130

free_sched_groups(first, 0);

7130

free_sched_groups(first, 0);

7131

7132

return -ENOMEM;

7132

return -ENOMEM;

7133

}

7133

}

7134

7135

static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)

7135

static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)

7136

{

7136

{

7137

struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);

7137

struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);

7138

struct sched_domain *child = sd->child;

7138

struct sched_domain *child = sd->child;

7139

7140

if (child)

7140

if (child)

7141

cpu = cpumask_first(sched_domain_span(child));

7141

cpu = cpumask_first(sched_domain_span(child));

7142

7143

if (sg) {

7143

if (sg) {

7144

*sg = *per_cpu_ptr(sdd->sg, cpu);

7144

*sg = *per_cpu_ptr(sdd->sg, cpu);

7145

(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);

7145

(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);

7146

atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */

7146

atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */

7147

}

7147

}

7148

7149

return cpu;

7149

return cpu;

7150

}

7150

}

7151

7152

/*

7152

/*

7153

* build_sched_groups will build a circular linked list of the groups

7153

* build_sched_groups will build a circular linked list of the groups

7154

* covered by the given span, and will set each group's ->cpumask correctly,

7154

* covered by the given span, and will set each group's ->cpumask correctly,

7155

* and ->cpu_power to 0.

7155

* and ->cpu_power to 0.

7156

*

7156

*

7157

* Assumes the sched_domain tree is fully constructed

7157

* Assumes the sched_domain tree is fully constructed

7158

*/

7158

*/

7159

static int

7159

static int

7160

build_sched_groups(struct sched_domain *sd, int cpu)

7160

build_sched_groups(struct sched_domain *sd, int cpu)

7161

{

7161

{

7162

struct sched_group *first = NULL, *last = NULL;

7162

struct sched_group *first = NULL, *last = NULL;

7163

struct sd_data *sdd = sd->private;

7163

struct sd_data *sdd = sd->private;

7164

const struct cpumask *span = sched_domain_span(sd);

7164

const struct cpumask *span = sched_domain_span(sd);

7165

struct cpumask *covered;

7165

struct cpumask *covered;

7166

int i;

7166

int i;

7167

7168

get_group(cpu, sdd, &sd->groups);

7168

get_group(cpu, sdd, &sd->groups);

7169

atomic_inc(&sd->groups->ref);

7169

atomic_inc(&sd->groups->ref);

7170

7171

if (cpu != cpumask_first(sched_domain_span(sd)))

7171

if (cpu != cpumask_first(sched_domain_span(sd)))

7172

return 0;

7172

return 0;

7173

7174

lockdep_assert_held(&sched_domains_mutex);

7174

lockdep_assert_held(&sched_domains_mutex);

7175

covered = sched_domains_tmpmask;

7175

covered = sched_domains_tmpmask;

7176

7177

cpumask_clear(covered);

7177

cpumask_clear(covered);

7178

7179

for_each_cpu(i, span) {

7179

for_each_cpu(i, span) {

7180

struct sched_group *sg;

7180

struct sched_group *sg;

7181

int group = get_group(i, sdd, &sg);

7181

int group = get_group(i, sdd, &sg);

7182

int j;

7182

int j;

7183

7184

if (cpumask_test_cpu(i, covered))

7184

if (cpumask_test_cpu(i, covered))

7185

continue;

7185

continue;

7186

7187

cpumask_clear(sched_group_cpus(sg));

7187

cpumask_clear(sched_group_cpus(sg));

7188

sg->sgp->power = 0;

7188

sg->sgp->power = 0;

7189

7190

for_each_cpu(j, span) {

7190

for_each_cpu(j, span) {

7191

if (get_group(j, sdd, NULL) != group)

7191

if (get_group(j, sdd, NULL) != group)

7192

continue;

7192

continue;

7193

7194

cpumask_set_cpu(j, covered);

7194

cpumask_set_cpu(j, covered);

7195

cpumask_set_cpu(j, sched_group_cpus(sg));

7195

cpumask_set_cpu(j, sched_group_cpus(sg));

7196

}

7196

}

7197

7198

if (!first)

7198

if (!first)

7199

first = sg;

7199

first = sg;

7200

if (last)

7200

if (last)

7201

last->next = sg;

7201

last->next = sg;

7202

last = sg;

7202

last = sg;

7203

}

7203

}

7204

last->next = first;

7204

last->next = first;

7205

7206

return 0;

7206

return 0;

7207

}

7207

}

7208

7209

/*

7209

/*

7210

* Initialize sched groups cpu_power.

7210

* Initialize sched groups cpu_power.

7211

*

7211

*

7212

* cpu_power indicates the capacity of sched group, which is used while

7212

* cpu_power indicates the capacity of sched group, which is used while

7213

* distributing the load between different sched groups in a sched domain.

7213

* distributing the load between different sched groups in a sched domain.

7214

* Typically cpu_power for all the groups in a sched domain will be same unless

7214

* Typically cpu_power for all the groups in a sched domain will be same unless

7215

* there are asymmetries in the topology. If there are asymmetries, group

7215

* there are asymmetries in the topology. If there are asymmetries, group

7216

* having more cpu_power will pickup more load compared to the group having

7216

* having more cpu_power will pickup more load compared to the group having

7217

* less cpu_power.

7217

* less cpu_power.

7218

*/

7218

*/

7219

static void init_sched_groups_power(int cpu, struct sched_domain *sd)

7219

static void init_sched_groups_power(int cpu, struct sched_domain *sd)

7220

{

7220

{

7221

struct sched_group *sg = sd->groups;

7221

struct sched_group *sg = sd->groups;

7222

7223

WARN_ON(!sd || !sg);

7223

WARN_ON(!sd || !sg);

7224

7225

do {

7225

do {

7226

sg->group_weight = cpumask_weight(sched_group_cpus(sg));

7226

sg->group_weight = cpumask_weight(sched_group_cpus(sg));

7227

sg = sg->next;

7227

sg = sg->next;

7228

} while (sg != sd->groups);

7228

} while (sg != sd->groups);

7229

7230

if (cpu != group_first_cpu(sg))

7230

if (cpu != group_first_cpu(sg))

7231

return;

7231

return;

7232

7233

update_group_power(sd, cpu);

7233

update_group_power(sd, cpu);

7234

}

7234

}

7235

7236

/*

7236

/*

7237

* Initializers for schedule domains

7237

* Initializers for schedule domains

7238

* Non-inlined to reduce accumulated stack pressure in build_sched_domains()

7238

* Non-inlined to reduce accumulated stack pressure in build_sched_domains()

7239

*/

7239

*/

7240

7241

#ifdef CONFIG_SCHED_DEBUG

7241

#ifdef CONFIG_SCHED_DEBUG

7242

# define SD_INIT_NAME(sd, type) sd->name = #type

7242

# define SD_INIT_NAME(sd, type) sd->name = #type

7243

#else

7243

#else

7244

# define SD_INIT_NAME(sd, type) do { } while (0)

7244

# define SD_INIT_NAME(sd, type) do { } while (0)

7245

#endif

7245

#endif

7246

7247

#define SD_INIT_FUNC(type) \

7247

#define SD_INIT_FUNC(type) \

7248

static noinline struct sched_domain * \

7248

static noinline struct sched_domain * \

7249

sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \

7249

sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \

7250

{ \

7250

{ \

7251

struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \

7251

struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \

7252

*sd = SD_##type##_INIT; \

7252

*sd = SD_##type##_INIT; \

7253

SD_INIT_NAME(sd, type); \

7253

SD_INIT_NAME(sd, type); \

7254

sd->private = &tl->data; \

7254

sd->private = &tl->data; \

7255

return sd; \

7255

return sd; \

7256

}

7256

}

7257

7258

SD_INIT_FUNC(CPU)

7258

SD_INIT_FUNC(CPU)

7259

#ifdef CONFIG_NUMA

7259

#ifdef CONFIG_NUMA

7260

SD_INIT_FUNC(ALLNODES)

7260

SD_INIT_FUNC(ALLNODES)

7261

SD_INIT_FUNC(NODE)

7261

SD_INIT_FUNC(NODE)

7262

#endif

7262

#endif

7263

#ifdef CONFIG_SCHED_SMT

7263

#ifdef CONFIG_SCHED_SMT

7264

SD_INIT_FUNC(SIBLING)

7264

SD_INIT_FUNC(SIBLING)

7265

#endif

7265

#endif

7266

#ifdef CONFIG_SCHED_MC

7266

#ifdef CONFIG_SCHED_MC

7267

SD_INIT_FUNC(MC)

7267

SD_INIT_FUNC(MC)

7268

#endif

7268

#endif

7269

#ifdef CONFIG_SCHED_BOOK

7269

#ifdef CONFIG_SCHED_BOOK

7270

SD_INIT_FUNC(BOOK)

7270

SD_INIT_FUNC(BOOK)

7271

#endif

7271

#endif

7272

7273

static int default_relax_domain_level = -1;

7273

static int default_relax_domain_level = -1;

7274

int sched_domain_level_max;

7274

int sched_domain_level_max;

7275

7276

static int __init setup_relax_domain_level(char *str)

7276

static int __init setup_relax_domain_level(char *str)

7277

{

7277

{

7278

unsigned long val;

7278

unsigned long val;

7279

7280

val = simple_strtoul(str, NULL, 0);

7280

val = simple_strtoul(str, NULL, 0);

7281

if (val < sched_domain_level_max)

7281

if (val < sched_domain_level_max)

7282

default_relax_domain_level = val;

7282

default_relax_domain_level = val;

7283

7284

return 1;

7284

return 1;

7285

}

7285

}

7286

__setup("relax_domain_level=", setup_relax_domain_level);

7286

__setup("relax_domain_level=", setup_relax_domain_level);

7287

7288

static void set_domain_attribute(struct sched_domain *sd,

7288

static void set_domain_attribute(struct sched_domain *sd,

7289

struct sched_domain_attr *attr)

7289

struct sched_domain_attr *attr)

7290

{

7290

{

7291

int request;

7291

int request;

7292

7293

if (!attr || attr->relax_domain_level < 0) {

7293

if (!attr || attr->relax_domain_level < 0) {

7294

if (default_relax_domain_level < 0)

7294

if (default_relax_domain_level < 0)

7295

return;

7295

return;

7296

else

7296

else

7297

request = default_relax_domain_level;

7297

request = default_relax_domain_level;

7298

} else

7298

} else

7299

request = attr->relax_domain_level;

7299

request = attr->relax_domain_level;

7300

if (request < sd->level) {

7300

if (request < sd->level) {

7301

/* turn off idle balance on this domain */

7301

/* turn off idle balance on this domain */

7302

sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);

7302

sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);

7303

} else {

7303

} else {

7304

/* turn on idle balance on this domain */

7304

/* turn on idle balance on this domain */

7305

sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);

7305

sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);

7306

}

7306

}

7307

}

7307

}

7308

7309

static void __sdt_free(const struct cpumask *cpu_map);

7309

static void __sdt_free(const struct cpumask *cpu_map);

7310

static int __sdt_alloc(const struct cpumask *cpu_map);

7310

static int __sdt_alloc(const struct cpumask *cpu_map);

7311

7312

static void __free_domain_allocs(struct s_data *d, enum s_alloc what,

7312

static void __free_domain_allocs(struct s_data *d, enum s_alloc what,

7313

const struct cpumask *cpu_map)

7313

const struct cpumask *cpu_map)

7314

{

7314

{

7315

switch (what) {

7315

switch (what) {

7316

case sa_rootdomain:

7316

case sa_rootdomain:

7317

if (!atomic_read(&d->rd->refcount))

7317

if (!atomic_read(&d->rd->refcount))

7318

free_rootdomain(&d->rd->rcu); /* fall through */

7318

free_rootdomain(&d->rd->rcu); /* fall through */

7319

case sa_sd:

7319

case sa_sd:

7320

free_percpu(d->sd); /* fall through */

7320

free_percpu(d->sd); /* fall through */

7321

case sa_sd_storage:

7321

case sa_sd_storage:

7322

__sdt_free(cpu_map); /* fall through */

7322

__sdt_free(cpu_map); /* fall through */

7323

case sa_none:

7323

case sa_none:

7324

break;

7324

break;

7325

}

7325

}

7326

}

7326

}

7327

7328

static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,

7328

static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,

7329

const struct cpumask *cpu_map)

7329

const struct cpumask *cpu_map)

7330

{

7330

{

7331

memset(d, 0, sizeof(*d));

7331

memset(d, 0, sizeof(*d));

7332

7333

if (__sdt_alloc(cpu_map))

7333

if (__sdt_alloc(cpu_map))

7334

return sa_sd_storage;

7334

return sa_sd_storage;

7335

d->sd = alloc_percpu(struct sched_domain *);

7335

d->sd = alloc_percpu(struct sched_domain *);

7336

if (!d->sd)

7336

if (!d->sd)

7337

return sa_sd_storage;

7337

return sa_sd_storage;

7338

d->rd = alloc_rootdomain();

7338

d->rd = alloc_rootdomain();

7339

if (!d->rd)

7339

if (!d->rd)

7340

return sa_sd;

7340

return sa_sd;

7341

return sa_rootdomain;

7341

return sa_rootdomain;

7342

}

7342

}

7343

7344

/*

7344

/*

7345

* NULL the sd_data elements we've used to build the sched_domain and

7345

* NULL the sd_data elements we've used to build the sched_domain and

7346

* sched_group structure so that the subsequent __free_domain_allocs()

7346

* sched_group structure so that the subsequent __free_domain_allocs()

7347

* will not free the data we're using.

7347

* will not free the data we're using.

7348

*/

7348

*/

7349

static void claim_allocations(int cpu, struct sched_domain *sd)

7349

static void claim_allocations(int cpu, struct sched_domain *sd)

7350

{

7350

{

7351

struct sd_data *sdd = sd->private;

7351

struct sd_data *sdd = sd->private;

7352

7353

WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);

7353

WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);

7354

*per_cpu_ptr(sdd->sd, cpu) = NULL;

7354

*per_cpu_ptr(sdd->sd, cpu) = NULL;

7355

7356

if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))

7356

if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))

7357

*per_cpu_ptr(sdd->sg, cpu) = NULL;

7357

*per_cpu_ptr(sdd->sg, cpu) = NULL;

7358

7359

if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))

7359

if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))

7360

*per_cpu_ptr(sdd->sgp, cpu) = NULL;

7360

*per_cpu_ptr(sdd->sgp, cpu) = NULL;

7361

}

7361

}

7362

7363

#ifdef CONFIG_SCHED_SMT
/*
 * Topology mask for the SIBLING level: whatever
 * topology_thread_cpumask() reports for @cpu.
 */
static const struct cpumask *cpu_smt_mask(int cpu)
{
	return topology_thread_cpumask(cpu);
}
#endif

7369

7370

/*

7370

/*

7371

* Topology list, bottom-up.

7371

* Topology list, bottom-up.

7372

*/

7372

*/

7373

static struct sched_domain_topology_level default_topology[] = {

7373

static struct sched_domain_topology_level default_topology[] = {

7374

#ifdef CONFIG_SCHED_SMT

7374

#ifdef CONFIG_SCHED_SMT

7375

{ sd_init_SIBLING, cpu_smt_mask, },

7375

{ sd_init_SIBLING, cpu_smt_mask, },

7376

#endif

7376

#endif

7377

#ifdef CONFIG_SCHED_MC

7377

#ifdef CONFIG_SCHED_MC

7378

{ sd_init_MC, cpu_coregroup_mask, },

7378

{ sd_init_MC, cpu_coregroup_mask, },

7379

#endif

7379

#endif

7380

#ifdef CONFIG_SCHED_BOOK

7380

#ifdef CONFIG_SCHED_BOOK

7381

{ sd_init_BOOK, cpu_book_mask, },

7381

{ sd_init_BOOK, cpu_book_mask, },

7382

#endif

7382

#endif

7383

{ sd_init_CPU, cpu_cpu_mask, },

7383

{ sd_init_CPU, cpu_cpu_mask, },

7384

#ifdef CONFIG_NUMA

7384

#ifdef CONFIG_NUMA

7385

{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },

7385

{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },

7386

{ sd_init_ALLNODES, cpu_allnodes_mask, },

7386

{ sd_init_ALLNODES, cpu_allnodes_mask, },

7387

#endif

7387

#endif

7388

{ NULL, },

7388

{ NULL, },

7389

};

7389

};

7390

7391

static struct sched_domain_topology_level *sched_domain_topology = default_topology;

7391

static struct sched_domain_topology_level *sched_domain_topology = default_topology;

7392

7393

static int __sdt_alloc(const struct cpumask *cpu_map)

7393

static int __sdt_alloc(const struct cpumask *cpu_map)

7394

{

7394

{

7395

struct sched_domain_topology_level *tl;

7395

struct sched_domain_topology_level *tl;

7396

int j;

7396

int j;

7397

7398

for (tl = sched_domain_topology; tl->init; tl++) {

7398

for (tl = sched_domain_topology; tl->init; tl++) {

7399

struct sd_data *sdd = &tl->data;

7399

struct sd_data *sdd = &tl->data;

7400

7401

sdd->sd = alloc_percpu(struct sched_domain *);

7401

sdd->sd = alloc_percpu(struct sched_domain *);

7402

if (!sdd->sd)

7402

if (!sdd->sd)

7403

return -ENOMEM;

7403

return -ENOMEM;

7404

7405

sdd->sg = alloc_percpu(struct sched_group *);

7405

sdd->sg = alloc_percpu(struct sched_group *);

7406

if (!sdd->sg)

7406

if (!sdd->sg)

7407

return -ENOMEM;

7407

return -ENOMEM;

7408

7409

sdd->sgp = alloc_percpu(struct sched_group_power *);

7409

sdd->sgp = alloc_percpu(struct sched_group_power *);

7410

if (!sdd->sgp)

7410

if (!sdd->sgp)

7411

return -ENOMEM;

7411

return -ENOMEM;

7412

7413

for_each_cpu(j, cpu_map) {

7413

for_each_cpu(j, cpu_map) {

7414

struct sched_domain *sd;

7414

struct sched_domain *sd;

7415

struct sched_group *sg;

7415

struct sched_group *sg;

7416

struct sched_group_power *sgp;

7416

struct sched_group_power *sgp;

7417

7418

sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),

7418

sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),

7419

GFP_KERNEL, cpu_to_node(j));

7419

GFP_KERNEL, cpu_to_node(j));

7420

if (!sd)

7420

if (!sd)

7421

return -ENOMEM;

7421

return -ENOMEM;

7422

7423

*per_cpu_ptr(sdd->sd, j) = sd;

7423

*per_cpu_ptr(sdd->sd, j) = sd;

7424

7425

sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),

7425

sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),

7426

GFP_KERNEL, cpu_to_node(j));

7426

GFP_KERNEL, cpu_to_node(j));

7427

if (!sg)

7427

if (!sg)

7428

return -ENOMEM;

7428

return -ENOMEM;

7429

7430

*per_cpu_ptr(sdd->sg, j) = sg;

7430

*per_cpu_ptr(sdd->sg, j) = sg;

7431

7432

sgp = kzalloc_node(sizeof(struct sched_group_power),

7432

sgp = kzalloc_node(sizeof(struct sched_group_power),

7433

GFP_KERNEL, cpu_to_node(j));

7433

GFP_KERNEL, cpu_to_node(j));

7434

if (!sgp)

7434

if (!sgp)

7435

return -ENOMEM;

7435

return -ENOMEM;

7436

7437

*per_cpu_ptr(sdd->sgp, j) = sgp;

7437

*per_cpu_ptr(sdd->sgp, j) = sgp;

7438

}

7438

}

7439

}

7439

}

7440

7441

return 0;

7441

return 0;

7442

}

7442

}

7443

7444

static void __sdt_free(const struct cpumask *cpu_map)

7444

static void __sdt_free(const struct cpumask *cpu_map)

7445

{

7445

{

7446

struct sched_domain_topology_level *tl;

7446

struct sched_domain_topology_level *tl;

7447

int j;

7447

int j;

7448

7449

for (tl = sched_domain_topology; tl->init; tl++) {

7449

for (tl = sched_domain_topology; tl->init; tl++) {

7450

struct sd_data *sdd = &tl->data;

7450

struct sd_data *sdd = &tl->data;

7451

7452

for_each_cpu(j, cpu_map) {

7452

for_each_cpu(j, cpu_map) {

7453

struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);

7453

struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);

7454

if (sd && (sd->flags & SD_OVERLAP))

7454

if (sd && (sd->flags & SD_OVERLAP))

7455

free_sched_groups(sd->groups, 0);

7455

free_sched_groups(sd->groups, 0);

7456

kfree(*per_cpu_ptr(sdd->sd, j));

7456

kfree(*per_cpu_ptr(sdd->sd, j));

7457

kfree(*per_cpu_ptr(sdd->sg, j));

7457

kfree(*per_cpu_ptr(sdd->sg, j));

7458

kfree(*per_cpu_ptr(sdd->sgp, j));

7458

kfree(*per_cpu_ptr(sdd->sgp, j));

7459

}

7459

}

7460

free_percpu(sdd->sd);

7460

free_percpu(sdd->sd);

7461

free_percpu(sdd->sg);

7461

free_percpu(sdd->sg);

7462

free_percpu(sdd->sgp);

7462

free_percpu(sdd->sgp);

7463

}

7463

}

7464

}

7464

}

7465

7466

struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,

7466

struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,

7467

struct s_data *d, const struct cpumask *cpu_map,

7467

struct s_data *d, const struct cpumask *cpu_map,

7468

struct sched_domain_attr *attr, struct sched_domain *child,

7468

struct sched_domain_attr *attr, struct sched_domain *child,

7469

int cpu)

7469

int cpu)

7470

{

7470

{

7471

struct sched_domain *sd = tl->init(tl, cpu);

7471

struct sched_domain *sd = tl->init(tl, cpu);

7472

if (!sd)

7472

if (!sd)

7473

return child;

7473

return child;

7474

7475

set_domain_attribute(sd, attr);

7475

set_domain_attribute(sd, attr);

7476

cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));

7476

cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));

7477

if (child) {

7477

if (child) {

7478

sd->level = child->level + 1;

7478

sd->level = child->level + 1;

7479

sched_domain_level_max = max(sched_domain_level_max, sd->level);

7479

sched_domain_level_max = max(sched_domain_level_max, sd->level);

7480

child->parent = sd;

7480

child->parent = sd;

7481

}

7481

}

7482

sd->child = child;

7482

sd->child = child;

7483

7484

return sd;

7484

return sd;

7485

}

7485

}

7486

7487

/*

7487

/*

7488

* Build sched domains for a given set of cpus and attach the sched domains

7488

* Build sched domains for a given set of cpus and attach the sched domains

7489

* to the individual cpus

7489

* to the individual cpus

7490

*/

7490

*/

7491

static int build_sched_domains(const struct cpumask *cpu_map,

7491

static int build_sched_domains(const struct cpumask *cpu_map,

7492

struct sched_domain_attr *attr)

7492

struct sched_domain_attr *attr)

7493

{

7493

{

7494

enum s_alloc alloc_state = sa_none;

7494

enum s_alloc alloc_state = sa_none;

7495

struct sched_domain *sd;

7495

struct sched_domain *sd;

7496

struct s_data d;

7496

struct s_data d;

7497

int i, ret = -ENOMEM;

7497

int i, ret = -ENOMEM;

7498

7499

alloc_state = __visit_domain_allocation_hell(&d, cpu_map);

7499

alloc_state = __visit_domain_allocation_hell(&d, cpu_map);

7500

if (alloc_state != sa_rootdomain)

7500

if (alloc_state != sa_rootdomain)

7501

goto error;

7501

goto error;

7502

7503

/* Set up domains for cpus specified by the cpu_map. */

7503

/* Set up domains for cpus specified by the cpu_map. */

7504

for_each_cpu(i, cpu_map) {

7504

for_each_cpu(i, cpu_map) {

7505

struct sched_domain_topology_level *tl;

7505

struct sched_domain_topology_level *tl;

7506

7507

sd = NULL;

7507

sd = NULL;

7508

for (tl = sched_domain_topology; tl->init; tl++) {

7508

for (tl = sched_domain_topology; tl->init; tl++) {

7509

sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);

7509

sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);

7510

if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))

7510

if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))

7511

sd->flags |= SD_OVERLAP;

7511

sd->flags |= SD_OVERLAP;

7512

if (cpumask_equal(cpu_map, sched_domain_span(sd)))

7512

if (cpumask_equal(cpu_map, sched_domain_span(sd)))

7513

break;

7513

break;

7514

}

7514

}

7515

7516

while (sd->child)

7516

while (sd->child)

7517

sd = sd->child;

7517

sd = sd->child;

7518

7519

*per_cpu_ptr(d.sd, i) = sd;

7519

*per_cpu_ptr(d.sd, i) = sd;

7520

}

7520

}

7521

7522

/* Build the groups for the domains */

7522

/* Build the groups for the domains */

7523

for_each_cpu(i, cpu_map) {

7523

for_each_cpu(i, cpu_map) {

7524

for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {

7524

for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {

7525

sd->span_weight = cpumask_weight(sched_domain_span(sd));

7525

sd->span_weight = cpumask_weight(sched_domain_span(sd));

7526

if (sd->flags & SD_OVERLAP) {

7526

if (sd->flags & SD_OVERLAP) {

7527

if (build_overlap_sched_groups(sd, i))

7527

if (build_overlap_sched_groups(sd, i))

7528

goto error;

7528

goto error;

7529

} else {

7529

} else {

7530

if (build_sched_groups(sd, i))

7530

if (build_sched_groups(sd, i))

7531

goto error;

7531

goto error;

7532

}

7532

}

7533

}

7533

}

7534

}

7534

}

7535

7536

/* Calculate CPU power for physical packages and nodes */

7536

/* Calculate CPU power for physical packages and nodes */

7537

for (i = nr_cpumask_bits-1; i >= 0; i--) {

7537

for (i = nr_cpumask_bits-1; i >= 0; i--) {

7538

if (!cpumask_test_cpu(i, cpu_map))

7538

if (!cpumask_test_cpu(i, cpu_map))

7539

continue;

7539

continue;

7540

7541

for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {

7541

for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {

7542

claim_allocations(i, sd);

7542

claim_allocations(i, sd);

7543

init_sched_groups_power(i, sd);

7543

init_sched_groups_power(i, sd);

7544

}

7544

}

7545

}

7545

}

7546

7547

/* Attach the domains */

7547

/* Attach the domains */

7548

rcu_read_lock();

7548

rcu_read_lock();

7549

for_each_cpu(i, cpu_map) {

7549

for_each_cpu(i, cpu_map) {

7550

sd = *per_cpu_ptr(d.sd, i);

7550

sd = *per_cpu_ptr(d.sd, i);

7551

cpu_attach_domain(sd, d.rd, i);

7551

cpu_attach_domain(sd, d.rd, i);

7552

}

7552

}

7553

rcu_read_unlock();

7553

rcu_read_unlock();

7554

7555

ret = 0;

7555

ret = 0;

7556

error:

7556

error:

7557

__free_domain_allocs(&d, alloc_state, cpu_map);

7557

__free_domain_allocs(&d, alloc_state, cpu_map);

7558

return ret;

7558

return ret;

7559

}

7559

}

7560

7561

/* current sched domains */
static cpumask_var_t *doms_cur;
/* number of sched domains in 'doms_cur' */
static int ndoms_cur;
/* attributes of custom domains in 'doms_cur' */
static struct sched_domain_attr *dattr_cur;

/*
 * Special case: If a kmalloc of a doms_cur partition (array of
 * cpumask) fails, then fallback to a single sched domain,
 * as determined by the single cpumask fallback_doms.
 */
static cpumask_var_t fallback_doms;

7572

7573

/*
 * arch_update_cpu_topology lets virtualized architectures update the
 * cpu core maps. It is supposed to return 1 if the topology changed
 * or 0 if it stayed the same.
 *
 * This weak default reports "unchanged"; an architecture can override it.
 */
int __attribute__((weak)) arch_update_cpu_topology(void)
{
	return 0;
}

7582

7583

cpumask_var_t *alloc_sched_domains(unsigned int ndoms)

7583

cpumask_var_t *alloc_sched_domains(unsigned int ndoms)

7584

{

7584

{

7585

int i;

7585

int i;

7586

cpumask_var_t *doms;

7586

cpumask_var_t *doms;

7587

7588

doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);

7588

doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);

7589

if (!doms)

7589

if (!doms)

7590

return NULL;

7590

return NULL;

7591

for (i = 0; i < ndoms; i++) {

7591

for (i = 0; i < ndoms; i++) {

7592

if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {

7592

if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {

7593

free_sched_domains(doms, i);

7593

free_sched_domains(doms, i);

7594

return NULL;

7594

return NULL;

7595

}

7595

}

7596

}

7596

}

7597

return doms;

7597

return doms;

7598

}

7598

}

7599

7600

/* Free the first @ndoms cpumasks of @doms, then the array itself. */
void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
{
	unsigned int n;

	for (n = 0; n < ndoms; n++) {
		free_cpumask_var(doms[n]);
	}

	kfree(doms);
}

7607

7608

/*

7608

/*

7609

* Set up scheduler domains and groups. Callers must hold the hotplug lock.

7609

* Set up scheduler domains and groups. Callers must hold the hotplug lock.

7610

* For now this just excludes isolated cpus, but could be used to

7610

* For now this just excludes isolated cpus, but could be used to

7611

* exclude other special cases in the future.

7611

* exclude other special cases in the future.

7612

*/

7612

*/

7613

static int init_sched_domains(const struct cpumask *cpu_map)

7613

static int init_sched_domains(const struct cpumask *cpu_map)

7614

{

7614

{

7615

int err;

7615

int err;

7616

7617

arch_update_cpu_topology();

7617

arch_update_cpu_topology();

7618

ndoms_cur = 1;

7618

ndoms_cur = 1;

7619

doms_cur = alloc_sched_domains(ndoms_cur);

7619

doms_cur = alloc_sched_domains(ndoms_cur);

7620

if (!doms_cur)

7620

if (!doms_cur)

7621

doms_cur = &fallback_doms;

7621

doms_cur = &fallback_doms;

7622

cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);

7622

cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);

7623

dattr_cur = NULL;

7623

dattr_cur = NULL;

7624

err = build_sched_domains(doms_cur[0], NULL);

7624

err = build_sched_domains(doms_cur[0], NULL);

7625

register_sched_domain_sysctl();

7625

register_sched_domain_sysctl();

7626

7627

return err;

7627

return err;

7628

}

7628

}

7629

7630

/*

7630

/*

7631

* Detach sched domains from a group of cpus specified in cpu_map

7631

* Detach sched domains from a group of cpus specified in cpu_map

7632

* These cpus will now be attached to the NULL domain

7632

* These cpus will now be attached to the NULL domain

7633

*/

7633

*/

7634

static void detach_destroy_domains(const struct cpumask *cpu_map)

7634

static void detach_destroy_domains(const struct cpumask *cpu_map)

7635

{

7635

{

7636

int i;

7636

int i;

7637

7638

rcu_read_lock();

7638

rcu_read_lock();

7639

for_each_cpu(i, cpu_map)

7639

for_each_cpu(i, cpu_map)

7640

cpu_attach_domain(NULL, &def_root_domain, i);

7640

cpu_attach_domain(NULL, &def_root_domain, i);

7641

rcu_read_unlock();

7641

rcu_read_unlock();

7642

}

7642

}

7643

7644

/* handle null as "default" */

7644

/* handle null as "default" */

7645

static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,

7645

static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,

7646

struct sched_domain_attr *new, int idx_new)

7646

struct sched_domain_attr *new, int idx_new)

7647

{

7647

{

7648

struct sched_domain_attr tmp;

7648

struct sched_domain_attr tmp;

7649

7650

/* fast path */

7650

/* fast path */

7651

if (!new && !cur)

7651

if (!new && !cur)

7652

return 1;

7652

return 1;

7653

7654

tmp = SD_ATTR_INIT;

7654

tmp = SD_ATTR_INIT;

7655

return !memcmp(cur ? (cur + idx_cur) : &tmp,

7655

return !memcmp(cur ? (cur + idx_cur) : &tmp,

7656

new ? (new + idx_new) : &tmp,

7656

new ? (new + idx_new) : &tmp,

7657

sizeof(struct sched_domain_attr));

7657

sizeof(struct sched_domain_attr));

7658

}

7658

}

7659

7660

/*

7660

/*

7661

* Partition sched domains as specified by the 'ndoms_new'

7661

* Partition sched domains as specified by the 'ndoms_new'

7662

* cpumasks in the array doms_new[] of cpumasks. This compares

7662

* cpumasks in the array doms_new[] of cpumasks. This compares

7663

* doms_new[] to the current sched domain partitioning, doms_cur[].

7663

* doms_new[] to the current sched domain partitioning, doms_cur[].

7664

* It destroys each deleted domain and builds each new domain.

7664

* It destroys each deleted domain and builds each new domain.

7665

*

7665

*

7666

* 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.

7666

* 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.

7667

* The masks don't intersect (don't overlap.) We should setup one

7667

* The masks don't intersect (don't overlap.) We should setup one

7668

* sched domain for each mask. CPUs not in any of the cpumasks will

7668

* sched domain for each mask. CPUs not in any of the cpumasks will

7669

* not be load balanced. If the same cpumask appears both in the

7669

* not be load balanced. If the same cpumask appears both in the

7670

* current 'doms_cur' domains and in the new 'doms_new', we can leave

7670

* current 'doms_cur' domains and in the new 'doms_new', we can leave

7671

* it as it is.

7671

* it as it is.

7672

*

7672

*

7673

* The passed in 'doms_new' should be allocated using

7673

* The passed in 'doms_new' should be allocated using

7674

* alloc_sched_domains. This routine takes ownership of it and will

7674

* alloc_sched_domains. This routine takes ownership of it and will

7675

* free_sched_domains it when done with it. If the caller failed the

7675

* free_sched_domains it when done with it. If the caller failed the

7676

* alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,

7676

* alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,

7677

* and partition_sched_domains() will fallback to the single partition

7677

* and partition_sched_domains() will fallback to the single partition

7678

* 'fallback_doms', it also forces the domains to be rebuilt.

7678

* 'fallback_doms', it also forces the domains to be rebuilt.

7679

*

7679

*

7680

* If doms_new == NULL it will be replaced with cpu_online_mask.

7680

* If doms_new == NULL it will be replaced with cpu_online_mask.

7681

* ndoms_new == 0 is a special case for destroying existing domains,

7681

* ndoms_new == 0 is a special case for destroying existing domains,

7682

* and it will not create the default domain.

7682

* and it will not create the default domain.

7683

*

7683

*

7684

* Call with hotplug lock held

7684

* Call with hotplug lock held

7685

*/

7685

*/

7686

void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],

7686

void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],

7687

struct sched_domain_attr *dattr_new)

7687

struct sched_domain_attr *dattr_new)

7688

{

7688

{

7689

int i, j, n;

7689

int i, j, n;

7690

int new_topology;

7690

int new_topology;

7691

7692

mutex_lock(&sched_domains_mutex);

7692

mutex_lock(&sched_domains_mutex);

7693

7694

/* always unregister in case we don't destroy any domains */

7694

/* always unregister in case we don't destroy any domains */

7695

unregister_sched_domain_sysctl();

7695

unregister_sched_domain_sysctl();

7696

7697

/* Let architecture update cpu core mappings. */

7697

/* Let architecture update cpu core mappings. */

7698

new_topology = arch_update_cpu_topology();

7698

new_topology = arch_update_cpu_topology();

7699

7700

n = doms_new ? ndoms_new : 0;

7700

n = doms_new ? ndoms_new : 0;

7701

7702

/* Destroy deleted domains */

7702

/* Destroy deleted domains */

7703

for (i = 0; i < ndoms_cur; i++) {

7703

for (i = 0; i < ndoms_cur; i++) {

7704

for (j = 0; j < n && !new_topology; j++) {

7704

for (j = 0; j < n && !new_topology; j++) {

7705

if (cpumask_equal(doms_cur[i], doms_new[j])

7705

if (cpumask_equal(doms_cur[i], doms_new[j])

7706

&& dattrs_equal(dattr_cur, i, dattr_new, j))

7706

&& dattrs_equal(dattr_cur, i, dattr_new, j))

7707

goto match1;

7707

goto match1;

7708

}

7708

}

7709

/* no match - a current sched domain not in new doms_new[] */

7709

/* no match - a current sched domain not in new doms_new[] */

7710

detach_destroy_domains(doms_cur[i]);

7710

detach_destroy_domains(doms_cur[i]);

7711

match1:

7711

match1:

7712

;

7712

;

7713

}

7713

}

7714

7715

if (doms_new == NULL) {

7715

if (doms_new == NULL) {

7716

ndoms_cur = 0;

7716

ndoms_cur = 0;

7717

doms_new = &fallback_doms;

7717

doms_new = &fallback_doms;

7718

cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);

7718

cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);

7719

WARN_ON_ONCE(dattr_new);

7719

WARN_ON_ONCE(dattr_new);

7720

}

7720

}

7721

7722

/* Build new domains */

7722

/* Build new domains */

7723

for (i = 0; i < ndoms_new; i++) {

7723

for (i = 0; i < ndoms_new; i++) {

7724

for (j = 0; j < ndoms_cur && !new_topology; j++) {

7724

for (j = 0; j < ndoms_cur && !new_topology; j++) {

7725

if (cpumask_equal(doms_new[i], doms_cur[j])

7725

if (cpumask_equal(doms_new[i], doms_cur[j])

7726

&& dattrs_equal(dattr_new, i, dattr_cur, j))

7726

&& dattrs_equal(dattr_new, i, dattr_cur, j))

7727

goto match2;

7727

goto match2;

7728

}

7728

}

7729

/* no match - add a new doms_new */

7729

/* no match - add a new doms_new */

7730

build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);

7730

build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);

7731

match2:

7731

match2:

7732

;

7732

;

7733

}

7733

}

7734

7735

/* Remember the new sched domains */

7735

/* Remember the new sched domains */

7736

if (doms_cur != &fallback_doms)

7736

if (doms_cur != &fallback_doms)

7737

free_sched_domains(doms_cur, ndoms_cur);

7737

free_sched_domains(doms_cur, ndoms_cur);

7738

kfree(dattr_cur); /* kfree(NULL) is safe */

7738

kfree(dattr_cur); /* kfree(NULL) is safe */

7739

doms_cur = doms_new;

7739

doms_cur = doms_new;

7740

dattr_cur = dattr_new;

7740

dattr_cur = dattr_new;

7741

ndoms_cur = ndoms_new;

7741

ndoms_cur = ndoms_new;

7742

7743

register_sched_domain_sysctl();

7743

register_sched_domain_sysctl();

7744

7745

mutex_unlock(&sched_domains_mutex);

7745

mutex_unlock(&sched_domains_mutex);

7746

}

7746

}

7747

7748

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)

7748

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)

7749

static void reinit_sched_domains(void)

7749

static void reinit_sched_domains(void)

7750

{

7750

{

7751

get_online_cpus();

7751

get_online_cpus();

7752

7753

/* Destroy domains first to force the rebuild */

7753

/* Destroy domains first to force the rebuild */

7754

partition_sched_domains(0, NULL, NULL);

7754

partition_sched_domains(0, NULL, NULL);

7755

7756

rebuild_sched_domains();

7756

rebuild_sched_domains();

7757

put_online_cpus();

7757

put_online_cpus();

7758

}

7758

}

7759

7760

static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)

7760

static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)

7761

{

7761

{

7762

unsigned int level = 0;

7762

unsigned int level = 0;

7763

7764

if (sscanf(buf, "%u", &level) != 1)

7764

if (sscanf(buf, "%u", &level) != 1)

7765

return -EINVAL;

7765

return -EINVAL;

7766

7767

/*

7767

/*

7768

* level is always be positive so don't check for

7768

* level is always be positive so don't check for

7769

* level < POWERSAVINGS_BALANCE_NONE which is 0

7769

* level < POWERSAVINGS_BALANCE_NONE which is 0

7770

* What happens on 0 or 1 byte write,

7770

* What happens on 0 or 1 byte write,

7771

* need to check for count as well?

7771

* need to check for count as well?

7772

*/

7772

*/

7773

7774

if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)

7774

if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)

7775

return -EINVAL;

7775

return -EINVAL;

7776

7777

if (smt)

7777

if (smt)

7778

sched_smt_power_savings = level;

7778

sched_smt_power_savings = level;

7779

else

7779

else

7780

sched_mc_power_savings = level;

7780

sched_mc_power_savings = level;

7781

7782

reinit_sched_domains();

7782

reinit_sched_domains();

7783

7784

return count;

7784

return count;

7785

}

7785

}

7786

7787

#ifdef CONFIG_SCHED_MC

7787

#ifdef CONFIG_SCHED_MC

7788

static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,

7788

static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,

7789

struct sysdev_class_attribute *attr,

7789

struct sysdev_class_attribute *attr,

7790

char *page)

7790

char *page)

7791

{

7791

{

7792

return sprintf(page, "%u\n", sched_mc_power_savings);

7792

return sprintf(page, "%u\n", sched_mc_power_savings);

7793

}

7793

}

7794

static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,

7794

static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,

7795

struct sysdev_class_attribute *attr,

7795

struct sysdev_class_attribute *attr,

7796

const char *buf, size_t count)

7796

const char *buf, size_t count)

7797

{

7797

{

7798

return sched_power_savings_store(buf, count, 0);

7798

return sched_power_savings_store(buf, count, 0);

7799

}

7799

}

7800

static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,

7800

static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,

7801

sched_mc_power_savings_show,

7801

sched_mc_power_savings_show,

7802

sched_mc_power_savings_store);

7802

sched_mc_power_savings_store);

7803

#endif

7803

#endif

7804

7805

#ifdef CONFIG_SCHED_SMT

7805

#ifdef CONFIG_SCHED_SMT

7806

static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,

7806

static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,

7807

struct sysdev_class_attribute *attr,

7807

struct sysdev_class_attribute *attr,

7808

char *page)

7808

char *page)

7809

{

7809

{

7810

return sprintf(page, "%u\n", sched_smt_power_savings);

7810

return sprintf(page, "%u\n", sched_smt_power_savings);

7811

}

7811

}

7812

static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,

7812

static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,

7813

struct sysdev_class_attribute *attr,

7813

struct sysdev_class_attribute *attr,

7814

const char *buf, size_t count)

7814

const char *buf, size_t count)

7815

{

7815

{

7816

return sched_power_savings_store(buf, count, 1);

7816

return sched_power_savings_store(buf, count, 1);

7817

}

7817

}

7818

static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,

7818

static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,

7819

sched_smt_power_savings_show,

7819

sched_smt_power_savings_show,

7820

sched_smt_power_savings_store);

7820

sched_smt_power_savings_store);

7821

#endif

7821

#endif

7822

7823

/*
 * Create the power-savings sysfs files under 'cls'. The SMT file is created
 * when smt_capable() is true; the MC file is created when mc_capable() is
 * true and no earlier error occurred.
 *
 * Returns 0 or the first error reported by sysfs_create_file().
 */
int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
{
	int ret = 0;

#ifdef CONFIG_SCHED_SMT
	if (smt_capable())
		ret = sysfs_create_file(&cls->kset.kobj,
					&attr_sched_smt_power_savings.attr);
#endif
#ifdef CONFIG_SCHED_MC
	if (ret == 0 && mc_capable())
		ret = sysfs_create_file(&cls->kset.kobj,
					&attr_sched_mc_power_savings.attr);
#endif
	return ret;
}

7839

#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */

7839

#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */

7840

7841

/*

7841

/*

7842

* Update cpusets according to cpu_active mask. If cpusets are

7842

* Update cpusets according to cpu_active mask. If cpusets are

7843

* disabled, cpuset_update_active_cpus() becomes a simple wrapper

7843

* disabled, cpuset_update_active_cpus() becomes a simple wrapper

7844

* around partition_sched_domains().

7844

* around partition_sched_domains().

7845

*/

7845

*/

7846

static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,

7846

static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,

7847

void *hcpu)

7847

void *hcpu)

7848

{

7848

{

7849

switch (action & ~CPU_TASKS_FROZEN) {

7849

switch (action & ~CPU_TASKS_FROZEN) {

7850

case CPU_ONLINE:

7850

case CPU_ONLINE:

7851

case CPU_DOWN_FAILED:

7851

case CPU_DOWN_FAILED:

7852

cpuset_update_active_cpus();

7852

cpuset_update_active_cpus();

7853

return NOTIFY_OK;

7853

return NOTIFY_OK;

7854

default:

7854

default:

7855

return NOTIFY_DONE;

7855

return NOTIFY_DONE;

7856

}

7856

}

7857

}

7857

}

7858

7859

static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,

7859

static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,

7860

void *hcpu)

7860

void *hcpu)

7861

{

7861

{

7862

switch (action & ~CPU_TASKS_FROZEN) {

7862

switch (action & ~CPU_TASKS_FROZEN) {

7863

case CPU_DOWN_PREPARE:

7863

case CPU_DOWN_PREPARE:

7864

cpuset_update_active_cpus();

7864

cpuset_update_active_cpus();

7865

return NOTIFY_OK;

7865

return NOTIFY_OK;

7866

default:

7866

default:

7867

return NOTIFY_DONE;

7867

return NOTIFY_DONE;

7868

}

7868

}

7869

}

7869

}

7870

7871

static int update_runtime(struct notifier_block *nfb,

7871

static int update_runtime(struct notifier_block *nfb,

7872

unsigned long action, void *hcpu)

7872

unsigned long action, void *hcpu)

7873

{

7873

{

7874

int cpu = (int)(long)hcpu;

7874

int cpu = (int)(long)hcpu;

7875

7876

switch (action) {

7876

switch (action) {

7877

case CPU_DOWN_PREPARE:

7877

case CPU_DOWN_PREPARE:

7878

case CPU_DOWN_PREPARE_FROZEN:

7878

case CPU_DOWN_PREPARE_FROZEN:

7879

disable_runtime(cpu_rq(cpu));

7879

disable_runtime(cpu_rq(cpu));

7880

return NOTIFY_OK;

7880

return NOTIFY_OK;

7881

7882

case CPU_DOWN_FAILED:

7882

case CPU_DOWN_FAILED:

7883

case CPU_DOWN_FAILED_FROZEN:

7883

case CPU_DOWN_FAILED_FROZEN:

7884

case CPU_ONLINE:

7884

case CPU_ONLINE:

7885

case CPU_ONLINE_FROZEN:

7885

case CPU_ONLINE_FROZEN:

7886

enable_runtime(cpu_rq(cpu));

7886

enable_runtime(cpu_rq(cpu));

7887

return NOTIFY_OK;

7887

return NOTIFY_OK;

7888

7889

default:

7889

default:

7890

return NOTIFY_DONE;

7890

return NOTIFY_DONE;

7891

}

7891

}

7892

}

7892

}

7893

7894

/*
 * SMP part of scheduler initialisation: build the initial sched domains,
 * register the hotplug notifiers and move the calling task onto the
 * non-isolated cpus.
 */
void __init sched_init_smp(void)
{
	cpumask_var_t allowed;

	alloc_cpumask_var(&allowed, GFP_KERNEL);
	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);

	get_online_cpus();
	mutex_lock(&sched_domains_mutex);
	init_sched_domains(cpu_active_mask);
	/* All possible cpus except the isolated ones; never leave the set empty. */
	cpumask_andnot(allowed, cpu_possible_mask, cpu_isolated_map);
	if (cpumask_empty(allowed))
		cpumask_set_cpu(smp_processor_id(), allowed);
	mutex_unlock(&sched_domains_mutex);
	put_online_cpus();

	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);

	/* RT runtime code needs to handle some hotplug events */
	hotcpu_notifier(update_runtime, 0);

	init_hrtick();

	/* Move init over to a non-isolated CPU */
	if (set_cpus_allowed_ptr(current, allowed) < 0)
		BUG();
	sched_init_granularity();
	free_cpumask_var(allowed);

	init_sched_rt_class();
}

7926

#else

7926

#else

7927

/* Variant used in the #else branch of CONFIG_SMP: only the granularity is set up. */
void __init sched_init_smp(void)
{
	sched_init_granularity();
}

7931

#endif /* CONFIG_SMP */

7931

#endif /* CONFIG_SMP */

7932

7933

/* Initialised to 1; NOTE(review): presumably means "enabled" -- confirm against the users of this variable. */
const_debug unsigned int sysctl_timer_migration = 1;

7934

7935

int in_sched_functions(unsigned long addr)

7935

int in_sched_functions(unsigned long addr)

7936

{

7936

{

7937

return in_lock_functions(addr) ||

7937

return in_lock_functions(addr) ||

7938

(addr >= (unsigned long)__sched_text_start

7938

(addr >= (unsigned long)__sched_text_start

7939

&& addr < (unsigned long)__sched_text_end);

7939

&& addr < (unsigned long)__sched_text_end);

7940

}

7940

}

7941

7942

static void init_cfs_rq(struct cfs_rq *cfs_rq)

7942

static void init_cfs_rq(struct cfs_rq *cfs_rq)

7943

{

7943

{

7944

cfs_rq->tasks_timeline = RB_ROOT;

7944

cfs_rq->tasks_timeline = RB_ROOT;

7945

INIT_LIST_HEAD(&cfs_rq->tasks);

7945

INIT_LIST_HEAD(&cfs_rq->tasks);

7946

cfs_rq->min_vruntime = (u64)(-(1LL << 20));

7946

cfs_rq->min_vruntime = (u64)(-(1LL << 20));

7947

#ifndef CONFIG_64BIT

7947

#ifndef CONFIG_64BIT

7948

cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;

7948

cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;

7949

#endif

7949

#endif

7950

}

7950

}

7951

7952

static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)

7952

static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)

7953

{

7953

{

7954

struct rt_prio_array *array;

7954

struct rt_prio_array *array;

7955

int i;

7955

int i;

7956

7957

array = &rt_rq->active;

7957

array = &rt_rq->active;

7958

for (i = 0; i < MAX_RT_PRIO; i++) {

7958

for (i = 0; i < MAX_RT_PRIO; i++) {

7959

INIT_LIST_HEAD(array->queue + i);

7959

INIT_LIST_HEAD(array->queue + i);

7960

__clear_bit(i, array->bitmap);

7960

__clear_bit(i, array->bitmap);

7961

}

7961

}

7962

/* delimiter for bitsearch: */

7962

/* delimiter for bitsearch: */

7963

__set_bit(MAX_RT_PRIO, array->bitmap);

7963

__set_bit(MAX_RT_PRIO, array->bitmap);

7964

7965

#if defined CONFIG_SMP

7965

#if defined CONFIG_SMP

7966

rt_rq->highest_prio.curr = MAX_RT_PRIO;

7966

rt_rq->highest_prio.curr = MAX_RT_PRIO;

7967

rt_rq->highest_prio.next = MAX_RT_PRIO;

7967

rt_rq->highest_prio.next = MAX_RT_PRIO;

7968

rt_rq->rt_nr_migratory = 0;

7968

rt_rq->rt_nr_migratory = 0;

7969

rt_rq->overloaded = 0;

7969

rt_rq->overloaded = 0;

7970

plist_head_init(&rt_rq->pushable_tasks);

7970

plist_head_init(&rt_rq->pushable_tasks);

7971

#endif

7971

#endif

7972

7973

rt_rq->rt_time = 0;

7973

rt_rq->rt_time = 0;

7974

rt_rq->rt_throttled = 0;

7974

rt_rq->rt_throttled = 0;

7975

rt_rq->rt_runtime = 0;

7975

rt_rq->rt_runtime = 0;

7976

raw_spin_lock_init(&rt_rq->rt_runtime_lock);

7976

raw_spin_lock_init(&rt_rq->rt_runtime_lock);

7977

}

7977

}

7978

7979

#ifdef CONFIG_FAIR_GROUP_SCHED

7979

#ifdef CONFIG_FAIR_GROUP_SCHED

7980

static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,

7980

static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,

7981

struct sched_entity *se, int cpu,

7981

struct sched_entity *se, int cpu,

7982

struct sched_entity *parent)

7982

struct sched_entity *parent)

7983

{

7983

{

7984

struct rq *rq = cpu_rq(cpu);

7984

struct rq *rq = cpu_rq(cpu);

7985

7986

cfs_rq->tg = tg;

7986

cfs_rq->tg = tg;

7987

cfs_rq->rq = rq;

7987

cfs_rq->rq = rq;

7988

#ifdef CONFIG_SMP

7988

#ifdef CONFIG_SMP

7989

/* allow initial update_cfs_load() to truncate */

7989

/* allow initial update_cfs_load() to truncate */

7990

cfs_rq->load_stamp = 1;

7990

cfs_rq->load_stamp = 1;

7991

#endif

7991

#endif

7992

7993

tg->cfs_rq[cpu] = cfs_rq;

7993

tg->cfs_rq[cpu] = cfs_rq;

7994

tg->se[cpu] = se;

7994

tg->se[cpu] = se;

7995

7996

/* se could be NULL for root_task_group */

7996

/* se could be NULL for root_task_group */

7997

if (!se)

7997

if (!se)

7998

return;

7998

return;

7999

8000

if (!parent)

8000

if (!parent)

8001

se->cfs_rq = &rq->cfs;

8001

se->cfs_rq = &rq->cfs;

8002

else

8002

else

8003

se->cfs_rq = parent->my_q;

8003

se->cfs_rq = parent->my_q;

8004

8005

se->my_q = cfs_rq;

8005

se->my_q = cfs_rq;

8006

update_load_set(&se->load, 0);

8006

update_load_set(&se->load, 0);

8007

se->parent = parent;

8007

se->parent = parent;

8008

}

8008

}

8009

#endif

8009

#endif

8010

8011

#ifdef CONFIG_RT_GROUP_SCHED

8011

#ifdef CONFIG_RT_GROUP_SCHED

8012

static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,

8012

static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,

8013

struct sched_rt_entity *rt_se, int cpu,

8013

struct sched_rt_entity *rt_se, int cpu,

8014

struct sched_rt_entity *parent)

8014

struct sched_rt_entity *parent)

8015

{

8015

{

8016

struct rq *rq = cpu_rq(cpu);

8016

struct rq *rq = cpu_rq(cpu);

8017

8018

rt_rq->highest_prio.curr = MAX_RT_PRIO;

8018

rt_rq->highest_prio.curr = MAX_RT_PRIO;

8019

rt_rq->rt_nr_boosted = 0;

8019

rt_rq->rt_nr_boosted = 0;

8020

rt_rq->rq = rq;

8020

rt_rq->rq = rq;

8021

rt_rq->tg = tg;

8021

rt_rq->tg = tg;

8022

8023

tg->rt_rq[cpu] = rt_rq;

8023

tg->rt_rq[cpu] = rt_rq;

8024

tg->rt_se[cpu] = rt_se;

8024

tg->rt_se[cpu] = rt_se;

8025

8026

if (!rt_se)

8026

if (!rt_se)

8027

return;

8027

return;

8028

8029

if (!parent)

8029

if (!parent)

8030

rt_se->rt_rq = &rq->rt;

8030

rt_se->rt_rq = &rq->rt;

8031

else

8031

else

8032

rt_se->rt_rq = parent->my_q;

8032

rt_se->rt_rq = parent->my_q;

8033

8034

rt_se->my_q = rt_rq;

8034

rt_se->my_q = rt_rq;

8035

rt_se->parent = parent;

8035

rt_se->parent = parent;

8036

INIT_LIST_HEAD(&rt_se->run_list);

8036

INIT_LIST_HEAD(&rt_se->run_list);

8037

}

8037

}

8038

#endif

8038

#endif

8039

8040

void __init sched_init(void)

8040

void __init sched_init(void)

8041

{

8041

{

8042

int i, j;

8042

int i, j;

8043

unsigned long alloc_size = 0, ptr;

8043

unsigned long alloc_size = 0, ptr;

8044

8045

#ifdef CONFIG_FAIR_GROUP_SCHED

8045

#ifdef CONFIG_FAIR_GROUP_SCHED

8046

alloc_size += 2 * nr_cpu_ids * sizeof(void **);

8046

alloc_size += 2 * nr_cpu_ids * sizeof(void **);

8047

#endif

8047

#endif

8048

#ifdef CONFIG_RT_GROUP_SCHED

8048

#ifdef CONFIG_RT_GROUP_SCHED

8049

alloc_size += 2 * nr_cpu_ids * sizeof(void **);

8049

alloc_size += 2 * nr_cpu_ids * sizeof(void **);

8050

#endif

8050

#endif

8051

#ifdef CONFIG_CPUMASK_OFFSTACK

8051

#ifdef CONFIG_CPUMASK_OFFSTACK

8052

alloc_size += num_possible_cpus() * cpumask_size();

8052

alloc_size += num_possible_cpus() * cpumask_size();

8053

#endif

8053

#endif

8054

if (alloc_size) {

8054

if (alloc_size) {

8055

ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);

8055

ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);

8056

8057

#ifdef CONFIG_FAIR_GROUP_SCHED

8057

#ifdef CONFIG_FAIR_GROUP_SCHED

8058

root_task_group.se = (struct sched_entity **)ptr;

8058

root_task_group.se = (struct sched_entity **)ptr;

8059

ptr += nr_cpu_ids * sizeof(void **);

8059

ptr += nr_cpu_ids * sizeof(void **);

8060

8061

root_task_group.cfs_rq = (struct cfs_rq **)ptr;

8061

root_task_group.cfs_rq = (struct cfs_rq **)ptr;

8062

ptr += nr_cpu_ids * sizeof(void **);

8062

ptr += nr_cpu_ids * sizeof(void **);

8063

8064

#endif /* CONFIG_FAIR_GROUP_SCHED */

8064

#endif /* CONFIG_FAIR_GROUP_SCHED */

8065

#ifdef CONFIG_RT_GROUP_SCHED

8065

#ifdef CONFIG_RT_GROUP_SCHED

8066

root_task_group.rt_se = (struct sched_rt_entity **)ptr;

8066

root_task_group.rt_se = (struct sched_rt_entity **)ptr;

8067

ptr += nr_cpu_ids * sizeof(void **);

8067

ptr += nr_cpu_ids * sizeof(void **);

8068

8069

root_task_group.rt_rq = (struct rt_rq **)ptr;

8069

root_task_group.rt_rq = (struct rt_rq **)ptr;

8070

ptr += nr_cpu_ids * sizeof(void **);

8070

ptr += nr_cpu_ids * sizeof(void **);

8071

8072

#endif /* CONFIG_RT_GROUP_SCHED */

8072

#endif /* CONFIG_RT_GROUP_SCHED */

8073

#ifdef CONFIG_CPUMASK_OFFSTACK

8073

#ifdef CONFIG_CPUMASK_OFFSTACK

8074

for_each_possible_cpu(i) {

8074

for_each_possible_cpu(i) {

8075

per_cpu(load_balance_tmpmask, i) = (void *)ptr;

8075

per_cpu(load_balance_tmpmask, i) = (void *)ptr;

8076

ptr += cpumask_size();

8076

ptr += cpumask_size();

8077

}

8077

}

8078

#endif /* CONFIG_CPUMASK_OFFSTACK */

8078

#endif /* CONFIG_CPUMASK_OFFSTACK */

8079

}

8079

}

8080

8081

#ifdef CONFIG_SMP

8081

#ifdef CONFIG_SMP

8082

init_defrootdomain();

8082

init_defrootdomain();

8083

#endif

8083

#endif

8084

8085

init_rt_bandwidth(&def_rt_bandwidth,

8085

init_rt_bandwidth(&def_rt_bandwidth,

8086

global_rt_period(), global_rt_runtime());

8086

global_rt_period(), global_rt_runtime());

8087

8088

#ifdef CONFIG_RT_GROUP_SCHED

8088

#ifdef CONFIG_RT_GROUP_SCHED

8089

init_rt_bandwidth(&root_task_group.rt_bandwidth,

8089

init_rt_bandwidth(&root_task_group.rt_bandwidth,

8090

global_rt_period(), global_rt_runtime());

8090

global_rt_period(), global_rt_runtime());

8091

#endif /* CONFIG_RT_GROUP_SCHED */

8091

#endif /* CONFIG_RT_GROUP_SCHED */

8092

8093

#ifdef CONFIG_CGROUP_SCHED

8093

#ifdef CONFIG_CGROUP_SCHED

8094

list_add(&root_task_group.list, &task_groups);

8094

list_add(&root_task_group.list, &task_groups);

8095

INIT_LIST_HEAD(&root_task_group.children);

8095

INIT_LIST_HEAD(&root_task_group.children);

8096

autogroup_init(&init_task);

8096

autogroup_init(&init_task);

8097

#endif /* CONFIG_CGROUP_SCHED */

8097

#endif /* CONFIG_CGROUP_SCHED */

8098

8099

for_each_possible_cpu(i) {

8099

for_each_possible_cpu(i) {

8100

struct rq *rq;

8100

struct rq *rq;

8101

8102

rq = cpu_rq(i);

8102

rq = cpu_rq(i);

8103

raw_spin_lock_init(&rq->lock);

8103

raw_spin_lock_init(&rq->lock);

8104

rq->nr_running = 0;

8104

rq->nr_running = 0;

8105

rq->calc_load_active = 0;

8105

rq->calc_load_active = 0;

8106

rq->calc_load_update = jiffies + LOAD_FREQ;

8106

rq->calc_load_update = jiffies + LOAD_FREQ;

8107

init_cfs_rq(&rq->cfs);

8107

init_cfs_rq(&rq->cfs);

8108

init_rt_rq(&rq->rt, rq);

8108

init_rt_rq(&rq->rt, rq);

8109

#ifdef CONFIG_FAIR_GROUP_SCHED

8109

#ifdef CONFIG_FAIR_GROUP_SCHED

8110

root_task_group.shares = root_task_group_load;

8110

root_task_group.shares = root_task_group_load;

8111

INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);

8111

INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);

8112

/*

8112

/*

8113

* How much cpu bandwidth does root_task_group get?

8113

* How much cpu bandwidth does root_task_group get?

8114

*

8114

*

8115

* In case of task-groups formed thr' the cgroup filesystem, it

8115

* In case of task-groups formed thr' the cgroup filesystem, it

8116

* gets 100% of the cpu resources in the system. This overall

8116

* gets 100% of the cpu resources in the system. This overall

8117

* system cpu resource is divided among the tasks of

8117

* system cpu resource is divided among the tasks of

8118

* root_task_group and its child task-groups in a fair manner,

8118

* root_task_group and its child task-groups in a fair manner,

8119

* based on each entity's (task or task-group's) weight

8119

* based on each entity's (task or task-group's) weight

8120

* (se->load.weight).

8120

* (se->load.weight).

8121

*

8121

*

8122

* In other words, if root_task_group has 10 tasks of weight

8122

* In other words, if root_task_group has 10 tasks of weight

8123

* 1024) and two child groups A0 and A1 (of weight 1024 each),

8123

* 1024) and two child groups A0 and A1 (of weight 1024 each),

8124

* then A0's share of the cpu resource is:

8124

* then A0's share of the cpu resource is:

8125

*

8125

*

8126

* A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%

8126

* A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%

8127

*

8127

*

8128

* We achieve this by letting root_task_group's tasks sit

8128

* We achieve this by letting root_task_group's tasks sit

8129

* directly in rq->cfs (i.e root_task_group->se[] = NULL).

8129

* directly in rq->cfs (i.e root_task_group->se[] = NULL).

8130

*/

8130

*/

8131

init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);

8131

init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);

8132

#endif /* CONFIG_FAIR_GROUP_SCHED */

8132

#endif /* CONFIG_FAIR_GROUP_SCHED */

8133

8134

rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;

8134

rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;

8135

#ifdef CONFIG_RT_GROUP_SCHED

8135

#ifdef CONFIG_RT_GROUP_SCHED

8136

INIT_LIST_HEAD(&rq->leaf_rt_rq_list);

8136

INIT_LIST_HEAD(&rq->leaf_rt_rq_list);

8137

init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);

8137

init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);

8138

#endif

8138

#endif

8139

8140

for (j = 0; j < CPU_LOAD_IDX_MAX; j++)

8140

for (j = 0; j < CPU_LOAD_IDX_MAX; j++)

8141

rq->cpu_load[j] = 0;

8141

rq->cpu_load[j] = 0;

8142

8143

rq->last_load_update_tick = jiffies;

8143

rq->last_load_update_tick = jiffies;

8144

8145

#ifdef CONFIG_SMP

8145

#ifdef CONFIG_SMP

8146

rq->sd = NULL;

8146

rq->sd = NULL;

8147

rq->rd = NULL;

8147

rq->rd = NULL;

8148

rq->cpu_power = SCHED_POWER_SCALE;

8148

rq->cpu_power = SCHED_POWER_SCALE;

8149

rq->post_schedule = 0;

8149

rq->post_schedule = 0;

8150

rq->active_balance = 0;

8150

rq->active_balance = 0;

8151

rq->next_balance = jiffies;

8151

rq->next_balance = jiffies;

8152

rq->push_cpu = 0;

8152

rq->push_cpu = 0;

8153

rq->cpu = i;

8153

rq->cpu = i;

8154

rq->online = 0;

8154

rq->online = 0;

8155

rq->idle_stamp = 0;

8155

rq->idle_stamp = 0;

8156

rq->avg_idle = 2*sysctl_sched_migration_cost;

8156

rq->avg_idle = 2*sysctl_sched_migration_cost;

8157

rq_attach_root(rq, &def_root_domain);

8157

rq_attach_root(rq, &def_root_domain);

8158

#ifdef CONFIG_NO_HZ

8158

#ifdef CONFIG_NO_HZ

8159

rq->nohz_balance_kick = 0;

8159

rq->nohz_balance_kick = 0;

8160

init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));

8160

init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));

8161

#endif

8161

#endif

8162

#endif

8162

#endif

8163

init_rq_hrtick(rq);

8163

init_rq_hrtick(rq);

8164

atomic_set(&rq->nr_iowait, 0);

8164

atomic_set(&rq->nr_iowait, 0);

8165

}

8165

}

8166

8167

set_load_weight(&init_task);

8167

set_load_weight(&init_task);

8168

8169

#ifdef CONFIG_PREEMPT_NOTIFIERS

8169

#ifdef CONFIG_PREEMPT_NOTIFIERS

8170

INIT_HLIST_HEAD(&init_task.preempt_notifiers);

8170

INIT_HLIST_HEAD(&init_task.preempt_notifiers);

8171

#endif

8171

#endif

8172

8173

#ifdef CONFIG_SMP

8173

#ifdef CONFIG_SMP

8174

open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

8174

open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

8175

#endif

8175

#endif

8176

8177

#ifdef CONFIG_RT_MUTEXES

8177

#ifdef CONFIG_RT_MUTEXES

8178

plist_head_init(&init_task.pi_waiters);

8178

plist_head_init(&init_task.pi_waiters);

8179

#endif

8179

#endif

8180

8181

/*

8181

/*

8182

* The boot idle thread does lazy MMU switching as well:

8182

* The boot idle thread does lazy MMU switching as well:

8183

*/

8183

*/

8184

atomic_inc(&init_mm.mm_count);

8184

atomic_inc(&init_mm.mm_count);

8185

enter_lazy_tlb(&init_mm, current);

8185

enter_lazy_tlb(&init_mm, current);

8186

8187

/*

8187

/*

8188

* Make us the idle thread. Technically, schedule() should not be

8188

* Make us the idle thread. Technically, schedule() should not be

8189

* called from this thread, however somewhere below it might be,

8189

* called from this thread, however somewhere below it might be,

8190

* but because we are the idle thread, we just pick up running again

8190

* but because we are the idle thread, we just pick up running again

8191

* when this runqueue becomes "idle".

8191

* when this runqueue becomes "idle".

8192

*/

8192

*/

8193

init_idle(current, smp_processor_id());

8193

init_idle(current, smp_processor_id());

8194

8195

calc_load_update = jiffies + LOAD_FREQ;

8195

calc_load_update = jiffies + LOAD_FREQ;

8196

8197

/*

8197

/*

8198

* During early bootup we pretend to be a normal task:

8198

* During early bootup we pretend to be a normal task:

8199

*/

8199

*/

8200

current->sched_class = &fair_sched_class;

8200

current->sched_class = &fair_sched_class;

8201

8202

/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */

8202

/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */

8203

zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);

8203

zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);

8204

#ifdef CONFIG_SMP

8204

#ifdef CONFIG_SMP

8205

zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);

8205

zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);

8206

#ifdef CONFIG_NO_HZ

8206

#ifdef CONFIG_NO_HZ

8207

zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);

8207

zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);

8208

alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);

8208

alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);

8209

atomic_set(&nohz.load_balancer, nr_cpu_ids);

8209

atomic_set(&nohz.load_balancer, nr_cpu_ids);

8210

atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);

8210

atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);

8211

atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);

8211

atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);

8212

#endif

8212

#endif

8213

/* May be allocated at isolcpus cmdline parse time */

8213

/* May be allocated at isolcpus cmdline parse time */

8214

if (cpu_isolated_map == NULL)

8214

if (cpu_isolated_map == NULL)

8215

zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);

8215

zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);

8216

#endif /* SMP */

8216

#endif /* SMP */

8217

8218

scheduler_running = 1;

8218

scheduler_running = 1;

8219

}

8219

}

8220

8221

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP

8221

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP

8222

static inline int preempt_count_equals(int preempt_offset)

8222

static inline int preempt_count_equals(int preempt_offset)

8223

{

8223

{

8224

int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();

8224

int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();

8225

8226

return (nested == preempt_offset);

8226

return (nested == preempt_offset);

8227

}

8227

}

8228

8229

void __might_sleep(const char *file, int line, int preempt_offset)

8229

void __might_sleep(const char *file, int line, int preempt_offset)

8230

{

8230

{

8231

static unsigned long prev_jiffy; /* ratelimiting */

8231

static unsigned long prev_jiffy; /* ratelimiting */

8232

8233

if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||

8233

if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||

8234

system_state != SYSTEM_RUNNING || oops_in_progress)

8234

system_state != SYSTEM_RUNNING || oops_in_progress)

8235

return;

8235

return;

8236

if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)

8236

if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)

8237

return;

8237

return;

8238

prev_jiffy = jiffies;

8238

prev_jiffy = jiffies;

8239

8240

printk(KERN_ERR

8240

printk(KERN_ERR

8241

"BUG: sleeping function called from invalid context at %s:%d\n",

8241

"BUG: sleeping function called from invalid context at %s:%d\n",

8242

file, line);

8242

file, line);

8243

printk(KERN_ERR

8243

printk(KERN_ERR

8244

"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",

8244

"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",

8245

in_atomic(), irqs_disabled(),

8245

in_atomic(), irqs_disabled(),

8246

current->pid, current->comm);

8246

current->pid, current->comm);

8247

8248

debug_show_held_locks(current);

8248

debug_show_held_locks(current);

8249

if (irqs_disabled())

8249

if (irqs_disabled())

8250

print_irqtrace_events(current);

8250

print_irqtrace_events(current);

8251

dump_stack();

8251

dump_stack();

8252

}

8252

}

8253

EXPORT_SYMBOL(__might_sleep);

8253

EXPORT_SYMBOL(__might_sleep);

8254

#endif

8254

#endif

8255

8256

#ifdef CONFIG_MAGIC_SYSRQ

8256

#ifdef CONFIG_MAGIC_SYSRQ

8257

static void normalize_task(struct rq *rq, struct task_struct *p)

8257

static void normalize_task(struct rq *rq, struct task_struct *p)

8258

{

8258

{

8259

const struct sched_class *prev_class = p->sched_class;

8259

const struct sched_class *prev_class = p->sched_class;

8260

int old_prio = p->prio;

8260

int old_prio = p->prio;

8261

int on_rq;

8261

int on_rq;

8262

8263

on_rq = p->on_rq;

8263

on_rq = p->on_rq;

8264

if (on_rq)

8264

if (on_rq)

8265

deactivate_task(rq, p, 0);

8265

deactivate_task(rq, p, 0);

8266

__setscheduler(rq, p, SCHED_NORMAL, 0);

8266

__setscheduler(rq, p, SCHED_NORMAL, 0);

8267

if (on_rq) {

8267

if (on_rq) {

8268

activate_task(rq, p, 0);

8268

activate_task(rq, p, 0);

8269

resched_task(rq->curr);

8269

resched_task(rq->curr);

8270

}

8270

}

8271

8272

check_class_changed(rq, p, prev_class, old_prio);

8272

check_class_changed(rq, p, prev_class, old_prio);

8273

}

8273

}

8274

8275

void normalize_rt_tasks(void)

8275

void normalize_rt_tasks(void)

8276

{

8276

{

8277

struct task_struct *g, *p;

8277

struct task_struct *g, *p;

8278

unsigned long flags;

8278

unsigned long flags;

8279

struct rq *rq;

8279

struct rq *rq;

8280

8281

read_lock_irqsave(&tasklist_lock, flags);

8281

read_lock_irqsave(&tasklist_lock, flags);

8282

do_each_thread(g, p) {

8282

do_each_thread(g, p) {

8283

/*

8283

/*

8284

* Only normalize user tasks:

8284

* Only normalize user tasks:

8285

*/

8285

*/

8286

if (!p->mm)

8286

if (!p->mm)

8287

continue;

8287

continue;

8288

8289

p->se.exec_start = 0;

8289

p->se.exec_start = 0;

8290

#ifdef CONFIG_SCHEDSTATS

8290

#ifdef CONFIG_SCHEDSTATS

8291

p->se.statistics.wait_start = 0;

8291

p->se.statistics.wait_start = 0;

8292

p->se.statistics.sleep_start = 0;

8292

p->se.statistics.sleep_start = 0;

8293

p->se.statistics.block_start = 0;

8293

p->se.statistics.block_start = 0;

8294

#endif

8294

#endif

8295

8296

if (!rt_task(p)) {

8296

if (!rt_task(p)) {

8297

/*

8297

/*

8298

* Renice negative nice level userspace

8298

* Renice negative nice level userspace

8299

* tasks back to 0:

8299

* tasks back to 0:

8300

*/

8300

*/

8301

if (TASK_NICE(p) < 0 && p->mm)

8301

if (TASK_NICE(p) < 0 && p->mm)

8302

set_user_nice(p, 0);

8302

set_user_nice(p, 0);

8303

continue;

8303

continue;

8304

}

8304

}

8305

8306

raw_spin_lock(&p->pi_lock);

8306

raw_spin_lock(&p->pi_lock);

8307

rq = __task_rq_lock(p);

8307

rq = __task_rq_lock(p);

8308

8309

normalize_task(rq, p);

8309

normalize_task(rq, p);

8310

8311

__task_rq_unlock(rq);

8311

__task_rq_unlock(rq);

8312

raw_spin_unlock(&p->pi_lock);

8312

raw_spin_unlock(&p->pi_lock);

8313

} while_each_thread(g, p);

8313

} while_each_thread(g, p);

8314

8315

read_unlock_irqrestore(&tasklist_lock, flags);

8315

read_unlock_irqrestore(&tasklist_lock, flags);

8316

}

8316

}

8317

8318

#endif /* CONFIG_MAGIC_SYSRQ */

8318

#endif /* CONFIG_MAGIC_SYSRQ */

8319

8320

#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)

8320

#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)

8321

/*

8321

/*

8322

* These functions are only useful for the IA64 MCA handling, or kdb.

8322

* These functions are only useful for the IA64 MCA handling, or kdb.

8323

*

8323

*

8324

* They can only be called when the whole system has been

8324

* They can only be called when the whole system has been

8325

* stopped - every CPU needs to be quiescent, and no scheduling

8325

* stopped - every CPU needs to be quiescent, and no scheduling

8326

* activity can take place. Using them for anything else would

8326

* activity can take place. Using them for anything else would

8327

* be a serious bug, and as a result, they aren't even visible

8327

* be a serious bug, and as a result, they aren't even visible

8328

* under any other configuration.

8328

* under any other configuration.

8329

*/

8329

*/

8330

8331

/**
 * curr_task - look up the task currently recorded as running on a cpu.
 * @cpu: the processor in question.
 *
 * Returns cpu_curr(@cpu) without taking any lock.
 *
 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
 */
struct task_struct *curr_task(int cpu)
{
	struct task_struct *running = cpu_curr(cpu);

	return running;
}

8341

8342

#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */

8342

#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */

8343

8344

#ifdef CONFIG_IA64

8344

#ifdef CONFIG_IA64

8345

/**

8345

/**

8346

* set_curr_task - set the current task for a given cpu.

8346

* set_curr_task - set the current task for a given cpu.

8347

* @cpu: the processor in question.

8347

* @cpu: the processor in question.

8348

* @p: the task pointer to set.

8348

* @p: the task pointer to set.

8349

*

8349

*

8350

* Description: This function must only be used when non-maskable interrupts

8350

* Description: This function must only be used when non-maskable interrupts

8351

* are serviced on a separate stack. It allows the architecture to switch the

8351

* are serviced on a separate stack. It allows the architecture to switch the

8352

* notion of the current task on a cpu in a non-blocking manner. This function

8352

* notion of the current task on a cpu in a non-blocking manner. This function

8353

* must be called with all CPU's synchronized, and interrupts disabled, the

8353

* must be called with all CPU's synchronized, and interrupts disabled, the

8354

* and caller must save the original value of the current task (see

8354

* and caller must save the original value of the current task (see

8355

* curr_task() above) and restore that value before reenabling interrupts and

8355

* curr_task() above) and restore that value before reenabling interrupts and

8356

* re-starting the system.

8356

* re-starting the system.

8357

*

8357

*

8358

* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!

8358

* ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!

8359

*/

8359

*/

8360

void set_curr_task(int cpu, struct task_struct *p)

8360

void set_curr_task(int cpu, struct task_struct *p)

8361

{

8361

{

8362

cpu_curr(cpu) = p;

8362

cpu_curr(cpu) = p;

8363

}

8363

}

8364

8365

#endif

8365

#endif

8366

8367

#ifdef CONFIG_FAIR_GROUP_SCHED

8367

#ifdef CONFIG_FAIR_GROUP_SCHED

8368

static void free_fair_sched_group(struct task_group *tg)

8368

static void free_fair_sched_group(struct task_group *tg)

8369

{

8369

{

8370

int i;

8370

int i;

8371

8372

for_each_possible_cpu(i) {

8372

for_each_possible_cpu(i) {

8373

if (tg->cfs_rq)

8373

if (tg->cfs_rq)

8374

kfree(tg->cfs_rq[i]);

8374

kfree(tg->cfs_rq[i]);

8375

if (tg->se)

8375

if (tg->se)

8376

kfree(tg->se[i]);

8376

kfree(tg->se[i]);

8377

}

8377

}

8378

8379

kfree(tg->cfs_rq);

8379

kfree(tg->cfs_rq);

8380

kfree(tg->se);

8380

kfree(tg->se);

8381

}

8381

}

8382

8383

static

8383

static

8384

int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)

8384

int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)

8385

{

8385

{

8386

struct cfs_rq *cfs_rq;

8386

struct cfs_rq *cfs_rq;

8387

struct sched_entity *se;

8387

struct sched_entity *se;

8388

int i;

8388

int i;

8389

8390

tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);

8390

tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);

8391

if (!tg->cfs_rq)

8391

if (!tg->cfs_rq)

8392

goto err;

8392

goto err;

8393

tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);

8393

tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);

8394

if (!tg->se)

8394

if (!tg->se)

8395

goto err;

8395

goto err;

8396

8397

tg->shares = NICE_0_LOAD;

8397

tg->shares = NICE_0_LOAD;

8398

8399

for_each_possible_cpu(i) {

8399

for_each_possible_cpu(i) {

8400

cfs_rq = kzalloc_node(sizeof(struct cfs_rq),

8400

cfs_rq = kzalloc_node(sizeof(struct cfs_rq),

8401

GFP_KERNEL, cpu_to_node(i));

8401

GFP_KERNEL, cpu_to_node(i));

8402

if (!cfs_rq)

8402

if (!cfs_rq)

8403

goto err;

8403

goto err;

8404

8405

se = kzalloc_node(sizeof(struct sched_entity),

8405

se = kzalloc_node(sizeof(struct sched_entity),

8406

GFP_KERNEL, cpu_to_node(i));

8406

GFP_KERNEL, cpu_to_node(i));

8407

if (!se)

8407

if (!se)

8408

goto err_free_rq;

8408

goto err_free_rq;

8409

8410

init_cfs_rq(cfs_rq);

8410

init_cfs_rq(cfs_rq);

8411

init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);

8411

init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);

8412

}

8412

}

8413

8414

return 1;

8414

return 1;

8415

8416

err_free_rq:

8416

err_free_rq:

8417

kfree(cfs_rq);

8417

kfree(cfs_rq);

8418

err:

8418

err:

8419

return 0;

8419

return 0;

8420

}

8420

}

8421

8422

static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)

8422

static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)

8423

{

8423

{

8424

struct rq *rq = cpu_rq(cpu);

8424

struct rq *rq = cpu_rq(cpu);

8425

unsigned long flags;

8425

unsigned long flags;

8426

8427

/*

8427

/*

8428

* Only empty task groups can be destroyed; so we can speculatively

8428

* Only empty task groups can be destroyed; so we can speculatively

8429

* check on_list without danger of it being re-added.

8429

* check on_list without danger of it being re-added.

8430

*/

8430

*/

8431

if (!tg->cfs_rq[cpu]->on_list)

8431

if (!tg->cfs_rq[cpu]->on_list)

8432

return;

8432

return;

8433

8434

raw_spin_lock_irqsave(&rq->lock, flags);

8434

raw_spin_lock_irqsave(&rq->lock, flags);

8435

list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);

8435

list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);

8436

raw_spin_unlock_irqrestore(&rq->lock, flags);

8436

raw_spin_unlock_irqrestore(&rq->lock, flags);

8437

}

8437

}

8438

#else /* !CONFIG_FAIR_GROUP_SCHED */

8438

#else /* !CONFIG_FAIR_GROUP_SCHED */

8439

/* !CONFIG_FAIR_GROUP_SCHED: there is no CFS group state to free. */
static inline void free_fair_sched_group(struct task_group *tg)
{
}

8442

8443

/* !CONFIG_FAIR_GROUP_SCHED: nothing to allocate, always reports success (1). */
static inline
int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}

8448

8449

/* !CONFIG_FAIR_GROUP_SCHED: no leaf cfs_rq list to unregister from. */
static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
{
}

8452

#endif /* CONFIG_FAIR_GROUP_SCHED */

8452

#endif /* CONFIG_FAIR_GROUP_SCHED */

8453

8454

#ifdef CONFIG_RT_GROUP_SCHED

8454

#ifdef CONFIG_RT_GROUP_SCHED

8455

static void free_rt_sched_group(struct task_group *tg)

8455

static void free_rt_sched_group(struct task_group *tg)

8456

{

8456

{

8457

int i;

8457

int i;

8458

8459

if (tg->rt_se)

8459

if (tg->rt_se)

8460

destroy_rt_bandwidth(&tg->rt_bandwidth);

8460

destroy_rt_bandwidth(&tg->rt_bandwidth);

8461

8462

for_each_possible_cpu(i) {

8462

for_each_possible_cpu(i) {

8463

if (tg->rt_rq)

8463

if (tg->rt_rq)

8464

kfree(tg->rt_rq[i]);

8464

kfree(tg->rt_rq[i]);

8465

if (tg->rt_se)

8465

if (tg->rt_se)

8466

kfree(tg->rt_se[i]);

8466

kfree(tg->rt_se[i]);

8467

}

8467

}

8468

8469

kfree(tg->rt_rq);

8469

kfree(tg->rt_rq);

8470

kfree(tg->rt_se);

8470

kfree(tg->rt_se);

8471

}

8471

}

8472

8473

static

8473

static

8474

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)

8474

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)

8475

{

8475

{

8476

struct rt_rq *rt_rq;

8476

struct rt_rq *rt_rq;

8477

struct sched_rt_entity *rt_se;

8477

struct sched_rt_entity *rt_se;

8478

int i;

8478

int i;

8479

8480

tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);

8480

tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);

8481

if (!tg->rt_rq)

8481

if (!tg->rt_rq)

8482

goto err;

8482

goto err;

8483

tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);

8483

tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);

8484

if (!tg->rt_se)

8484

if (!tg->rt_se)

8485

goto err;

8485

goto err;

8486

8487

init_rt_bandwidth(&tg->rt_bandwidth,

8487

init_rt_bandwidth(&tg->rt_bandwidth,

8488

ktime_to_ns(def_rt_bandwidth.rt_period), 0);

8488

ktime_to_ns(def_rt_bandwidth.rt_period), 0);

8489

8490

for_each_possible_cpu(i) {

8490

for_each_possible_cpu(i) {

8491

rt_rq = kzalloc_node(sizeof(struct rt_rq),

8491

rt_rq = kzalloc_node(sizeof(struct rt_rq),

8492

GFP_KERNEL, cpu_to_node(i));

8492

GFP_KERNEL, cpu_to_node(i));

8493

if (!rt_rq)

8493

if (!rt_rq)

8494

goto err;

8494

goto err;

8495

8496

rt_se = kzalloc_node(sizeof(struct sched_rt_entity),

8496

rt_se = kzalloc_node(sizeof(struct sched_rt_entity),

8497

GFP_KERNEL, cpu_to_node(i));

8497

GFP_KERNEL, cpu_to_node(i));

8498

if (!rt_se)

8498

if (!rt_se)

8499

goto err_free_rq;

8499

goto err_free_rq;

8500

8501

init_rt_rq(rt_rq, cpu_rq(i));

8501

init_rt_rq(rt_rq, cpu_rq(i));

8502

rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;

8502

rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;

8503

init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);

8503

init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);

8504

}

8504

}

8505

8506

return 1;

8506

return 1;

8507

8508

err_free_rq:

8508

err_free_rq:

8509

kfree(rt_rq);

8509

kfree(rt_rq);

8510

err:

8510

err:

8511

return 0;

8511

return 0;

8512

}

8512

}

8513

#else /* !CONFIG_RT_GROUP_SCHED */

8513

#else /* !CONFIG_RT_GROUP_SCHED */

8514

/* !CONFIG_RT_GROUP_SCHED: there is no RT group state to free. */
static inline void free_rt_sched_group(struct task_group *tg)
{
}

8517

8518

/* !CONFIG_RT_GROUP_SCHED: nothing to allocate, always reports success (1). */
static inline
int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}

8523

#endif /* CONFIG_RT_GROUP_SCHED */

8523

#endif /* CONFIG_RT_GROUP_SCHED */

8524

8525

#ifdef CONFIG_CGROUP_SCHED

8525

#ifdef CONFIG_CGROUP_SCHED

8526

/*
 * Free a task group: its CFS state, its RT state, its autogroup state
 * (via autogroup_free()), and finally the task_group structure itself.
 */
static void free_sched_group(struct task_group *tg)
{
	/* Per-class state first ... */
	free_fair_sched_group(tg);
	free_rt_sched_group(tg);
	autogroup_free(tg);

	/* ... then the group structure. */
	kfree(tg);
}

8533

8534

/* allocate runqueue etc for a new task group */

8534

/* allocate runqueue etc for a new task group */

8535

struct task_group *sched_create_group(struct task_group *parent)

8535

struct task_group *sched_create_group(struct task_group *parent)

8536

{

8536

{

8537

struct task_group *tg;

8537

struct task_group *tg;

8538

unsigned long flags;

8538

unsigned long flags;

8539

8540

tg = kzalloc(sizeof(*tg), GFP_KERNEL);

8540

tg = kzalloc(sizeof(*tg), GFP_KERNEL);

8541

if (!tg)

8541

if (!tg)

8542

return ERR_PTR(-ENOMEM);

8542

return ERR_PTR(-ENOMEM);

8543

8544

if (!alloc_fair_sched_group(tg, parent))

8544

if (!alloc_fair_sched_group(tg, parent))

8545

goto err;

8545

goto err;

8546

8547

if (!alloc_rt_sched_group(tg, parent))

8547

if (!alloc_rt_sched_group(tg, parent))

8548

goto err;

8548

goto err;

8549

8550

spin_lock_irqsave(&task_group_lock, flags);

8550

spin_lock_irqsave(&task_group_lock, flags);

8551

list_add_rcu(&tg->list, &task_groups);

8551

list_add_rcu(&tg->list, &task_groups);

8552

8553

WARN_ON(!parent); /* root should already exist */

8553

WARN_ON(!parent); /* root should already exist */

8554

8555

tg->parent = parent;

8555

tg->parent = parent;

8556

INIT_LIST_HEAD(&tg->children);

8556

INIT_LIST_HEAD(&tg->children);

8557

list_add_rcu(&tg->siblings, &parent->children);

8557

list_add_rcu(&tg->siblings, &parent->children);

8558

spin_unlock_irqrestore(&task_group_lock, flags);

8558

spin_unlock_irqrestore(&task_group_lock, flags);

8559

8560

return tg;

8560

return tg;

8561

8562

err:

8562

err:

8563

free_sched_group(tg);

8563

free_sched_group(tg);

8564

return ERR_PTR(-ENOMEM);

8564

return ERR_PTR(-ENOMEM);

8565

}

8565

}

8566

8567

/* rcu callback to free various structures associated with a task group */

8567

/* rcu callback to free various structures associated with a task group */

8568

static void free_sched_group_rcu(struct rcu_head *rhp)

8568

static void free_sched_group_rcu(struct rcu_head *rhp)

8569

{

8569

{

8570

/* now it should be safe to free those cfs_rqs */

8570

/* now it should be safe to free those cfs_rqs */

8571

free_sched_group(container_of(rhp, struct task_group, rcu));

8571

free_sched_group(container_of(rhp, struct task_group, rcu));

8572

}

8572

}

8573

8574

/* Destroy runqueue etc associated with a task group */

8574

/* Destroy runqueue etc associated with a task group */

8575

void sched_destroy_group(struct task_group *tg)

8575

void sched_destroy_group(struct task_group *tg)

8576

{

8576

{

8577

unsigned long flags;

8577

unsigned long flags;

8578

int i;

8578

int i;

8579

8580

/* end participation in shares distribution */

8580

/* end participation in shares distribution */

8581

for_each_possible_cpu(i)

8581

for_each_possible_cpu(i)

8582

unregister_fair_sched_group(tg, i);

8582

unregister_fair_sched_group(tg, i);

8583

8584

spin_lock_irqsave(&task_group_lock, flags);

8584

spin_lock_irqsave(&task_group_lock, flags);

8585

list_del_rcu(&tg->list);

8585

list_del_rcu(&tg->list);

8586

list_del_rcu(&tg->siblings);

8586

list_del_rcu(&tg->siblings);

8587

spin_unlock_irqrestore(&task_group_lock, flags);

8587

spin_unlock_irqrestore(&task_group_lock, flags);

8588

8589

/* wait for possible concurrent references to cfs_rqs complete */

8589

/* wait for possible concurrent references to cfs_rqs complete */

8590

call_rcu(&tg->rcu, free_sched_group_rcu);

8590

call_rcu(&tg->rcu, free_sched_group_rcu);

8591

}

8591

}

8592

8593

/* change task's runqueue when it moves between groups.

8593

/* change task's runqueue when it moves between groups.

8594

* The caller of this function should have put the task in its new group

8594

* The caller of this function should have put the task in its new group

8595

* by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to

8595

* by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to

8596

* reflect its new group.

8596

* reflect its new group.

8597

*/

8597

*/

8598

void sched_move_task(struct task_struct *tsk)

8598

void sched_move_task(struct task_struct *tsk)

8599

{

8599

{

8600

int on_rq, running;

8600

int on_rq, running;

8601

unsigned long flags;

8601

unsigned long flags;

8602

struct rq *rq;

8602

struct rq *rq;

8603

8604

rq = task_rq_lock(tsk, &flags);

8604

rq = task_rq_lock(tsk, &flags);

8605

8606

running = task_current(rq, tsk);

8606

running = task_current(rq, tsk);

8607

on_rq = tsk->on_rq;

8607

on_rq = tsk->on_rq;

8608

8609

if (on_rq)

8609

if (on_rq)

8610

dequeue_task(rq, tsk, 0);

8610

dequeue_task(rq, tsk, 0);

8611

if (unlikely(running))

8611

if (unlikely(running))

8612

tsk->sched_class->put_prev_task(rq, tsk);

8612

tsk->sched_class->put_prev_task(rq, tsk);

8613

8614

#ifdef CONFIG_FAIR_GROUP_SCHED

8614

#ifdef CONFIG_FAIR_GROUP_SCHED

8615

if (tsk->sched_class->task_move_group)

8615

if (tsk->sched_class->task_move_group)

8616

tsk->sched_class->task_move_group(tsk, on_rq);

8616

tsk->sched_class->task_move_group(tsk, on_rq);

8617

else

8617

else

8618

#endif

8618

#endif

8619

set_task_rq(tsk, task_cpu(tsk));

8619

set_task_rq(tsk, task_cpu(tsk));

8620

8621

if (unlikely(running))

8621

if (unlikely(running))

8622

tsk->sched_class->set_curr_task(rq);

8622

tsk->sched_class->set_curr_task(rq);

8623

if (on_rq)

8623

if (on_rq)

8624

enqueue_task(rq, tsk, 0);

8624

enqueue_task(rq, tsk, 0);

8625

8626

task_rq_unlock(rq, tsk, &flags);

8626

task_rq_unlock(rq, tsk, &flags);

8627

}

8627

}

8628

#endif /* CONFIG_CGROUP_SCHED */

8628

#endif /* CONFIG_CGROUP_SCHED */

8629

8630

#ifdef CONFIG_FAIR_GROUP_SCHED

8630

#ifdef CONFIG_FAIR_GROUP_SCHED

8631

static DEFINE_MUTEX(shares_mutex);

8631

static DEFINE_MUTEX(shares_mutex);

8632

8633

int sched_group_set_shares(struct task_group *tg, unsigned long shares)

8633

int sched_group_set_shares(struct task_group *tg, unsigned long shares)

8634

{

8634

{

8635

int i;

8635

int i;

8636

unsigned long flags;

8636

unsigned long flags;

8637

8638

/*

8638

/*

8639

* We can't change the weight of the root cgroup.

8639

* We can't change the weight of the root cgroup.

8640

*/

8640

*/

8641

if (!tg->se[0])

8641

if (!tg->se[0])

8642

return -EINVAL;

8642

return -EINVAL;

8643

8644

shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));

8644

shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));

8645

8646

mutex_lock(&shares_mutex);

8646

mutex_lock(&shares_mutex);

8647

if (tg->shares == shares)

8647

if (tg->shares == shares)

8648

goto done;

8648

goto done;

8649

8650

tg->shares = shares;

8650

tg->shares = shares;

8651

for_each_possible_cpu(i) {

8651

for_each_possible_cpu(i) {

8652

struct rq *rq = cpu_rq(i);

8652

struct rq *rq = cpu_rq(i);

8653

struct sched_entity *se;

8653

struct sched_entity *se;

8654

8655

se = tg->se[i];

8655

se = tg->se[i];

8656

/* Propagate contribution to hierarchy */

8656

/* Propagate contribution to hierarchy */

8657

raw_spin_lock_irqsave(&rq->lock, flags);

8657

raw_spin_lock_irqsave(&rq->lock, flags);

8658

for_each_sched_entity(se)

8658

for_each_sched_entity(se)

8659

update_cfs_shares(group_cfs_rq(se));

8659

update_cfs_shares(group_cfs_rq(se));

8660

raw_spin_unlock_irqrestore(&rq->lock, flags);

8660

raw_spin_unlock_irqrestore(&rq->lock, flags);

8661

}

8661

}

8662

8663

done:

8663

done:

8664

mutex_unlock(&shares_mutex);

8664

mutex_unlock(&shares_mutex);

8665

return 0;

8665

return 0;

8666

}

8666

}

8667

8668

unsigned long sched_group_shares(struct task_group *tg)

8668

unsigned long sched_group_shares(struct task_group *tg)

8669

{

8669

{

8670

return tg->shares;

8670

return tg->shares;

8671

}

8671

}

8672

#endif

8672

#endif

8673

8674

#ifdef CONFIG_RT_GROUP_SCHED

8674

#ifdef CONFIG_RT_GROUP_SCHED

8675

/*

8675

/*

8676

* Ensure that the real time constraints are schedulable.

8676

* Ensure that the real time constraints are schedulable.

8677

*/

8677

*/

8678

static DEFINE_MUTEX(rt_constraints_mutex);

8678

static DEFINE_MUTEX(rt_constraints_mutex);

8679

8680

static unsigned long to_ratio(u64 period, u64 runtime)

8680

static unsigned long to_ratio(u64 period, u64 runtime)

8681

{

8681

{

8682

if (runtime == RUNTIME_INF)

8682

if (runtime == RUNTIME_INF)

8683

return 1ULL << 20;

8683

return 1ULL << 20;

8684

8685

return div64_u64(runtime << 20, period);

8685

return div64_u64(runtime << 20, period);

8686

}

8686

}

8687

8688

/* Must be called with tasklist_lock held */

8688

/* Must be called with tasklist_lock held */

8689

static inline int tg_has_rt_tasks(struct task_group *tg)

8689

static inline int tg_has_rt_tasks(struct task_group *tg)

8690

{

8690

{

8691

struct task_struct *g, *p;

8691

struct task_struct *g, *p;

8692

8693

do_each_thread(g, p) {

8693

do_each_thread(g, p) {

8694

if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)

8694

if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)

8695

return 1;

8695

return 1;

8696

} while_each_thread(g, p);

8696

} while_each_thread(g, p);

8697

8698

return 0;

8698

return 0;

8699

}

8699

}

8700

8701

struct rt_schedulable_data {

8701

struct rt_schedulable_data {

8702

struct task_group *tg;

8702

struct task_group *tg;

8703

u64 rt_period;

8703

u64 rt_period;

8704

u64 rt_runtime;

8704

u64 rt_runtime;

8705

};

8705

};

8706

8707

static int tg_schedulable(struct task_group *tg, void *data)

8707

static int tg_schedulable(struct task_group *tg, void *data)

8708

{

8708

{

8709

struct rt_schedulable_data *d = data;

8709

struct rt_schedulable_data *d = data;

8710

struct task_group *child;

8710

struct task_group *child;

8711

unsigned long total, sum = 0;

8711

unsigned long total, sum = 0;

8712

u64 period, runtime;

8712

u64 period, runtime;

8713

8714

period = ktime_to_ns(tg->rt_bandwidth.rt_period);

8714

period = ktime_to_ns(tg->rt_bandwidth.rt_period);

8715

runtime = tg->rt_bandwidth.rt_runtime;

8715

runtime = tg->rt_bandwidth.rt_runtime;

8716

8717

if (tg == d->tg) {

8717

if (tg == d->tg) {

8718

period = d->rt_period;

8718

period = d->rt_period;

8719

runtime = d->rt_runtime;

8719

runtime = d->rt_runtime;

8720

}

8720

}

8721

8722

/*

8722

/*

8723

* Cannot have more runtime than the period.

8723

* Cannot have more runtime than the period.

8724

*/

8724

*/

8725

if (runtime > period && runtime != RUNTIME_INF)

8725

if (runtime > period && runtime != RUNTIME_INF)

8726

return -EINVAL;

8726

return -EINVAL;

8727

8728

/*

8728

/*

8729

* Ensure we don't starve existing RT tasks.

8729

* Ensure we don't starve existing RT tasks.

8730

*/

8730

*/

8731

if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))

8731

if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))

8732

return -EBUSY;

8732

return -EBUSY;

8733

8734

total = to_ratio(period, runtime);

8734

total = to_ratio(period, runtime);

8735

8736

/*

8736

/*

8737

* Nobody can have more than the global setting allows.

8737

* Nobody can have more than the global setting allows.

8738

*/

8738

*/

8739

if (total > to_ratio(global_rt_period(), global_rt_runtime()))

8739

if (total > to_ratio(global_rt_period(), global_rt_runtime()))

8740

return -EINVAL;

8740

return -EINVAL;

8741

8742

/*

8742

/*

8743

* The sum of our children's runtime should not exceed our own.

8743

* The sum of our children's runtime should not exceed our own.

8744

*/

8744

*/

8745

list_for_each_entry_rcu(child, &tg->children, siblings) {

8745

list_for_each_entry_rcu(child, &tg->children, siblings) {

8746

period = ktime_to_ns(child->rt_bandwidth.rt_period);

8746

period = ktime_to_ns(child->rt_bandwidth.rt_period);

8747

runtime = child->rt_bandwidth.rt_runtime;

8747

runtime = child->rt_bandwidth.rt_runtime;

8748

8749

if (child == d->tg) {

8749

if (child == d->tg) {

8750

period = d->rt_period;

8750

period = d->rt_period;

8751

runtime = d->rt_runtime;

8751

runtime = d->rt_runtime;

8752

}

8752

}

8753

8754

sum += to_ratio(period, runtime);

8754

sum += to_ratio(period, runtime);

8755

}

8755

}

8756

8757

if (sum > total)

8757

if (sum > total)

8758

return -EINVAL;

8758

return -EINVAL;

8759

8760

return 0;

8760

return 0;

8761

}

8761

}

8762

8763

/*
 * Run tg_schedulable() over the task-group tree, pretending that @tg
 * already uses @period/@runtime. Returns whatever walk_tg_tree() returns.
 */
static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
	struct rt_schedulable_data req;

	req.tg = tg;
	req.rt_period = period;
	req.rt_runtime = runtime;

	return walk_tg_tree(tg_schedulable, tg_nop, &req);
}

8773

8774

/*
 * Validate and then install a new RT period/runtime pair for @tg.
 *
 * The schedulability check runs under rt_constraints_mutex and
 * tasklist_lock. On success the group's bandwidth and the rt_runtime of
 * each possible CPU's rt_rq are updated under the respective runtime
 * locks. Returns 0 or the error from __rt_schedulable().
 */
static int tg_set_bandwidth(struct task_group *tg,
		u64 rt_period, u64 rt_runtime)
{
	int cpu, ret;

	mutex_lock(&rt_constraints_mutex);
	read_lock(&tasklist_lock);

	ret = __rt_schedulable(tg, rt_period, rt_runtime);
	if (!ret) {
		raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
		tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
		tg->rt_bandwidth.rt_runtime = rt_runtime;

		for_each_possible_cpu(cpu) {
			struct rt_rq *rt_rq = tg->rt_rq[cpu];

			raw_spin_lock(&rt_rq->rt_runtime_lock);
			rt_rq->rt_runtime = rt_runtime;
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
		}
		raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
	}

	read_unlock(&tasklist_lock);
	mutex_unlock(&rt_constraints_mutex);

	return ret;
}

8803

8804

int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)

8804

int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)

8805

{

8805

{

8806

u64 rt_runtime, rt_period;

8806

u64 rt_runtime, rt_period;

8807

8808

rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);

8808

rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);

8809

rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;

8809

rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;

8810

if (rt_runtime_us < 0)

8810

if (rt_runtime_us < 0)

8811

rt_runtime = RUNTIME_INF;

8811

rt_runtime = RUNTIME_INF;

8812

8813

return tg_set_bandwidth(tg, rt_period, rt_runtime);

8813

return tg_set_bandwidth(tg, rt_period, rt_runtime);

8814

}

8814

}

8815

8816

long sched_group_rt_runtime(struct task_group *tg)

8816

long sched_group_rt_runtime(struct task_group *tg)

8817

{

8817

{

8818

u64 rt_runtime_us;

8818

u64 rt_runtime_us;

8819

8820

if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)

8820

if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)

8821

return -1;

8821

return -1;

8822

8823

rt_runtime_us = tg->rt_bandwidth.rt_runtime;

8823

rt_runtime_us = tg->rt_bandwidth.rt_runtime;

8824

do_div(rt_runtime_us, NSEC_PER_USEC);

8824

do_div(rt_runtime_us, NSEC_PER_USEC);

8825

return rt_runtime_us;

8825

return rt_runtime_us;

8826

}

8826

}

8827

8828

int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)

8828

int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)

8829

{

8829

{

8830

u64 rt_runtime, rt_period;

8830

u64 rt_runtime, rt_period;

8831

8832

rt_period = (u64)rt_period_us * NSEC_PER_USEC;

8832

rt_period = (u64)rt_period_us * NSEC_PER_USEC;

8833

rt_runtime = tg->rt_bandwidth.rt_runtime;

8833

rt_runtime = tg->rt_bandwidth.rt_runtime;

8834

8835

if (rt_period == 0)

8835

if (rt_period == 0)

8836

return -EINVAL;

8836

return -EINVAL;

8837

8838

return tg_set_bandwidth(tg, rt_period, rt_runtime);

8838

return tg_set_bandwidth(tg, rt_period, rt_runtime);

8839

}

8839

}

8840

8841

long sched_group_rt_period(struct task_group *tg)

8841

long sched_group_rt_period(struct task_group *tg)

8842

{

8842

{

8843

u64 rt_period_us;

8843

u64 rt_period_us;

8844

8845

rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);

8845

rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);

8846

do_div(rt_period_us, NSEC_PER_USEC);

8846

do_div(rt_period_us, NSEC_PER_USEC);

8847

return rt_period_us;

8847

return rt_period_us;

8848

}

8848

}

8849

8850

static int sched_rt_global_constraints(void)

8850

static int sched_rt_global_constraints(void)

8851

{

8851

{

8852

u64 runtime, period;

8852

u64 runtime, period;

8853

int ret = 0;

8853

int ret = 0;

8854

8855

if (sysctl_sched_rt_period <= 0)

8855

if (sysctl_sched_rt_period <= 0)

8856

return -EINVAL;

8856

return -EINVAL;

8857

8858

runtime = global_rt_runtime();

8858

runtime = global_rt_runtime();

8859

period = global_rt_period();

8859

period = global_rt_period();

8860

8861

/*

8861

/*

8862

* Sanity check on the sysctl variables.

8862

* Sanity check on the sysctl variables.

8863

*/

8863

*/

8864

if (runtime > period && runtime != RUNTIME_INF)

8864

if (runtime > period && runtime != RUNTIME_INF)

8865

return -EINVAL;

8865

return -EINVAL;

8866

8867

mutex_lock(&rt_constraints_mutex);

8867

mutex_lock(&rt_constraints_mutex);

8868

read_lock(&tasklist_lock);

8868

read_lock(&tasklist_lock);

8869

ret = __rt_schedulable(NULL, 0, 0);

8869

ret = __rt_schedulable(NULL, 0, 0);

8870

read_unlock(&tasklist_lock);

8870

read_unlock(&tasklist_lock);

8871

mutex_unlock(&rt_constraints_mutex);

8871

mutex_unlock(&rt_constraints_mutex);

8872

8873

return ret;

8873

return ret;

8874

}

8874

}

8875

8876

int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)

8876

int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)

8877

{

8877

{

8878

/* Don't accept realtime tasks when there is no way for them to run */

8878

/* Don't accept realtime tasks when there is no way for them to run */

8879

if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)

8879

if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)

8880

return 0;

8880

return 0;

8881

8882

return 1;

8882

return 1;

8883

}

8883

}

8884

8885

#else /* !CONFIG_RT_GROUP_SCHED */

8885

#else /* !CONFIG_RT_GROUP_SCHED */

8886

static int sched_rt_global_constraints(void)

8886

static int sched_rt_global_constraints(void)

8887

{

8887

{

8888

unsigned long flags;

8888

unsigned long flags;

8889

int i;

8889

int i;

8890

8891

if (sysctl_sched_rt_period <= 0)

8891

if (sysctl_sched_rt_period <= 0)

8892

return -EINVAL;

8892

return -EINVAL;

8893

8894

/*

8894

/*

8895

* There's always some RT tasks in the root group

8895

* There's always some RT tasks in the root group

8896

* -- migration, kstopmachine etc..

8896

* -- migration, kstopmachine etc..

8897

*/

8897

*/

8898

if (sysctl_sched_rt_runtime == 0)

8898

if (sysctl_sched_rt_runtime == 0)

8899

return -EBUSY;

8899

return -EBUSY;

8900

8901

raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);

8901

raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);

8902

for_each_possible_cpu(i) {

8902

for_each_possible_cpu(i) {

8903

struct rt_rq *rt_rq = &cpu_rq(i)->rt;

8903

struct rt_rq *rt_rq = &cpu_rq(i)->rt;

8904

8905

raw_spin_lock(&rt_rq->rt_runtime_lock);

8905

raw_spin_lock(&rt_rq->rt_runtime_lock);

8906

rt_rq->rt_runtime = global_rt_runtime();

8906

rt_rq->rt_runtime = global_rt_runtime();

8907

raw_spin_unlock(&rt_rq->rt_runtime_lock);

8907

raw_spin_unlock(&rt_rq->rt_runtime_lock);

8908

}

8908

}

8909

raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);

8909

raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);

8910

8911

return 0;

8911

return 0;

8912

}

8912

}

8913

#endif /* CONFIG_RT_GROUP_SCHED */

8913

#endif /* CONFIG_RT_GROUP_SCHED */

8914

8915

int sched_rt_handler(struct ctl_table *table, int write,

8915

int sched_rt_handler(struct ctl_table *table, int write,

8916

void __user *buffer, size_t *lenp,

8916

void __user *buffer, size_t *lenp,

8917

loff_t *ppos)

8917

loff_t *ppos)

8918

{

8918

{

8919

int ret;

8919

int ret;

8920

int old_period, old_runtime;

8920

int old_period, old_runtime;

8921

static DEFINE_MUTEX(mutex);

8921

static DEFINE_MUTEX(mutex);

8922

8923

mutex_lock(&mutex);

8923

mutex_lock(&mutex);

8924

old_period = sysctl_sched_rt_period;

8924

old_period = sysctl_sched_rt_period;

8925

old_runtime = sysctl_sched_rt_runtime;

8925

old_runtime = sysctl_sched_rt_runtime;

8926

8927

ret = proc_dointvec(table, write, buffer, lenp, ppos);

8927

ret = proc_dointvec(table, write, buffer, lenp, ppos);

8928

8929

if (!ret && write) {

8929

if (!ret && write) {

8930

ret = sched_rt_global_constraints();

8930

ret = sched_rt_global_constraints();

8931

if (ret) {

8931

if (ret) {

8932

sysctl_sched_rt_period = old_period;

8932

sysctl_sched_rt_period = old_period;

8933

sysctl_sched_rt_runtime = old_runtime;

8933

sysctl_sched_rt_runtime = old_runtime;

8934

} else {

8934

} else {

8935

def_rt_bandwidth.rt_runtime = global_rt_runtime();

8935

def_rt_bandwidth.rt_runtime = global_rt_runtime();

8936

def_rt_bandwidth.rt_period =

8936

def_rt_bandwidth.rt_period =

8937

ns_to_ktime(global_rt_period());

8937

ns_to_ktime(global_rt_period());

8938

}

8938

}

8939

}

8939

}

8940

mutex_unlock(&mutex);

8940

mutex_unlock(&mutex);

8941

8942

return ret;

8942

return ret;

8943

}

8943

}

8944

8945

#ifdef CONFIG_CGROUP_SCHED

8945

#ifdef CONFIG_CGROUP_SCHED

8946

8947

/* return corresponding task_group object of a cgroup */

8947

/* return corresponding task_group object of a cgroup */

8948

static inline struct task_group *cgroup_tg(struct cgroup *cgrp)

8948

static inline struct task_group *cgroup_tg(struct cgroup *cgrp)

8949

{

8949

{

8950

return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),

8950

return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),

8951

struct task_group, css);

8951

struct task_group, css);

8952

}

8952

}

8953

8954

static struct cgroup_subsys_state *

8954

static struct cgroup_subsys_state *

8955

cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)

8955

cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)

8956

{

8956

{

8957

struct task_group *tg, *parent;

8957

struct task_group *tg, *parent;

8958

8959

if (!cgrp->parent) {

8959

if (!cgrp->parent) {

8960

/* This is early initialization for the top cgroup */

8960

/* This is early initialization for the top cgroup */

8961

return &root_task_group.css;

8961

return &root_task_group.css;

8962

}

8962

}

8963

8964

parent = cgroup_tg(cgrp->parent);

8964

parent = cgroup_tg(cgrp->parent);

8965

tg = sched_create_group(parent);

8965

tg = sched_create_group(parent);

8966

if (IS_ERR(tg))

8966

if (IS_ERR(tg))

8967

return ERR_PTR(-ENOMEM);

8967

return ERR_PTR(-ENOMEM);

8968

8969

return &tg->css;

8969

return &tg->css;

8970

}

8970

}

8971

8972

/* cgroup "destroy" callback: tear down the task group backing @cgrp. */
static void
cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	sched_destroy_group(cgroup_tg(cgrp));
}

8979

8980

static int

8980

static int

8981

cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)

8981

cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)

8982

{

8982

{

8983

#ifdef CONFIG_RT_GROUP_SCHED

8983

#ifdef CONFIG_RT_GROUP_SCHED

8984

if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))

8984

if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))

8985

return -EINVAL;

8985

return -EINVAL;

8986

#else

8986

#else

8987

/* We don't support RT-tasks being in separate groups */

8987

/* We don't support RT-tasks being in separate groups */

8988

if (tsk->sched_class != &fair_sched_class)

8988

if (tsk->sched_class != &fair_sched_class)

8989

return -EINVAL;

8989

return -EINVAL;

8990

#endif

8990

#endif

8991

return 0;

8991

return 0;

8992

}

8992

}

8993

8994

/* cgroup "attach_task" callback: hand @tsk to sched_move_task(). */
static void
cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
	sched_move_task(tsk);
}

8999

9000

static void

9000

static void

9001

cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,

9001

cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,

9002

struct cgroup *old_cgrp, struct task_struct *task)

9002

struct cgroup *old_cgrp, struct task_struct *task)

9003

{

9003

{

9004

/*

9004

/*

9005

* cgroup_exit() is called in the copy_process() failure path.

9005

* cgroup_exit() is called in the copy_process() failure path.

9006

* Ignore this case since the task hasn't ran yet, this avoids

9006

* Ignore this case since the task hasn't ran yet, this avoids

9007

* trying to poke a half freed task state from generic code.

9007

* trying to poke a half freed task state from generic code.

9008

*/

9008

*/

9009

if (!(task->flags & PF_EXITING))

9009

if (!(task->flags & PF_EXITING))

9010

return;

9010

return;

9011

9012

sched_move_task(task);

9012

sched_move_task(task);

9013

}

9013

}

9014

9015

#ifdef CONFIG_FAIR_GROUP_SCHED

9015

#ifdef CONFIG_FAIR_GROUP_SCHED

9016

static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,

9016

static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,

9017

u64 shareval)

9017

u64 shareval)

9018

{

9018

{

9019

return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));

9019

return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));

9020

}

9020

}

9021

9022

static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)

9022

static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)

9023

{

9023

{

9024

struct task_group *tg = cgroup_tg(cgrp);

9024

struct task_group *tg = cgroup_tg(cgrp);

9025

9026

return (u64) scale_load_down(tg->shares);

9026

return (u64) scale_load_down(tg->shares);

9027

}

9027

}

9028

#endif /* CONFIG_FAIR_GROUP_SCHED */

9028

#endif /* CONFIG_FAIR_GROUP_SCHED */

9029

9030

#ifdef CONFIG_RT_GROUP_SCHED

9030

#ifdef CONFIG_RT_GROUP_SCHED

9031

static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,

9031

static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,

9032

s64 val)

9032

s64 val)

9033

{

9033

{

9034

return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);

9034

return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);

9035

}

9035

}

9036

9037

/* Read handler for "rt_runtime_us": report the group's RT runtime. */
static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
{
	struct task_group *tg = cgroup_tg(cgrp);

	return sched_group_rt_runtime(tg);
}

9041

9042

static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,

9042

static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,

9043

u64 rt_period_us)

9043

u64 rt_period_us)

9044

{

9044

{

9045

return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);

9045

return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);

9046

}

9046

}

9047

9048

/* Read handler for "rt_period_us": report the group's RT period. */
static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
{
	struct task_group *tg = cgroup_tg(cgrp);

	return sched_group_rt_period(tg);
}

9052

#endif /* CONFIG_RT_GROUP_SCHED */

9052

#endif /* CONFIG_RT_GROUP_SCHED */

9053

9054

/*
 * Control files registered by cpu_cgroup_populate(); which entries
 * exist depends on the group-scheduling config options.
 */
static struct cftype cpu_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
	{
		.name		= "shares",
		.read_u64	= cpu_shares_read_u64,
		.write_u64	= cpu_shares_write_u64,
	},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	{
		.name		= "rt_runtime_us",
		.read_s64	= cpu_rt_runtime_read,
		.write_s64	= cpu_rt_runtime_write,
	},
	{
		.name		= "rt_period_us",
		.read_u64	= cpu_rt_period_read_uint,
		.write_u64	= cpu_rt_period_write_uint,
	},
#endif
};

9075

9076

static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)

9076

static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)

9077

{

9077

{

9078

return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));

9078

return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));

9079

}

9079

}

9080

9081

struct cgroup_subsys cpu_cgroup_subsys = {

9081

struct cgroup_subsys cpu_cgroup_subsys = {

9082

.name = "cpu",

9082

.name = "cpu",

9083

.create = cpu_cgroup_create,

9083

.create = cpu_cgroup_create,

9084

.destroy = cpu_cgroup_destroy,

9084

.destroy = cpu_cgroup_destroy,

9085

.can_attach_task = cpu_cgroup_can_attach_task,

9085

.can_attach_task = cpu_cgroup_can_attach_task,

9086

.attach_task = cpu_cgroup_attach_task,

9086

.attach_task = cpu_cgroup_attach_task,

9087

.exit = cpu_cgroup_exit,

9087

.exit = cpu_cgroup_exit,

9088

.populate = cpu_cgroup_populate,

9088

.populate = cpu_cgroup_populate,

9089

.subsys_id = cpu_cgroup_subsys_id,

9089

.subsys_id = cpu_cgroup_subsys_id,

9090

.early_init = 1,

9090

.early_init = 1,

9091

};

9091

};

9092

9093

#endif /* CONFIG_CGROUP_SCHED */

9093

#endif /* CONFIG_CGROUP_SCHED */

9094

9095

#ifdef CONFIG_CGROUP_CPUACCT

9095

#ifdef CONFIG_CGROUP_CPUACCT

9096

9097

/*

9097

/*

9098

* CPU accounting code for task groups.

9098

* CPU accounting code for task groups.

9099

*

9099

*

9100

* Based on the work by Paul Menage (menage@google.com) and Balbir Singh

9100

* Based on the work by Paul Menage (menage@google.com) and Balbir Singh

9101

* (balbir@in.ibm.com).

9101

* (balbir@in.ibm.com).

9102

*/

9102

*/

9103

9104

/* track cpu usage of a group of tasks and its child groups */

9104

/* track cpu usage of a group of tasks and its child groups */

9105

struct cpuacct {

9105

struct cpuacct {

9106

struct cgroup_subsys_state css;

9106

struct cgroup_subsys_state css;

9107

/* cpuusage holds pointer to a u64-type object on every cpu */

9107

/* cpuusage holds pointer to a u64-type object on every cpu */

9108

u64 __percpu *cpuusage;

9108

u64 __percpu *cpuusage;

9109

struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];

9109

struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];

9110

struct cpuacct *parent;

9110

struct cpuacct *parent;

9111

};

9111

};

9112

9113

struct cgroup_subsys cpuacct_subsys;

9113

struct cgroup_subsys cpuacct_subsys;

9114

9115

/* return cpu accounting group corresponding to this container */

9115

/* return cpu accounting group corresponding to this container */

9116

static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)

9116

static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)

9117

{

9117

{

9118

return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),

9118

return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),

9119

struct cpuacct, css);

9119

struct cpuacct, css);

9120

}

9120

}

9121

9122

/* return cpu accounting group to which this task belongs */

9122

/* return cpu accounting group to which this task belongs */

9123

static inline struct cpuacct *task_ca(struct task_struct *tsk)

9123

static inline struct cpuacct *task_ca(struct task_struct *tsk)

9124

{

9124

{

9125

return container_of(task_subsys_state(tsk, cpuacct_subsys_id),

9125

return container_of(task_subsys_state(tsk, cpuacct_subsys_id),

9126

struct cpuacct, css);

9126

struct cpuacct, css);

9127

}

9127

}

9128

9129

/* create a new cpu accounting group */

9129

/* create a new cpu accounting group */

9130

static struct cgroup_subsys_state *cpuacct_create(

9130

static struct cgroup_subsys_state *cpuacct_create(

9131

struct cgroup_subsys *ss, struct cgroup *cgrp)

9131

struct cgroup_subsys *ss, struct cgroup *cgrp)

9132

{

9132

{

9133

struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);

9133

struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);

9134

int i;

9134

int i;

9135

9136

if (!ca)

9136

if (!ca)

9137

goto out;

9137

goto out;

9138

9139

ca->cpuusage = alloc_percpu(u64);

9139

ca->cpuusage = alloc_percpu(u64);

9140

if (!ca->cpuusage)

9140

if (!ca->cpuusage)

9141

goto out_free_ca;

9141

goto out_free_ca;

9142

9143

for (i = 0; i < CPUACCT_STAT_NSTATS; i++)

9143

for (i = 0; i < CPUACCT_STAT_NSTATS; i++)

9144

if (percpu_counter_init(&ca->cpustat[i], 0))

9144

if (percpu_counter_init(&ca->cpustat[i], 0))

9145

goto out_free_counters;

9145

goto out_free_counters;

9146

9147

if (cgrp->parent)

9147

if (cgrp->parent)

9148

ca->parent = cgroup_ca(cgrp->parent);

9148

ca->parent = cgroup_ca(cgrp->parent);

9149

9150

return &ca->css;

9150

return &ca->css;

9151

9152

out_free_counters:

9152

out_free_counters:

9153

while (--i >= 0)

9153

while (--i >= 0)

9154

percpu_counter_destroy(&ca->cpustat[i]);

9154

percpu_counter_destroy(&ca->cpustat[i]);

9155

free_percpu(ca->cpuusage);

9155

free_percpu(ca->cpuusage);

9156

out_free_ca:

9156

out_free_ca:

9157

kfree(ca);

9157

kfree(ca);

9158

out:

9158

out:

9159

return ERR_PTR(-ENOMEM);

9159

return ERR_PTR(-ENOMEM);

9160

}

9160

}

9161

9162

/* destroy an existing cpu accounting group */

9162

/* destroy an existing cpu accounting group */

9163

static void

9163

static void

9164

cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)

9164

cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)

9165

{

9165

{

9166

struct cpuacct *ca = cgroup_ca(cgrp);

9166

struct cpuacct *ca = cgroup_ca(cgrp);

9167

int i;

9167

int i;

9168

9169

for (i = 0; i < CPUACCT_STAT_NSTATS; i++)

9169

for (i = 0; i < CPUACCT_STAT_NSTATS; i++)

9170

percpu_counter_destroy(&ca->cpustat[i]);

9170

percpu_counter_destroy(&ca->cpustat[i]);

9171

free_percpu(ca->cpuusage);

9171

free_percpu(ca->cpuusage);

9172

kfree(ca);

9172

kfree(ca);

9173

}

9173

}

9174

9175

static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)

9175

static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)

9176

{

9176

{

9177

u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);

9177

u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);

9178

u64 data;

9178

u64 data;

9179

9180

#ifndef CONFIG_64BIT

9180

#ifndef CONFIG_64BIT

9181

/*

9181

/*

9182

* Take rq->lock to make 64-bit read safe on 32-bit platforms.

9182

* Take rq->lock to make 64-bit read safe on 32-bit platforms.

9183

*/

9183

*/

9184

raw_spin_lock_irq(&cpu_rq(cpu)->lock);

9184

raw_spin_lock_irq(&cpu_rq(cpu)->lock);

9185

data = *cpuusage;

9185

data = *cpuusage;

9186

raw_spin_unlock_irq(&cpu_rq(cpu)->lock);

9186

raw_spin_unlock_irq(&cpu_rq(cpu)->lock);

9187

#else

9187

#else

9188

data = *cpuusage;

9188

data = *cpuusage;

9189

#endif

9189

#endif

9190

9191

return data;

9191

return data;

9192

}

9192

}

9193

9194

/* Overwrite the usage value that @ca holds for @cpu with @val. */
static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
{
	u64 *slot = per_cpu_ptr(ca->cpuusage, cpu);

#ifdef CONFIG_64BIT
	*slot = val;
#else
	/*
	 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
	 */
	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
	*slot = val;
	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
#endif
}

9209

9210

/* return total cpu usage (in nanoseconds) of a group */

9210

/* return total cpu usage (in nanoseconds) of a group */

9211

static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)

9211

static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)

9212

{

9212

{

9213

struct cpuacct *ca = cgroup_ca(cgrp);

9213

struct cpuacct *ca = cgroup_ca(cgrp);

9214

u64 totalcpuusage = 0;

9214

u64 totalcpuusage = 0;

9215

int i;

9215

int i;

9216

9217

for_each_present_cpu(i)

9217

for_each_present_cpu(i)

9218

totalcpuusage += cpuacct_cpuusage_read(ca, i);

9218

totalcpuusage += cpuacct_cpuusage_read(ca, i);

9219

9220

return totalcpuusage;

9220

return totalcpuusage;

9221

}

9221

}

9222

9223

static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,

9223

static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,

9224

u64 reset)

9224

u64 reset)

9225

{

9225

{

9226

struct cpuacct *ca = cgroup_ca(cgrp);

9226

struct cpuacct *ca = cgroup_ca(cgrp);

9227

int err = 0;

9227

int err = 0;

9228

int i;

9228

int i;

9229

9230

if (reset) {

9230

if (reset) {

9231

err = -EINVAL;

9231

err = -EINVAL;

9232

goto out;

9232

goto out;

9233

}

9233

}

9234

9235

for_each_present_cpu(i)

9235

for_each_present_cpu(i)

9236

cpuacct_cpuusage_write(ca, i, 0);

9236

cpuacct_cpuusage_write(ca, i, 0);

9237

9238

out:

9238

out:

9239

return err;

9239

return err;

9240

}

9240

}

9241

9242

static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,

9242

static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,

9243

struct seq_file *m)

9243

struct seq_file *m)

9244

{

9244

{

9245

struct cpuacct *ca = cgroup_ca(cgroup);

9245

struct cpuacct *ca = cgroup_ca(cgroup);

9246

u64 percpu;

9246

u64 percpu;

9247

int i;

9247

int i;

9248

9249

for_each_present_cpu(i) {

9249

for_each_present_cpu(i) {

9250

percpu = cpuacct_cpuusage_read(ca, i);

9250

percpu = cpuacct_cpuusage_read(ca, i);

9251

seq_printf(m, "%llu ", (unsigned long long) percpu);

9251

seq_printf(m, "%llu ", (unsigned long long) percpu);

9252

}

9252

}

9253

seq_printf(m, "\n");

9253

seq_printf(m, "\n");

9254

return 0;

9254

return 0;

9255

}

9255

}

9256

9257

static const char *cpuacct_stat_desc[] = {

9257

static const char *cpuacct_stat_desc[] = {

9258

[CPUACCT_STAT_USER] = "user",

9258

[CPUACCT_STAT_USER] = "user",

9259

[CPUACCT_STAT_SYSTEM] = "system",

9259

[CPUACCT_STAT_SYSTEM] = "system",

9260

};

9260

};

9261

9262

static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,

9262

static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,

9263

struct cgroup_map_cb *cb)

9263

struct cgroup_map_cb *cb)

9264

{

9264

{

9265

struct cpuacct *ca = cgroup_ca(cgrp);

9265

struct cpuacct *ca = cgroup_ca(cgrp);

9266

int i;

9266

int i;

9267

9268

for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {

9268

for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {

9269

s64 val = percpu_counter_read(&ca->cpustat[i]);

9269

s64 val = percpu_counter_read(&ca->cpustat[i]);

9270

val = cputime64_to_clock_t(val);

9270

val = cputime64_to_clock_t(val);

9271

cb->fill(cb, cpuacct_stat_desc[i], val);

9271

cb->fill(cb, cpuacct_stat_desc[i], val);

9272

}

9272

}

9273

return 0;

9273

return 0;

9274

}

9274

}

9275

9276

static struct cftype files[] = {

9276

static struct cftype files[] = {

9277

{

9277

{

9278

.name = "usage",

9278

.name = "usage",

9279

.read_u64 = cpuusage_read,

9279

.read_u64 = cpuusage_read,

9280

.write_u64 = cpuusage_write,

9280

.write_u64 = cpuusage_write,

9281

},

9281

},

9282

{

9282

{

9283

.name = "usage_percpu",

9283

.name = "usage_percpu",

9284

.read_seq_string = cpuacct_percpu_seq_read,

9284

.read_seq_string = cpuacct_percpu_seq_read,

9285

},

9285

},

9286

{

9286

{

9287

.name = "stat",

9287

.name = "stat",

9288

.read_map = cpuacct_stats_show,

9288

.read_map = cpuacct_stats_show,

9289

},

9289

},

9290

};

9290

};

9291

9292

static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)

9292

static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)

9293

{

9293

{

9294

return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));

9294

return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));

9295

}

9295

}

9296

9297

/*

9297

/*

9298

* charge this task's execution time to its accounting group.

9298

* charge this task's execution time to its accounting group.

9299

*

9299

*

9300

* called with rq->lock held.

9300

* called with rq->lock held.

9301

*/

9301

*/

9302

static void cpuacct_charge(struct task_struct *tsk, u64 cputime)

9302

static void cpuacct_charge(struct task_struct *tsk, u64 cputime)

9303

{

9303

{

9304

struct cpuacct *ca;

9304

struct cpuacct *ca;

9305

int cpu;

9305

int cpu;

9306

9307

if (unlikely(!cpuacct_subsys.active))

9307

if (unlikely(!cpuacct_subsys.active))

9308

return;

9308

return;

9309

9310

cpu = task_cpu(tsk);

9310

cpu = task_cpu(tsk);

9311

9312

rcu_read_lock();

9312

rcu_read_lock();

9313

9314

ca = task_ca(tsk);

9314

ca = task_ca(tsk);

9315

9316

for (; ca; ca = ca->parent) {

9316

for (; ca; ca = ca->parent) {

9317

u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);

9317

u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);

9318

*cpuusage += cputime;

9318

*cpuusage += cputime;

9319

}

9319

}

9320

9321

rcu_read_unlock();

9321

rcu_read_unlock();

9322

}

9322

}

9323

9324

/*
 * With CONFIG_VIRT_CPU_ACCOUNTING enabled, one jiffy can be a very large
 * number of cputime_t units.  cpuacct_update_stats() would then call
 * percpu_counter_add with values that always exceed the per cpu batch
 * limit, which causes bad SMP scalability.
 *
 * To avoid that, percpu_counter_batch is scaled by cputime_one_jiffy so
 * the same amount of time is batched whether CONFIG_VIRT_CPU_ACCOUNTING
 * is disabled or enabled.  The result is capped at INT_MAX, the largest
 * allowed batch value.
 */
#ifdef CONFIG_SMP
#define CPUACCT_BATCH	\
	min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
#else
#define CPUACCT_BATCH	0
#endif

9340

9341

/*

9341

/*

9342

* Charge the system/user time to the task's accounting group.

9342

* Charge the system/user time to the task's accounting group.

9343

*/

9343

*/

9344

static void cpuacct_update_stats(struct task_struct *tsk,

9344

static void cpuacct_update_stats(struct task_struct *tsk,

9345

enum cpuacct_stat_index idx, cputime_t val)

9345

enum cpuacct_stat_index idx, cputime_t val)

9346

{

9346

{

9347

struct cpuacct *ca;

9347

struct cpuacct *ca;

9348

int batch = CPUACCT_BATCH;

9348

int batch = CPUACCT_BATCH;

9349

9350

if (unlikely(!cpuacct_subsys.active))

9350

if (unlikely(!cpuacct_subsys.active))

9351

return;

9351

return;

9352

9353

rcu_read_lock();

9353

rcu_read_lock();

9354

ca = task_ca(tsk);

9354

ca = task_ca(tsk);

9355

9356

do {

9356

do {

9357

__percpu_counter_add(&ca->cpustat[idx], val, batch);

9357

__percpu_counter_add(&ca->cpustat[idx], val, batch);

9358

ca = ca->parent;

9358

ca = ca->parent;

9359

} while (ca);

9359

} while (ca);

9360

rcu_read_unlock();

9360

rcu_read_unlock();

9361

}

9361

}

9362

9363

struct cgroup_subsys cpuacct_subsys = {

9363

struct cgroup_subsys cpuacct_subsys = {

9364

.name = "cpuacct",

9364

.name = "cpuacct",

9365

.create = cpuacct_create,

9365

.create = cpuacct_create,

9366

.destroy = cpuacct_destroy,

9366

.destroy = cpuacct_destroy,

9367

.populate = cpuacct_populate,

9367

.populate = cpuacct_populate,

9368

.subsys_id = cpuacct_subsys_id,

9368

.subsys_id = cpuacct_subsys_id,

9369

};

9369

};

9370

#endif /* CONFIG_CGROUP_CPUACCT */

9370

#endif /* CONFIG_CGROUP_CPUACCT */

9371

9372

GITLAB

Eric Lee / linux-smarc-t335x-v3.2

sched: Fix up wchan borkage