Commit aec3dfcb2e43892180ee053e8c260dcdeccf4392

Authored by Tejun Heo
1 parent f392e51cd6

cgroup: introduce effective cgroup_subsys_state

In the planned default unified hierarchy, controllers may get
dynamically attached to and detached from a cgroup and a cgroup may
not have csses for all the controllers associated with the hierarchy.

When a cgroup doesn't have its own css for a given controller, the css
of the nearest ancestor with the controller enabled is used instead;
this is called the effective css.  This patch introduces
cgroup_e_css() and for_each_e_css() to access the effective csses and
converts compare_css_sets(), find_existing_css_set() and
cgroup_migrate() to use the effective csses so that they can handle
cgroups with partial csses correctly.
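
For illustration only (not part of this patch), consider a cgroup on
the planned default hierarchy whose parent has a controller enabled
while the cgroup itself has no css of its own.  A minimal sketch,
assuming hypothetical @child and @ss pointers and that cgroup_mutex is
held as cgroup_e_css() requires:

	struct cgroup_subsys_state *css;

	/*
	 *   parent    - @ss enabled, has its own css
	 *    `- child  - @ss not enabled, no css of its own
	 */
	lockdep_assert_held(&cgroup_mutex);

	css = cgroup_css(child, ss);	/* NULL: @child has no css for @ss */
	css = cgroup_e_css(child, ss);	/* the parent's css, non-NULL here */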

This means that for two css_sets to be considered identical, they
must have both matching csses and matching cgroups.  compare_css_sets()
already compares both, not for correctness but for optimization.  As
this now becomes a matter of correctness, update the comments
accordingly.
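
As an illustrative example (the cgroup and css_set names below are
hypothetical), two css_sets can end up with identical css arrays while
pointing at different cgroups, so the cgroup comparison is what
actually tells them apart:

	/*
	 *   parent                          cset P: subsys[] = parent's csses
	 *     `- child-a  (no controllers)  cset A: subsys[] = parent's csses
	 *     `- child-b  (no controllers)  cset B: subsys[] = parent's csses
	 *
	 * P, A and B have identical ->subsys[] arrays because every
	 * effective css comes from the parent, so the memcmp() in
	 * compare_css_sets() cannot distinguish them; only the linked
	 * cgroups differ.
	 */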

For all !default hierarchies, cgroup_e_css() always equals
cgroup_css(), so this patch doesn't change behavior.

While at it, fix incorrect locking comment for for_each_css().

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>

Showing 1 changed file with 64 additions and 19 deletions

1 /* 1 /*
2 * Generic process-grouping system. 2 * Generic process-grouping system.
3 * 3 *
4 * Based originally on the cpuset system, extracted by Paul Menage 4 * Based originally on the cpuset system, extracted by Paul Menage
5 * Copyright (C) 2006 Google, Inc 5 * Copyright (C) 2006 Google, Inc
6 * 6 *
7 * Notifications support 7 * Notifications support
8 * Copyright (C) 2009 Nokia Corporation 8 * Copyright (C) 2009 Nokia Corporation
9 * Author: Kirill A. Shutemov 9 * Author: Kirill A. Shutemov
10 * 10 *
11 * Copyright notices from the original cpuset code: 11 * Copyright notices from the original cpuset code:
12 * -------------------------------------------------- 12 * --------------------------------------------------
13 * Copyright (C) 2003 BULL SA. 13 * Copyright (C) 2003 BULL SA.
14 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 14 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
15 * 15 *
16 * Portions derived from Patrick Mochel's sysfs code. 16 * Portions derived from Patrick Mochel's sysfs code.
17 * sysfs is Copyright (c) 2001-3 Patrick Mochel 17 * sysfs is Copyright (c) 2001-3 Patrick Mochel
18 * 18 *
19 * 2003-10-10 Written by Simon Derr. 19 * 2003-10-10 Written by Simon Derr.
20 * 2003-10-22 Updates by Stephen Hemminger. 20 * 2003-10-22 Updates by Stephen Hemminger.
21 * 2004 May-July Rework by Paul Jackson. 21 * 2004 May-July Rework by Paul Jackson.
22 * --------------------------------------------------- 22 * ---------------------------------------------------
23 * 23 *
24 * This file is subject to the terms and conditions of the GNU General Public 24 * This file is subject to the terms and conditions of the GNU General Public
25 * License. See the file COPYING in the main directory of the Linux 25 * License. See the file COPYING in the main directory of the Linux
26 * distribution for more details. 26 * distribution for more details.
27 */ 27 */
28 28
29 #include <linux/cgroup.h> 29 #include <linux/cgroup.h>
30 #include <linux/cred.h> 30 #include <linux/cred.h>
31 #include <linux/ctype.h> 31 #include <linux/ctype.h>
32 #include <linux/errno.h> 32 #include <linux/errno.h>
33 #include <linux/init_task.h> 33 #include <linux/init_task.h>
34 #include <linux/kernel.h> 34 #include <linux/kernel.h>
35 #include <linux/list.h> 35 #include <linux/list.h>
36 #include <linux/mm.h> 36 #include <linux/mm.h>
37 #include <linux/mutex.h> 37 #include <linux/mutex.h>
38 #include <linux/mount.h> 38 #include <linux/mount.h>
39 #include <linux/pagemap.h> 39 #include <linux/pagemap.h>
40 #include <linux/proc_fs.h> 40 #include <linux/proc_fs.h>
41 #include <linux/rcupdate.h> 41 #include <linux/rcupdate.h>
42 #include <linux/sched.h> 42 #include <linux/sched.h>
43 #include <linux/slab.h> 43 #include <linux/slab.h>
44 #include <linux/spinlock.h> 44 #include <linux/spinlock.h>
45 #include <linux/rwsem.h> 45 #include <linux/rwsem.h>
46 #include <linux/string.h> 46 #include <linux/string.h>
47 #include <linux/sort.h> 47 #include <linux/sort.h>
48 #include <linux/kmod.h> 48 #include <linux/kmod.h>
49 #include <linux/delayacct.h> 49 #include <linux/delayacct.h>
50 #include <linux/cgroupstats.h> 50 #include <linux/cgroupstats.h>
51 #include <linux/hashtable.h> 51 #include <linux/hashtable.h>
52 #include <linux/pid_namespace.h> 52 #include <linux/pid_namespace.h>
53 #include <linux/idr.h> 53 #include <linux/idr.h>
54 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 54 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
55 #include <linux/kthread.h> 55 #include <linux/kthread.h>
56 #include <linux/delay.h> 56 #include <linux/delay.h>
57 57
58 #include <linux/atomic.h> 58 #include <linux/atomic.h>
59 59
60 /* 60 /*
61 * pidlists linger the following amount before being destroyed. The goal 61 * pidlists linger the following amount before being destroyed. The goal
62 * is avoiding frequent destruction in the middle of consecutive read calls 62 * is avoiding frequent destruction in the middle of consecutive read calls
63 * Expiring in the middle is a performance problem not a correctness one. 63 * Expiring in the middle is a performance problem not a correctness one.
64 * 1 sec should be enough. 64 * 1 sec should be enough.
65 */ 65 */
66 #define CGROUP_PIDLIST_DESTROY_DELAY HZ 66 #define CGROUP_PIDLIST_DESTROY_DELAY HZ
67 67
68 #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ 68 #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
69 MAX_CFTYPE_NAME + 2) 69 MAX_CFTYPE_NAME + 2)
70 70
71 /* 71 /*
72 * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file 72 * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
73 * creation/removal and hierarchy changing operations including cgroup 73 * creation/removal and hierarchy changing operations including cgroup
74 * creation, removal, css association and controller rebinding. This outer 74 * creation, removal, css association and controller rebinding. This outer
75 * lock is needed mainly to resolve the circular dependency between kernfs 75 * lock is needed mainly to resolve the circular dependency between kernfs
76 * active ref and cgroup_mutex. cgroup_tree_mutex nests above both. 76 * active ref and cgroup_mutex. cgroup_tree_mutex nests above both.
77 */ 77 */
78 static DEFINE_MUTEX(cgroup_tree_mutex); 78 static DEFINE_MUTEX(cgroup_tree_mutex);
79 79
80 /* 80 /*
81 * cgroup_mutex is the master lock. Any modification to cgroup or its 81 * cgroup_mutex is the master lock. Any modification to cgroup or its
82 * hierarchy must be performed while holding it. 82 * hierarchy must be performed while holding it.
83 * 83 *
84 * css_set_rwsem protects task->cgroups pointer, the list of css_set 84 * css_set_rwsem protects task->cgroups pointer, the list of css_set
85 * objects, and the chain of tasks off each css_set. 85 * objects, and the chain of tasks off each css_set.
86 * 86 *
87 * These locks are exported if CONFIG_PROVE_RCU so that accessors in 87 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
88 * cgroup.h can use them for lockdep annotations. 88 * cgroup.h can use them for lockdep annotations.
89 */ 89 */
90 #ifdef CONFIG_PROVE_RCU 90 #ifdef CONFIG_PROVE_RCU
91 DEFINE_MUTEX(cgroup_mutex); 91 DEFINE_MUTEX(cgroup_mutex);
92 DECLARE_RWSEM(css_set_rwsem); 92 DECLARE_RWSEM(css_set_rwsem);
93 EXPORT_SYMBOL_GPL(cgroup_mutex); 93 EXPORT_SYMBOL_GPL(cgroup_mutex);
94 EXPORT_SYMBOL_GPL(css_set_rwsem); 94 EXPORT_SYMBOL_GPL(css_set_rwsem);
95 #else 95 #else
96 static DEFINE_MUTEX(cgroup_mutex); 96 static DEFINE_MUTEX(cgroup_mutex);
97 static DECLARE_RWSEM(css_set_rwsem); 97 static DECLARE_RWSEM(css_set_rwsem);
98 #endif 98 #endif
99 99
100 /* 100 /*
101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires 101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires
102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. 102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
103 */ 103 */
104 static DEFINE_SPINLOCK(release_agent_path_lock); 104 static DEFINE_SPINLOCK(release_agent_path_lock);
105 105
106 #define cgroup_assert_mutexes_or_rcu_locked() \ 106 #define cgroup_assert_mutexes_or_rcu_locked() \
107 rcu_lockdep_assert(rcu_read_lock_held() || \ 107 rcu_lockdep_assert(rcu_read_lock_held() || \
108 lockdep_is_held(&cgroup_tree_mutex) || \ 108 lockdep_is_held(&cgroup_tree_mutex) || \
109 lockdep_is_held(&cgroup_mutex), \ 109 lockdep_is_held(&cgroup_mutex), \
110 "cgroup_[tree_]mutex or RCU read lock required"); 110 "cgroup_[tree_]mutex or RCU read lock required");
111 111
112 /* 112 /*
113 * cgroup destruction makes heavy use of work items and there can be a lot 113 * cgroup destruction makes heavy use of work items and there can be a lot
114 * of concurrent destructions. Use a separate workqueue so that cgroup 114 * of concurrent destructions. Use a separate workqueue so that cgroup
115 * destruction work items don't end up filling up max_active of system_wq 115 * destruction work items don't end up filling up max_active of system_wq
116 * which may lead to deadlock. 116 * which may lead to deadlock.
117 */ 117 */
118 static struct workqueue_struct *cgroup_destroy_wq; 118 static struct workqueue_struct *cgroup_destroy_wq;
119 119
120 /* 120 /*
121 * pidlist destructions need to be flushed on cgroup destruction. Use a 121 * pidlist destructions need to be flushed on cgroup destruction. Use a
122 * separate workqueue as flush domain. 122 * separate workqueue as flush domain.
123 */ 123 */
124 static struct workqueue_struct *cgroup_pidlist_destroy_wq; 124 static struct workqueue_struct *cgroup_pidlist_destroy_wq;
125 125
126 /* generate an array of cgroup subsystem pointers */ 126 /* generate an array of cgroup subsystem pointers */
127 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, 127 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
128 static struct cgroup_subsys *cgroup_subsys[] = { 128 static struct cgroup_subsys *cgroup_subsys[] = {
129 #include <linux/cgroup_subsys.h> 129 #include <linux/cgroup_subsys.h>
130 }; 130 };
131 #undef SUBSYS 131 #undef SUBSYS
132 132
133 /* array of cgroup subsystem names */ 133 /* array of cgroup subsystem names */
134 #define SUBSYS(_x) [_x ## _cgrp_id] = #_x, 134 #define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
135 static const char *cgroup_subsys_name[] = { 135 static const char *cgroup_subsys_name[] = {
136 #include <linux/cgroup_subsys.h> 136 #include <linux/cgroup_subsys.h>
137 }; 137 };
138 #undef SUBSYS 138 #undef SUBSYS
139 139
140 /* 140 /*
141 * The default hierarchy, reserved for the subsystems that are otherwise 141 * The default hierarchy, reserved for the subsystems that are otherwise
142 * unattached - it never has more than a single cgroup, and all tasks are 142 * unattached - it never has more than a single cgroup, and all tasks are
143 * part of that cgroup. 143 * part of that cgroup.
144 */ 144 */
145 struct cgroup_root cgrp_dfl_root; 145 struct cgroup_root cgrp_dfl_root;
146 146
147 /* 147 /*
148 * The default hierarchy always exists but is hidden until mounted for the 148 * The default hierarchy always exists but is hidden until mounted for the
149 * first time. This is for backward compatibility. 149 * first time. This is for backward compatibility.
150 */ 150 */
151 static bool cgrp_dfl_root_visible; 151 static bool cgrp_dfl_root_visible;
152 152
153 /* The list of hierarchy roots */ 153 /* The list of hierarchy roots */
154 154
155 static LIST_HEAD(cgroup_roots); 155 static LIST_HEAD(cgroup_roots);
156 static int cgroup_root_count; 156 static int cgroup_root_count;
157 157
158 /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ 158 /* hierarchy ID allocation and mapping, protected by cgroup_mutex */
159 static DEFINE_IDR(cgroup_hierarchy_idr); 159 static DEFINE_IDR(cgroup_hierarchy_idr);
160 160
161 /* 161 /*
162 * Assign a monotonically increasing serial number to cgroups. It 162 * Assign a monotonically increasing serial number to cgroups. It
163 * guarantees cgroups with bigger numbers are newer than those with smaller 163 * guarantees cgroups with bigger numbers are newer than those with smaller
164 * numbers. Also, as cgroups are always appended to the parent's 164 * numbers. Also, as cgroups are always appended to the parent's
165 * ->children list, it guarantees that sibling cgroups are always sorted in 165 * ->children list, it guarantees that sibling cgroups are always sorted in
166 * the ascending serial number order on the list. Protected by 166 * the ascending serial number order on the list. Protected by
167 * cgroup_mutex. 167 * cgroup_mutex.
168 */ 168 */
169 static u64 cgroup_serial_nr_next = 1; 169 static u64 cgroup_serial_nr_next = 1;
170 170
171 /* This flag indicates whether tasks in the fork and exit paths should 171 /* This flag indicates whether tasks in the fork and exit paths should
172 * check for fork/exit handlers to call. This avoids us having to do 172 * check for fork/exit handlers to call. This avoids us having to do
173 * extra work in the fork/exit path if none of the subsystems need to 173 * extra work in the fork/exit path if none of the subsystems need to
174 * be called. 174 * be called.
175 */ 175 */
176 static int need_forkexit_callback __read_mostly; 176 static int need_forkexit_callback __read_mostly;
177 177
178 static struct cftype cgroup_base_files[]; 178 static struct cftype cgroup_base_files[];
179 179
180 static void cgroup_put(struct cgroup *cgrp); 180 static void cgroup_put(struct cgroup *cgrp);
181 static int rebind_subsystems(struct cgroup_root *dst_root, 181 static int rebind_subsystems(struct cgroup_root *dst_root,
182 unsigned long ss_mask); 182 unsigned long ss_mask);
183 static void cgroup_destroy_css_killed(struct cgroup *cgrp); 183 static void cgroup_destroy_css_killed(struct cgroup *cgrp);
184 static int cgroup_destroy_locked(struct cgroup *cgrp); 184 static int cgroup_destroy_locked(struct cgroup *cgrp);
185 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 185 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
186 bool is_add); 186 bool is_add);
187 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); 187 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
188 188
189 /** 189 /**
190 * cgroup_css - obtain a cgroup's css for the specified subsystem 190 * cgroup_css - obtain a cgroup's css for the specified subsystem
191 * @cgrp: the cgroup of interest 191 * @cgrp: the cgroup of interest
192 * @ss: the subsystem of interest (%NULL returns the dummy_css) 192 * @ss: the subsystem of interest (%NULL returns the dummy_css)
193 * 193 *
194 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This 194 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
195 * function must be called either under cgroup_mutex or rcu_read_lock() and 195 * function must be called either under cgroup_mutex or rcu_read_lock() and
196 * the caller is responsible for pinning the returned css if it wants to 196 * the caller is responsible for pinning the returned css if it wants to
197 * keep accessing it outside the said locks. This function may return 197 * keep accessing it outside the said locks. This function may return
198 * %NULL if @cgrp doesn't have @subsys_id enabled. 198 * %NULL if @cgrp doesn't have @subsys_id enabled.
199 */ 199 */
200 static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, 200 static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
201 struct cgroup_subsys *ss) 201 struct cgroup_subsys *ss)
202 { 202 {
203 if (ss) 203 if (ss)
204 return rcu_dereference_check(cgrp->subsys[ss->id], 204 return rcu_dereference_check(cgrp->subsys[ss->id],
205 lockdep_is_held(&cgroup_tree_mutex) || 205 lockdep_is_held(&cgroup_tree_mutex) ||
206 lockdep_is_held(&cgroup_mutex)); 206 lockdep_is_held(&cgroup_mutex));
207 else 207 else
208 return &cgrp->dummy_css; 208 return &cgrp->dummy_css;
209 } 209 }
210 210
211 /**
212 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
213 * @cgrp: the cgroup of interest
214 * @ss: the subsystem of interest (%NULL returns the dummy_css)
215 *
216 * Similar to cgroup_css() but returns the effective css, which is defined
217 * as the matching css of the nearest ancestor including self which has @ss
218 * enabled. If @ss is associated with the hierarchy @cgrp is on, this
219 * function is guaranteed to return non-NULL css.
220 */
221 static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
222 struct cgroup_subsys *ss)
223 {
224 lockdep_assert_held(&cgroup_mutex);
225
226 if (!ss)
227 return &cgrp->dummy_css;
228
229 if (!(cgrp->root->subsys_mask & (1 << ss->id)))
230 return NULL;
231
232 while (cgrp->parent &&
233 !(cgrp->parent->child_subsys_mask & (1 << ss->id)))
234 cgrp = cgrp->parent;
235
236 return cgroup_css(cgrp, ss);
237 }
238
211 /* convenient tests for these bits */ 239 /* convenient tests for these bits */
212 static inline bool cgroup_is_dead(const struct cgroup *cgrp) 240 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
213 { 241 {
214 return test_bit(CGRP_DEAD, &cgrp->flags); 242 return test_bit(CGRP_DEAD, &cgrp->flags);
215 } 243 }
216 244
217 struct cgroup_subsys_state *seq_css(struct seq_file *seq) 245 struct cgroup_subsys_state *seq_css(struct seq_file *seq)
218 { 246 {
219 struct kernfs_open_file *of = seq->private; 247 struct kernfs_open_file *of = seq->private;
220 struct cgroup *cgrp = of->kn->parent->priv; 248 struct cgroup *cgrp = of->kn->parent->priv;
221 struct cftype *cft = seq_cft(seq); 249 struct cftype *cft = seq_cft(seq);
222 250
223 /* 251 /*
224 * This is open and unprotected implementation of cgroup_css(). 252 * This is open and unprotected implementation of cgroup_css().
225 * seq_css() is only called from a kernfs file operation which has 253 * seq_css() is only called from a kernfs file operation which has
226 * an active reference on the file. Because all the subsystem 254 * an active reference on the file. Because all the subsystem
227 * files are drained before a css is disassociated with a cgroup, 255 * files are drained before a css is disassociated with a cgroup,
228 * the matching css from the cgroup's subsys table is guaranteed to 256 * the matching css from the cgroup's subsys table is guaranteed to
229 * be and stay valid until the enclosing operation is complete. 257 * be and stay valid until the enclosing operation is complete.
230 */ 258 */
231 if (cft->ss) 259 if (cft->ss)
232 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); 260 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
233 else 261 else
234 return &cgrp->dummy_css; 262 return &cgrp->dummy_css;
235 } 263 }
236 EXPORT_SYMBOL_GPL(seq_css); 264 EXPORT_SYMBOL_GPL(seq_css);
237 265
238 /** 266 /**
239 * cgroup_is_descendant - test ancestry 267 * cgroup_is_descendant - test ancestry
240 * @cgrp: the cgroup to be tested 268 * @cgrp: the cgroup to be tested
241 * @ancestor: possible ancestor of @cgrp 269 * @ancestor: possible ancestor of @cgrp
242 * 270 *
243 * Test whether @cgrp is a descendant of @ancestor. It also returns %true 271 * Test whether @cgrp is a descendant of @ancestor. It also returns %true
244 * if @cgrp == @ancestor. This function is safe to call as long as @cgrp 272 * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
245 * and @ancestor are accessible. 273 * and @ancestor are accessible.
246 */ 274 */
247 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) 275 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
248 { 276 {
249 while (cgrp) { 277 while (cgrp) {
250 if (cgrp == ancestor) 278 if (cgrp == ancestor)
251 return true; 279 return true;
252 cgrp = cgrp->parent; 280 cgrp = cgrp->parent;
253 } 281 }
254 return false; 282 return false;
255 } 283 }
256 284
257 static int cgroup_is_releasable(const struct cgroup *cgrp) 285 static int cgroup_is_releasable(const struct cgroup *cgrp)
258 { 286 {
259 const int bits = 287 const int bits =
260 (1 << CGRP_RELEASABLE) | 288 (1 << CGRP_RELEASABLE) |
261 (1 << CGRP_NOTIFY_ON_RELEASE); 289 (1 << CGRP_NOTIFY_ON_RELEASE);
262 return (cgrp->flags & bits) == bits; 290 return (cgrp->flags & bits) == bits;
263 } 291 }
264 292
265 static int notify_on_release(const struct cgroup *cgrp) 293 static int notify_on_release(const struct cgroup *cgrp)
266 { 294 {
267 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 295 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
268 } 296 }
269 297
270 /** 298 /**
271 * for_each_css - iterate all css's of a cgroup 299 * for_each_css - iterate all css's of a cgroup
272 * @css: the iteration cursor 300 * @css: the iteration cursor
273 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end 301 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
274 * @cgrp: the target cgroup to iterate css's of 302 * @cgrp: the target cgroup to iterate css's of
275 * 303 *
276 * Should be called under cgroup_mutex. 304 * Should be called under cgroup_[tree_]mutex.
277 */ 305 */
278 #define for_each_css(css, ssid, cgrp) \ 306 #define for_each_css(css, ssid, cgrp) \
279 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 307 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
280 if (!((css) = rcu_dereference_check( \ 308 if (!((css) = rcu_dereference_check( \
281 (cgrp)->subsys[(ssid)], \ 309 (cgrp)->subsys[(ssid)], \
282 lockdep_is_held(&cgroup_tree_mutex) || \ 310 lockdep_is_held(&cgroup_tree_mutex) || \
283 lockdep_is_held(&cgroup_mutex)))) { } \ 311 lockdep_is_held(&cgroup_mutex)))) { } \
284 else 312 else
285 313
286 /** 314 /**
315 * for_each_e_css - iterate all effective css's of a cgroup
316 * @css: the iteration cursor
317 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
318 * @cgrp: the target cgroup to iterate css's of
319 *
320 * Should be called under cgroup_[tree_]mutex.
321 */
322 #define for_each_e_css(css, ssid, cgrp) \
323 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
324 if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
325 ; \
326 else
327
328 /**
287 * for_each_subsys - iterate all enabled cgroup subsystems 329 * for_each_subsys - iterate all enabled cgroup subsystems
288 * @ss: the iteration cursor 330 * @ss: the iteration cursor
289 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 331 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
290 */ 332 */
291 #define for_each_subsys(ss, ssid) \ 333 #define for_each_subsys(ss, ssid) \
292 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ 334 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
293 (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) 335 (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
294 336
295 /* iterate across the hierarchies */ 337 /* iterate across the hierarchies */
296 #define for_each_root(root) \ 338 #define for_each_root(root) \
297 list_for_each_entry((root), &cgroup_roots, root_list) 339 list_for_each_entry((root), &cgroup_roots, root_list)
298 340
299 /** 341 /**
300 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 342 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
301 * @cgrp: the cgroup to be checked for liveness 343 * @cgrp: the cgroup to be checked for liveness
302 * 344 *
303 * On success, returns true; the mutex should be later unlocked. On 345 * On success, returns true; the mutex should be later unlocked. On
304 * failure returns false with no lock held. 346 * failure returns false with no lock held.
305 */ 347 */
306 static bool cgroup_lock_live_group(struct cgroup *cgrp) 348 static bool cgroup_lock_live_group(struct cgroup *cgrp)
307 { 349 {
308 mutex_lock(&cgroup_mutex); 350 mutex_lock(&cgroup_mutex);
309 if (cgroup_is_dead(cgrp)) { 351 if (cgroup_is_dead(cgrp)) {
310 mutex_unlock(&cgroup_mutex); 352 mutex_unlock(&cgroup_mutex);
311 return false; 353 return false;
312 } 354 }
313 return true; 355 return true;
314 } 356 }
315 357
316 /* the list of cgroups eligible for automatic release. Protected by 358 /* the list of cgroups eligible for automatic release. Protected by
317 * release_list_lock */ 359 * release_list_lock */
318 static LIST_HEAD(release_list); 360 static LIST_HEAD(release_list);
319 static DEFINE_RAW_SPINLOCK(release_list_lock); 361 static DEFINE_RAW_SPINLOCK(release_list_lock);
320 static void cgroup_release_agent(struct work_struct *work); 362 static void cgroup_release_agent(struct work_struct *work);
321 static DECLARE_WORK(release_agent_work, cgroup_release_agent); 363 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
322 static void check_for_release(struct cgroup *cgrp); 364 static void check_for_release(struct cgroup *cgrp);
323 365
324 /* 366 /*
325 * A cgroup can be associated with multiple css_sets as different tasks may 367 * A cgroup can be associated with multiple css_sets as different tasks may
326 * belong to different cgroups on different hierarchies. In the other 368 * belong to different cgroups on different hierarchies. In the other
327 * direction, a css_set is naturally associated with multiple cgroups. 369 * direction, a css_set is naturally associated with multiple cgroups.
328 * This M:N relationship is represented by the following link structure 370 * This M:N relationship is represented by the following link structure
329 * which exists for each association and allows traversing the associations 371 * which exists for each association and allows traversing the associations
330 * from both sides. 372 * from both sides.
331 */ 373 */
332 struct cgrp_cset_link { 374 struct cgrp_cset_link {
333 /* the cgroup and css_set this link associates */ 375 /* the cgroup and css_set this link associates */
334 struct cgroup *cgrp; 376 struct cgroup *cgrp;
335 struct css_set *cset; 377 struct css_set *cset;
336 378
337 /* list of cgrp_cset_links anchored at cgrp->cset_links */ 379 /* list of cgrp_cset_links anchored at cgrp->cset_links */
338 struct list_head cset_link; 380 struct list_head cset_link;
339 381
340 /* list of cgrp_cset_links anchored at css_set->cgrp_links */ 382 /* list of cgrp_cset_links anchored at css_set->cgrp_links */
341 struct list_head cgrp_link; 383 struct list_head cgrp_link;
342 }; 384 };
343 385
344 /* 386 /*
345 * The default css_set - used by init and its children prior to any 387 * The default css_set - used by init and its children prior to any
346 * hierarchies being mounted. It contains a pointer to the root state 388 * hierarchies being mounted. It contains a pointer to the root state
347 * for each subsystem. Also used to anchor the list of css_sets. Not 389 * for each subsystem. Also used to anchor the list of css_sets. Not
348 * reference-counted, to improve performance when child cgroups 390 * reference-counted, to improve performance when child cgroups
349 * haven't been created. 391 * haven't been created.
350 */ 392 */
351 static struct css_set init_css_set = { 393 static struct css_set init_css_set = {
352 .refcount = ATOMIC_INIT(1), 394 .refcount = ATOMIC_INIT(1),
353 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), 395 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
354 .tasks = LIST_HEAD_INIT(init_css_set.tasks), 396 .tasks = LIST_HEAD_INIT(init_css_set.tasks),
355 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), 397 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
356 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), 398 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
357 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), 399 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
358 }; 400 };
359 401
360 static int css_set_count = 1; /* 1 for init_css_set */ 402 static int css_set_count = 1; /* 1 for init_css_set */
361 403
362 /* 404 /*
363 * hash table for cgroup groups. This improves the performance to find 405 * hash table for cgroup groups. This improves the performance to find
364 * an existing css_set. This hash doesn't (currently) take into 406 * an existing css_set. This hash doesn't (currently) take into
365 * account cgroups in empty hierarchies. 407 * account cgroups in empty hierarchies.
366 */ 408 */
367 #define CSS_SET_HASH_BITS 7 409 #define CSS_SET_HASH_BITS 7
368 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); 410 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
369 411
370 static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) 412 static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
371 { 413 {
372 unsigned long key = 0UL; 414 unsigned long key = 0UL;
373 struct cgroup_subsys *ss; 415 struct cgroup_subsys *ss;
374 int i; 416 int i;
375 417
376 for_each_subsys(ss, i) 418 for_each_subsys(ss, i)
377 key += (unsigned long)css[i]; 419 key += (unsigned long)css[i];
378 key = (key >> 16) ^ key; 420 key = (key >> 16) ^ key;
379 421
380 return key; 422 return key;
381 } 423 }
382 424
383 static void put_css_set_locked(struct css_set *cset, bool taskexit) 425 static void put_css_set_locked(struct css_set *cset, bool taskexit)
384 { 426 {
385 struct cgrp_cset_link *link, *tmp_link; 427 struct cgrp_cset_link *link, *tmp_link;
386 428
387 lockdep_assert_held(&css_set_rwsem); 429 lockdep_assert_held(&css_set_rwsem);
388 430
389 if (!atomic_dec_and_test(&cset->refcount)) 431 if (!atomic_dec_and_test(&cset->refcount))
390 return; 432 return;
391 433
392 /* This css_set is dead. unlink it and release cgroup refcounts */ 434 /* This css_set is dead. unlink it and release cgroup refcounts */
393 hash_del(&cset->hlist); 435 hash_del(&cset->hlist);
394 css_set_count--; 436 css_set_count--;
395 437
396 list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { 438 list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
397 struct cgroup *cgrp = link->cgrp; 439 struct cgroup *cgrp = link->cgrp;
398 440
399 list_del(&link->cset_link); 441 list_del(&link->cset_link);
400 list_del(&link->cgrp_link); 442 list_del(&link->cgrp_link);
401 443
402 /* @cgrp can't go away while we're holding css_set_rwsem */ 444 /* @cgrp can't go away while we're holding css_set_rwsem */
403 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { 445 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
404 if (taskexit) 446 if (taskexit)
405 set_bit(CGRP_RELEASABLE, &cgrp->flags); 447 set_bit(CGRP_RELEASABLE, &cgrp->flags);
406 check_for_release(cgrp); 448 check_for_release(cgrp);
407 } 449 }
408 450
409 kfree(link); 451 kfree(link);
410 } 452 }
411 453
412 kfree_rcu(cset, rcu_head); 454 kfree_rcu(cset, rcu_head);
413 } 455 }
414 456
415 static void put_css_set(struct css_set *cset, bool taskexit) 457 static void put_css_set(struct css_set *cset, bool taskexit)
416 { 458 {
417 /* 459 /*
418 * Ensure that the refcount doesn't hit zero while any readers 460 * Ensure that the refcount doesn't hit zero while any readers
419 * can see it. Similar to atomic_dec_and_lock(), but for an 461 * can see it. Similar to atomic_dec_and_lock(), but for an
420 * rwlock 462 * rwlock
421 */ 463 */
422 if (atomic_add_unless(&cset->refcount, -1, 1)) 464 if (atomic_add_unless(&cset->refcount, -1, 1))
423 return; 465 return;
424 466
425 down_write(&css_set_rwsem); 467 down_write(&css_set_rwsem);
426 put_css_set_locked(cset, taskexit); 468 put_css_set_locked(cset, taskexit);
427 up_write(&css_set_rwsem); 469 up_write(&css_set_rwsem);
428 } 470 }
429 471
430 /* 472 /*
431 * refcounted get/put for css_set objects 473 * refcounted get/put for css_set objects
432 */ 474 */
433 static inline void get_css_set(struct css_set *cset) 475 static inline void get_css_set(struct css_set *cset)
434 { 476 {
435 atomic_inc(&cset->refcount); 477 atomic_inc(&cset->refcount);
436 } 478 }
437 479
438 /** 480 /**
439 * compare_css_sets - helper function for find_existing_css_set(). 481 * compare_css_sets - helper function for find_existing_css_set().
440 * @cset: candidate css_set being tested 482 * @cset: candidate css_set being tested
441 * @old_cset: existing css_set for a task 483 * @old_cset: existing css_set for a task
442 * @new_cgrp: cgroup that's being entered by the task 484 * @new_cgrp: cgroup that's being entered by the task
443 * @template: desired set of css pointers in css_set (pre-calculated) 485 * @template: desired set of css pointers in css_set (pre-calculated)
444 * 486 *
445 * Returns true if "cset" matches "old_cset" except for the hierarchy 487 * Returns true if "cset" matches "old_cset" except for the hierarchy
446 * which "new_cgrp" belongs to, for which it should match "new_cgrp". 488 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
447 */ 489 */
448 static bool compare_css_sets(struct css_set *cset, 490 static bool compare_css_sets(struct css_set *cset,
449 struct css_set *old_cset, 491 struct css_set *old_cset,
450 struct cgroup *new_cgrp, 492 struct cgroup *new_cgrp,
451 struct cgroup_subsys_state *template[]) 493 struct cgroup_subsys_state *template[])
452 { 494 {
453 struct list_head *l1, *l2; 495 struct list_head *l1, *l2;
454 496
455 if (memcmp(template, cset->subsys, sizeof(cset->subsys))) { 497 /*
456 /* Not all subsystems matched */ 498 * On the default hierarchy, there can be csets which are
499 * associated with the same set of cgroups but different csses.
500 * Let's first ensure that csses match.
501 */
502 if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
457 return false; 503 return false;
458 }
459 504
460 /* 505 /*
461 * Compare cgroup pointers in order to distinguish between 506 * Compare cgroup pointers in order to distinguish between
462 * different cgroups in heirarchies with no subsystems. We 507 * different cgroups in hierarchies. As different cgroups may
463 * could get by with just this check alone (and skip the 508 * share the same effective css, this comparison is always
464 * memcmp above) but on most setups the memcmp check will 509 * necessary.
465 * avoid the need for this more expensive check on almost all
466 * candidates.
467 */ 510 */
468
469 l1 = &cset->cgrp_links; 511 l1 = &cset->cgrp_links;
470 l2 = &old_cset->cgrp_links; 512 l2 = &old_cset->cgrp_links;
471 while (1) { 513 while (1) {
472 struct cgrp_cset_link *link1, *link2; 514 struct cgrp_cset_link *link1, *link2;
473 struct cgroup *cgrp1, *cgrp2; 515 struct cgroup *cgrp1, *cgrp2;
474 516
475 l1 = l1->next; 517 l1 = l1->next;
476 l2 = l2->next; 518 l2 = l2->next;
477 /* See if we reached the end - both lists are equal length. */ 519 /* See if we reached the end - both lists are equal length. */
478 if (l1 == &cset->cgrp_links) { 520 if (l1 == &cset->cgrp_links) {
479 BUG_ON(l2 != &old_cset->cgrp_links); 521 BUG_ON(l2 != &old_cset->cgrp_links);
480 break; 522 break;
481 } else { 523 } else {
482 BUG_ON(l2 == &old_cset->cgrp_links); 524 BUG_ON(l2 == &old_cset->cgrp_links);
483 } 525 }
484 /* Locate the cgroups associated with these links. */ 526 /* Locate the cgroups associated with these links. */
485 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link); 527 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
486 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link); 528 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
487 cgrp1 = link1->cgrp; 529 cgrp1 = link1->cgrp;
488 cgrp2 = link2->cgrp; 530 cgrp2 = link2->cgrp;
489 /* Hierarchies should be linked in the same order. */ 531 /* Hierarchies should be linked in the same order. */
490 BUG_ON(cgrp1->root != cgrp2->root); 532 BUG_ON(cgrp1->root != cgrp2->root);
491 533
492 /* 534 /*
493 * If this hierarchy is the hierarchy of the cgroup 535 * If this hierarchy is the hierarchy of the cgroup
494 * that's changing, then we need to check that this 536 * that's changing, then we need to check that this
495 * css_set points to the new cgroup; if it's any other 537 * css_set points to the new cgroup; if it's any other
496 * hierarchy, then this css_set should point to the 538 * hierarchy, then this css_set should point to the
497 * same cgroup as the old css_set. 539 * same cgroup as the old css_set.
498 */ 540 */
499 if (cgrp1->root == new_cgrp->root) { 541 if (cgrp1->root == new_cgrp->root) {
500 if (cgrp1 != new_cgrp) 542 if (cgrp1 != new_cgrp)
501 return false; 543 return false;
502 } else { 544 } else {
503 if (cgrp1 != cgrp2) 545 if (cgrp1 != cgrp2)
504 return false; 546 return false;
505 } 547 }
506 } 548 }
507 return true; 549 return true;
508 } 550 }
509 551
510 /** 552 /**
511 * find_existing_css_set - init css array and find the matching css_set 553 * find_existing_css_set - init css array and find the matching css_set
512 * @old_cset: the css_set that we're using before the cgroup transition 554 * @old_cset: the css_set that we're using before the cgroup transition
513 * @cgrp: the cgroup that we're moving into 555 * @cgrp: the cgroup that we're moving into
514 * @template: out param for the new set of csses, should be clear on entry 556 * @template: out param for the new set of csses, should be clear on entry
515 */ 557 */
516 static struct css_set *find_existing_css_set(struct css_set *old_cset, 558 static struct css_set *find_existing_css_set(struct css_set *old_cset,
517 struct cgroup *cgrp, 559 struct cgroup *cgrp,
518 struct cgroup_subsys_state *template[]) 560 struct cgroup_subsys_state *template[])
519 { 561 {
520 struct cgroup_root *root = cgrp->root; 562 struct cgroup_root *root = cgrp->root;
521 struct cgroup_subsys *ss; 563 struct cgroup_subsys *ss;
522 struct css_set *cset; 564 struct css_set *cset;
523 unsigned long key; 565 unsigned long key;
524 int i; 566 int i;
525 567
526 /* 568 /*
527 * Build the set of subsystem state objects that we want to see in the 569 * Build the set of subsystem state objects that we want to see in the
528 * new css_set. while subsystems can change globally, the entries here 570 * new css_set. while subsystems can change globally, the entries here
529 * won't change, so no need for locking. 571 * won't change, so no need for locking.
530 */ 572 */
531 for_each_subsys(ss, i) { 573 for_each_subsys(ss, i) {
532 if (root->subsys_mask & (1UL << i)) { 574 if (root->subsys_mask & (1UL << i)) {
533 /* Subsystem is in this hierarchy. So we want 575 /*
534 * the subsystem state from the new 576 * @ss is in this hierarchy, so we want the
535 * cgroup */ 577 * effective css from @cgrp.
536 template[i] = cgroup_css(cgrp, ss); 578 */
579 template[i] = cgroup_e_css(cgrp, ss);
537 } else { 580 } else {
538 /* Subsystem is not in this hierarchy, so we 581 /*
539 * don't want to change the subsystem state */ 582 * @ss is not in this hierarchy, so we don't want
583 * to change the css.
584 */
540 template[i] = old_cset->subsys[i]; 585 template[i] = old_cset->subsys[i];
541 } 586 }
542 } 587 }
543 588
544 key = css_set_hash(template); 589 key = css_set_hash(template);
545 hash_for_each_possible(css_set_table, cset, hlist, key) { 590 hash_for_each_possible(css_set_table, cset, hlist, key) {
546 if (!compare_css_sets(cset, old_cset, cgrp, template)) 591 if (!compare_css_sets(cset, old_cset, cgrp, template))
547 continue; 592 continue;
548 593
549 /* This css_set matches what we need */ 594 /* This css_set matches what we need */
550 return cset; 595 return cset;
551 } 596 }
552 597
553 /* No existing cgroup group matched */ 598 /* No existing cgroup group matched */
554 return NULL; 599 return NULL;
555 } 600 }
556 601
557 static void free_cgrp_cset_links(struct list_head *links_to_free) 602 static void free_cgrp_cset_links(struct list_head *links_to_free)
558 { 603 {
559 struct cgrp_cset_link *link, *tmp_link; 604 struct cgrp_cset_link *link, *tmp_link;
560 605
561 list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) { 606 list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
562 list_del(&link->cset_link); 607 list_del(&link->cset_link);
563 kfree(link); 608 kfree(link);
564 } 609 }
565 } 610 }
566 611
567 /** 612 /**
568 * allocate_cgrp_cset_links - allocate cgrp_cset_links 613 * allocate_cgrp_cset_links - allocate cgrp_cset_links
569 * @count: the number of links to allocate 614 * @count: the number of links to allocate
570 * @tmp_links: list_head the allocated links are put on 615 * @tmp_links: list_head the allocated links are put on
571 * 616 *
572 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links 617 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
573 * through ->cset_link. Returns 0 on success or -errno. 618 * through ->cset_link. Returns 0 on success or -errno.
574 */ 619 */
575 static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links) 620 static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
576 { 621 {
577 struct cgrp_cset_link *link; 622 struct cgrp_cset_link *link;
578 int i; 623 int i;
579 624
580 INIT_LIST_HEAD(tmp_links); 625 INIT_LIST_HEAD(tmp_links);
581 626
582 for (i = 0; i < count; i++) { 627 for (i = 0; i < count; i++) {
583 link = kzalloc(sizeof(*link), GFP_KERNEL); 628 link = kzalloc(sizeof(*link), GFP_KERNEL);
584 if (!link) { 629 if (!link) {
585 free_cgrp_cset_links(tmp_links); 630 free_cgrp_cset_links(tmp_links);
586 return -ENOMEM; 631 return -ENOMEM;
587 } 632 }
588 list_add(&link->cset_link, tmp_links); 633 list_add(&link->cset_link, tmp_links);
589 } 634 }
590 return 0; 635 return 0;
591 } 636 }
592 637
593 /** 638 /**
594 * link_css_set - a helper function to link a css_set to a cgroup 639 * link_css_set - a helper function to link a css_set to a cgroup
595 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links() 640 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
596 * @cset: the css_set to be linked 641 * @cset: the css_set to be linked
597 * @cgrp: the destination cgroup 642 * @cgrp: the destination cgroup
598 */ 643 */
599 static void link_css_set(struct list_head *tmp_links, struct css_set *cset, 644 static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
600 struct cgroup *cgrp) 645 struct cgroup *cgrp)
601 { 646 {
602 struct cgrp_cset_link *link; 647 struct cgrp_cset_link *link;
603 648
604 BUG_ON(list_empty(tmp_links)); 649 BUG_ON(list_empty(tmp_links));
605 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); 650 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
606 link->cset = cset; 651 link->cset = cset;
607 link->cgrp = cgrp; 652 link->cgrp = cgrp;
608 list_move(&link->cset_link, &cgrp->cset_links); 653 list_move(&link->cset_link, &cgrp->cset_links);
609 /* 654 /*
610 * Always add links to the tail of the list so that the list 655 * Always add links to the tail of the list so that the list
611 * is sorted by order of hierarchy creation 656 * is sorted by order of hierarchy creation
612 */ 657 */
613 list_add_tail(&link->cgrp_link, &cset->cgrp_links); 658 list_add_tail(&link->cgrp_link, &cset->cgrp_links);
614 } 659 }
615 660
616 /** 661 /**
617 * find_css_set - return a new css_set with one cgroup updated 662 * find_css_set - return a new css_set with one cgroup updated
618 * @old_cset: the baseline css_set 663 * @old_cset: the baseline css_set
619 * @cgrp: the cgroup to be updated 664 * @cgrp: the cgroup to be updated
620 * 665 *
621 * Return a new css_set that's equivalent to @old_cset, but with @cgrp 666 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
622 * substituted into the appropriate hierarchy. 667 * substituted into the appropriate hierarchy.
623 */ 668 */
624 static struct css_set *find_css_set(struct css_set *old_cset, 669 static struct css_set *find_css_set(struct css_set *old_cset,
625 struct cgroup *cgrp) 670 struct cgroup *cgrp)
626 { 671 {
627 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { }; 672 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
628 struct css_set *cset; 673 struct css_set *cset;
629 struct list_head tmp_links; 674 struct list_head tmp_links;
630 struct cgrp_cset_link *link; 675 struct cgrp_cset_link *link;
631 unsigned long key; 676 unsigned long key;
632 677
633 lockdep_assert_held(&cgroup_mutex); 678 lockdep_assert_held(&cgroup_mutex);
634 679
635 /* First see if we already have a cgroup group that matches 680 /* First see if we already have a cgroup group that matches
636 * the desired set */ 681 * the desired set */
637 down_read(&css_set_rwsem); 682 down_read(&css_set_rwsem);
638 cset = find_existing_css_set(old_cset, cgrp, template); 683 cset = find_existing_css_set(old_cset, cgrp, template);
639 if (cset) 684 if (cset)
640 get_css_set(cset); 685 get_css_set(cset);
641 up_read(&css_set_rwsem); 686 up_read(&css_set_rwsem);
642 687
643 if (cset) 688 if (cset)
644 return cset; 689 return cset;
645 690
646 cset = kzalloc(sizeof(*cset), GFP_KERNEL); 691 cset = kzalloc(sizeof(*cset), GFP_KERNEL);
647 if (!cset) 692 if (!cset)
648 return NULL; 693 return NULL;
649 694
650 /* Allocate all the cgrp_cset_link objects that we'll need */ 695 /* Allocate all the cgrp_cset_link objects that we'll need */
651 if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) { 696 if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
652 kfree(cset); 697 kfree(cset);
653 return NULL; 698 return NULL;
654 } 699 }
655 700
656 atomic_set(&cset->refcount, 1); 701 atomic_set(&cset->refcount, 1);
657 INIT_LIST_HEAD(&cset->cgrp_links); 702 INIT_LIST_HEAD(&cset->cgrp_links);
658 INIT_LIST_HEAD(&cset->tasks); 703 INIT_LIST_HEAD(&cset->tasks);
659 INIT_LIST_HEAD(&cset->mg_tasks); 704 INIT_LIST_HEAD(&cset->mg_tasks);
660 INIT_LIST_HEAD(&cset->mg_preload_node); 705 INIT_LIST_HEAD(&cset->mg_preload_node);
661 INIT_LIST_HEAD(&cset->mg_node); 706 INIT_LIST_HEAD(&cset->mg_node);
662 INIT_HLIST_NODE(&cset->hlist); 707 INIT_HLIST_NODE(&cset->hlist);
663 708
664 /* Copy the set of subsystem state objects generated in 709 /* Copy the set of subsystem state objects generated in
665 * find_existing_css_set() */ 710 * find_existing_css_set() */
666 memcpy(cset->subsys, template, sizeof(cset->subsys)); 711 memcpy(cset->subsys, template, sizeof(cset->subsys));
667 712
668 down_write(&css_set_rwsem); 713 down_write(&css_set_rwsem);
669 /* Add reference counts and links from the new css_set. */ 714 /* Add reference counts and links from the new css_set. */
670 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { 715 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
671 struct cgroup *c = link->cgrp; 716 struct cgroup *c = link->cgrp;
672 717
673 if (c->root == cgrp->root) 718 if (c->root == cgrp->root)
674 c = cgrp; 719 c = cgrp;
675 link_css_set(&tmp_links, cset, c); 720 link_css_set(&tmp_links, cset, c);
676 } 721 }
677 722
678 BUG_ON(!list_empty(&tmp_links)); 723 BUG_ON(!list_empty(&tmp_links));
679 724
680 css_set_count++; 725 css_set_count++;
681 726
682 /* Add this cgroup group to the hash table */ 727 /* Add this cgroup group to the hash table */
683 key = css_set_hash(cset->subsys); 728 key = css_set_hash(cset->subsys);
684 hash_add(css_set_table, &cset->hlist, key); 729 hash_add(css_set_table, &cset->hlist, key);
685 730
686 up_write(&css_set_rwsem); 731 up_write(&css_set_rwsem);
687 732
688 return cset; 733 return cset;
689 } 734 }
690 735
691 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) 736 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
692 { 737 {
693 struct cgroup *root_cgrp = kf_root->kn->priv; 738 struct cgroup *root_cgrp = kf_root->kn->priv;
694 739
695 return root_cgrp->root; 740 return root_cgrp->root;
696 } 741 }
697 742
698 static int cgroup_init_root_id(struct cgroup_root *root) 743 static int cgroup_init_root_id(struct cgroup_root *root)
699 { 744 {
700 int id; 745 int id;
701 746
702 lockdep_assert_held(&cgroup_mutex); 747 lockdep_assert_held(&cgroup_mutex);
703 748
704 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL); 749 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
705 if (id < 0) 750 if (id < 0)
706 return id; 751 return id;
707 752
708 root->hierarchy_id = id; 753 root->hierarchy_id = id;
709 return 0; 754 return 0;
710 } 755 }
711 756
712 static void cgroup_exit_root_id(struct cgroup_root *root) 757 static void cgroup_exit_root_id(struct cgroup_root *root)
713 { 758 {
714 lockdep_assert_held(&cgroup_mutex); 759 lockdep_assert_held(&cgroup_mutex);
715 760
716 if (root->hierarchy_id) { 761 if (root->hierarchy_id) {
717 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); 762 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
718 root->hierarchy_id = 0; 763 root->hierarchy_id = 0;
719 } 764 }
720 } 765 }
721 766
722 static void cgroup_free_root(struct cgroup_root *root) 767 static void cgroup_free_root(struct cgroup_root *root)
723 { 768 {
724 if (root) { 769 if (root) {
725 /* hierarchy ID should already have been released */ 770 /* hierarchy ID should already have been released */
726 WARN_ON_ONCE(root->hierarchy_id); 771 WARN_ON_ONCE(root->hierarchy_id);
727 772
728 idr_destroy(&root->cgroup_idr); 773 idr_destroy(&root->cgroup_idr);
729 kfree(root); 774 kfree(root);
730 } 775 }
731 } 776 }
732 777
733 static void cgroup_destroy_root(struct cgroup_root *root) 778 static void cgroup_destroy_root(struct cgroup_root *root)
734 { 779 {
735 struct cgroup *cgrp = &root->cgrp; 780 struct cgroup *cgrp = &root->cgrp;
736 struct cgrp_cset_link *link, *tmp_link; 781 struct cgrp_cset_link *link, *tmp_link;
737 782
738 mutex_lock(&cgroup_tree_mutex); 783 mutex_lock(&cgroup_tree_mutex);
739 mutex_lock(&cgroup_mutex); 784 mutex_lock(&cgroup_mutex);
740 785
741 BUG_ON(atomic_read(&root->nr_cgrps)); 786 BUG_ON(atomic_read(&root->nr_cgrps));
742 BUG_ON(!list_empty(&cgrp->children)); 787 BUG_ON(!list_empty(&cgrp->children));
743 788
744 /* Rebind all subsystems back to the default hierarchy */ 789 /* Rebind all subsystems back to the default hierarchy */
745 rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); 790 rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
746 791
747 /* 792 /*
748 * Release all the links from cset_links to this hierarchy's 793 * Release all the links from cset_links to this hierarchy's
749 * root cgroup 794 * root cgroup
750 */ 795 */
751 down_write(&css_set_rwsem); 796 down_write(&css_set_rwsem);
752 797
753 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { 798 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
754 list_del(&link->cset_link); 799 list_del(&link->cset_link);
755 list_del(&link->cgrp_link); 800 list_del(&link->cgrp_link);
756 kfree(link); 801 kfree(link);
757 } 802 }
758 up_write(&css_set_rwsem); 803 up_write(&css_set_rwsem);
759 804
760 if (!list_empty(&root->root_list)) { 805 if (!list_empty(&root->root_list)) {
761 list_del(&root->root_list); 806 list_del(&root->root_list);
762 cgroup_root_count--; 807 cgroup_root_count--;
763 } 808 }
764 809
765 cgroup_exit_root_id(root); 810 cgroup_exit_root_id(root);
766 811
767 mutex_unlock(&cgroup_mutex); 812 mutex_unlock(&cgroup_mutex);
768 mutex_unlock(&cgroup_tree_mutex); 813 mutex_unlock(&cgroup_tree_mutex);
769 814
770 kernfs_destroy_root(root->kf_root); 815 kernfs_destroy_root(root->kf_root);
771 cgroup_free_root(root); 816 cgroup_free_root(root);
772 } 817 }
773 818
774 /* look up cgroup associated with given css_set on the specified hierarchy */ 819 /* look up cgroup associated with given css_set on the specified hierarchy */
775 static struct cgroup *cset_cgroup_from_root(struct css_set *cset, 820 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
776 struct cgroup_root *root) 821 struct cgroup_root *root)
777 { 822 {
778 struct cgroup *res = NULL; 823 struct cgroup *res = NULL;
779 824
780 lockdep_assert_held(&cgroup_mutex); 825 lockdep_assert_held(&cgroup_mutex);
781 lockdep_assert_held(&css_set_rwsem); 826 lockdep_assert_held(&css_set_rwsem);
782 827
783 if (cset == &init_css_set) { 828 if (cset == &init_css_set) {
784 res = &root->cgrp; 829 res = &root->cgrp;
785 } else { 830 } else {
786 struct cgrp_cset_link *link; 831 struct cgrp_cset_link *link;
787 832
788 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { 833 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
789 struct cgroup *c = link->cgrp; 834 struct cgroup *c = link->cgrp;
790 835
791 if (c->root == root) { 836 if (c->root == root) {
792 res = c; 837 res = c;
793 break; 838 break;
794 } 839 }
795 } 840 }
796 } 841 }
797 842
798 BUG_ON(!res); 843 BUG_ON(!res);
799 return res; 844 return res;
800 } 845 }
801 846
802 /* 847 /*
803 * Return the cgroup for "task" from the given hierarchy. Must be 848 * Return the cgroup for "task" from the given hierarchy. Must be
804 * called with cgroup_mutex and css_set_rwsem held. 849 * called with cgroup_mutex and css_set_rwsem held.
805 */ 850 */
806 static struct cgroup *task_cgroup_from_root(struct task_struct *task, 851 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
807 struct cgroup_root *root) 852 struct cgroup_root *root)
808 { 853 {
809 /* 854 /*
810 * No need to lock the task - since we hold cgroup_mutex the 855 * No need to lock the task - since we hold cgroup_mutex the
811 * task can't change groups, so the only thing that can happen 856 * task can't change groups, so the only thing that can happen
812 * is that it exits and its css is set back to init_css_set. 857 * is that it exits and its css is set back to init_css_set.
813 */ 858 */
814 return cset_cgroup_from_root(task_css_set(task), root); 859 return cset_cgroup_from_root(task_css_set(task), root);
815 } 860 }
816 861
817 /* 862 /*
818 * A task must hold cgroup_mutex to modify cgroups. 863 * A task must hold cgroup_mutex to modify cgroups.
819 * 864 *
820 * Any task can increment and decrement the count field without lock. 865 * Any task can increment and decrement the count field without lock.
821 * So in general, code holding cgroup_mutex can't rely on the count 866 * So in general, code holding cgroup_mutex can't rely on the count
822 * field not changing. However, if the count goes to zero, then only 867 * field not changing. However, if the count goes to zero, then only
823 * cgroup_attach_task() can increment it again. Because a count of zero 868 * cgroup_attach_task() can increment it again. Because a count of zero
824 * means that no tasks are currently attached, therefore there is no 869 * means that no tasks are currently attached, therefore there is no
825 * way a task attached to that cgroup can fork (the other way to 870 * way a task attached to that cgroup can fork (the other way to
826 * increment the count). So code holding cgroup_mutex can safely 871 * increment the count). So code holding cgroup_mutex can safely
827 * assume that if the count is zero, it will stay zero. Similarly, if 872 * assume that if the count is zero, it will stay zero. Similarly, if
828 * a task holds cgroup_mutex on a cgroup with zero count, it 873 * a task holds cgroup_mutex on a cgroup with zero count, it
829 * knows that the cgroup won't be removed, as cgroup_rmdir() 874 * knows that the cgroup won't be removed, as cgroup_rmdir()
830 * needs that mutex. 875 * needs that mutex.
831 * 876 *
832 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't 877 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
833 * (usually) take cgroup_mutex. These are the two most performance 878 * (usually) take cgroup_mutex. These are the two most performance
834 * critical pieces of code here. The exception occurs on cgroup_exit(), 879 * critical pieces of code here. The exception occurs on cgroup_exit(),
835 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex 880 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
836 * is taken, and if the cgroup count is zero, a usermode call made 881 * is taken, and if the cgroup count is zero, a usermode call made
837 * to the release agent with the name of the cgroup (path relative to 882 * to the release agent with the name of the cgroup (path relative to
838 * the root of cgroup file system) as the argument. 883 * the root of cgroup file system) as the argument.
839 * 884 *
840 * A cgroup can only be deleted if both its 'count' of using tasks 885 * A cgroup can only be deleted if both its 'count' of using tasks
841 * is zero, and its list of 'children' cgroups is empty. Since all 886 * is zero, and its list of 'children' cgroups is empty. Since all
842 * tasks in the system use _some_ cgroup, and since there is always at 887 * tasks in the system use _some_ cgroup, and since there is always at
843 * least one task in the system (init, pid == 1), therefore, root cgroup 888 * least one task in the system (init, pid == 1), therefore, root cgroup
844 * always has children cgroups and/or using tasks. So we don't 889 * always has children cgroups and/or using tasks. So we don't
845 * need a special hack to ensure that root cgroup cannot be deleted. 890 * need a special hack to ensure that root cgroup cannot be deleted.
846 * 891 *
847 * P.S. One more locking exception. RCU is used to guard the 892 * P.S. One more locking exception. RCU is used to guard the
848 * update of a task's cgroup pointer by cgroup_attach_task() 893 * update of a task's cgroup pointer by cgroup_attach_task()
849 */ 894 */
850 895
851 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); 896 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
852 static struct kernfs_syscall_ops cgroup_kf_syscall_ops; 897 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
853 static const struct file_operations proc_cgroupstats_operations; 898 static const struct file_operations proc_cgroupstats_operations;
854 899
855 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, 900 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
856 char *buf) 901 char *buf)
857 { 902 {
858 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && 903 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
859 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) 904 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
860 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", 905 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
861 cft->ss->name, cft->name); 906 cft->ss->name, cft->name);
862 else 907 else
863 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); 908 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
864 return buf; 909 return buf;
865 } 910 }
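(Illustrative aside, not part of this patch: with the defaults above, a memory controller cftype named "limit_in_bytes" is emitted as "memory.limit_in_bytes", while cgroup core files (cft->ss == NULL) and files on a noprefix mount keep the bare cft->name, e.g. "tasks".)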
866 911
867 /** 912 /**
868 * cgroup_file_mode - deduce file mode of a control file 913 * cgroup_file_mode - deduce file mode of a control file
869 * @cft: the control file in question 914 * @cft: the control file in question
870 * 915 *
871 * returns cft->mode if ->mode is not 0 916 * returns cft->mode if ->mode is not 0
872 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler 917 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
873 * returns S_IRUGO if it has only a read handler 918 * returns S_IRUGO if it has only a read handler
874 * returns S_IWUSR if it has only a write handler 919 * returns S_IWUSR if it has only a write handler
875 */ 920 */
876 static umode_t cgroup_file_mode(const struct cftype *cft) 921 static umode_t cgroup_file_mode(const struct cftype *cft)
877 { 922 {
878 umode_t mode = 0; 923 umode_t mode = 0;
879 924
880 if (cft->mode) 925 if (cft->mode)
881 return cft->mode; 926 return cft->mode;
882 927
883 if (cft->read_u64 || cft->read_s64 || cft->seq_show) 928 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
884 mode |= S_IRUGO; 929 mode |= S_IRUGO;
885 930
886 if (cft->write_u64 || cft->write_s64 || cft->write_string || 931 if (cft->write_u64 || cft->write_s64 || cft->write_string ||
887 cft->trigger) 932 cft->trigger)
888 mode |= S_IWUSR; 933 mode |= S_IWUSR;
889 934
890 return mode; 935 return mode;
891 } 936 }
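(Worked example, not part of this patch: a cftype with ->seq_show and ->write_u64 and no explicit ->mode is deduced as S_IRUGO|S_IWUSR, i.e. 0644; one with only ->read_u64 gets S_IRUGO, i.e. 0444; a write-only trigger gets S_IWUSR, i.e. 0200.)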
892 937
893 static void cgroup_free_fn(struct work_struct *work) 938 static void cgroup_free_fn(struct work_struct *work)
894 { 939 {
895 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 940 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
896 941
897 atomic_dec(&cgrp->root->nr_cgrps); 942 atomic_dec(&cgrp->root->nr_cgrps);
898 cgroup_pidlist_destroy_all(cgrp); 943 cgroup_pidlist_destroy_all(cgrp);
899 944
900 if (cgrp->parent) { 945 if (cgrp->parent) {
901 /* 946 /*
902 * We get a ref to the parent, and put the ref when this 947 * We get a ref to the parent, and put the ref when this
903 * cgroup is being freed, so it's guaranteed that the 948 * cgroup is being freed, so it's guaranteed that the
904 * parent won't be destroyed before its children. 949 * parent won't be destroyed before its children.
905 */ 950 */
906 cgroup_put(cgrp->parent); 951 cgroup_put(cgrp->parent);
907 kernfs_put(cgrp->kn); 952 kernfs_put(cgrp->kn);
908 kfree(cgrp); 953 kfree(cgrp);
909 } else { 954 } else {
910 /* 955 /*
911 * This is root cgroup's refcnt reaching zero, which 956 * This is root cgroup's refcnt reaching zero, which
912 * indicates that the root should be released. 957 * indicates that the root should be released.
913 */ 958 */
914 cgroup_destroy_root(cgrp->root); 959 cgroup_destroy_root(cgrp->root);
915 } 960 }
916 } 961 }
917 962
918 static void cgroup_free_rcu(struct rcu_head *head) 963 static void cgroup_free_rcu(struct rcu_head *head)
919 { 964 {
920 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); 965 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
921 966
922 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); 967 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
923 queue_work(cgroup_destroy_wq, &cgrp->destroy_work); 968 queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
924 } 969 }
925 970
926 static void cgroup_get(struct cgroup *cgrp) 971 static void cgroup_get(struct cgroup *cgrp)
927 { 972 {
928 WARN_ON_ONCE(cgroup_is_dead(cgrp)); 973 WARN_ON_ONCE(cgroup_is_dead(cgrp));
929 WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); 974 WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
930 atomic_inc(&cgrp->refcnt); 975 atomic_inc(&cgrp->refcnt);
931 } 976 }
932 977
933 static void cgroup_put(struct cgroup *cgrp) 978 static void cgroup_put(struct cgroup *cgrp)
934 { 979 {
935 if (!atomic_dec_and_test(&cgrp->refcnt)) 980 if (!atomic_dec_and_test(&cgrp->refcnt))
936 return; 981 return;
937 if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) 982 if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
938 return; 983 return;
939 984
940 /* 985 /*
941 * XXX: cgrp->id is only used to look up css's. As cgroup and 986 * XXX: cgrp->id is only used to look up css's. As cgroup and
942 * css's lifetimes will be decoupled, it should be made 987 * css's lifetimes will be decoupled, it should be made
943 * per-subsystem and moved to css->id so that lookups are 988 * per-subsystem and moved to css->id so that lookups are
944 * successful until the target css is released. 989 * successful until the target css is released.
945 */ 990 */
946 mutex_lock(&cgroup_mutex); 991 mutex_lock(&cgroup_mutex);
947 idr_remove(&cgrp->root->cgroup_idr, cgrp->id); 992 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
948 mutex_unlock(&cgroup_mutex); 993 mutex_unlock(&cgroup_mutex);
949 cgrp->id = -1; 994 cgrp->id = -1;
950 995
951 call_rcu(&cgrp->rcu_head, cgroup_free_rcu); 996 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
952 } 997 }
953 998
954 static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 999 static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
955 { 1000 {
956 char name[CGROUP_FILE_NAME_MAX]; 1001 char name[CGROUP_FILE_NAME_MAX];
957 1002
958 lockdep_assert_held(&cgroup_tree_mutex); 1003 lockdep_assert_held(&cgroup_tree_mutex);
959 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); 1004 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
960 } 1005 }
961 1006
962 /** 1007 /**
963 * cgroup_clear_dir - remove subsys files in a cgroup directory 1008 * cgroup_clear_dir - remove subsys files in a cgroup directory
964 * @cgrp: target cgroup 1009 * @cgrp: target cgroup
965 * @subsys_mask: mask of the subsystem ids whose files should be removed 1010 * @subsys_mask: mask of the subsystem ids whose files should be removed
966 */ 1011 */
967 static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) 1012 static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
968 { 1013 {
969 struct cgroup_subsys *ss; 1014 struct cgroup_subsys *ss;
970 int i; 1015 int i;
971 1016
972 for_each_subsys(ss, i) { 1017 for_each_subsys(ss, i) {
973 struct cftype *cfts; 1018 struct cftype *cfts;
974 1019
975 if (!test_bit(i, &subsys_mask)) 1020 if (!test_bit(i, &subsys_mask))
976 continue; 1021 continue;
977 list_for_each_entry(cfts, &ss->cfts, node) 1022 list_for_each_entry(cfts, &ss->cfts, node)
978 cgroup_addrm_files(cgrp, cfts, false); 1023 cgroup_addrm_files(cgrp, cfts, false);
979 } 1024 }
980 } 1025 }
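(Illustrative aside, not part of this patch: a call such as cgroup_clear_dir(cgrp, 1 << cpuset_cgrp_id) would remove only the cpuset controller's files from the directory, leaving core files and other controllers' files in place.)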
981 1026
982 static int rebind_subsystems(struct cgroup_root *dst_root, 1027 static int rebind_subsystems(struct cgroup_root *dst_root,
983 unsigned long ss_mask) 1028 unsigned long ss_mask)
984 { 1029 {
985 struct cgroup_subsys *ss; 1030 struct cgroup_subsys *ss;
986 int ssid, ret; 1031 int ssid, ret;
987 1032
988 lockdep_assert_held(&cgroup_tree_mutex); 1033 lockdep_assert_held(&cgroup_tree_mutex);
989 lockdep_assert_held(&cgroup_mutex); 1034 lockdep_assert_held(&cgroup_mutex);
990 1035
991 for_each_subsys(ss, ssid) { 1036 for_each_subsys(ss, ssid) {
992 if (!(ss_mask & (1 << ssid))) 1037 if (!(ss_mask & (1 << ssid)))
993 continue; 1038 continue;
994 1039
995 /* if @ss is on the dummy_root, we can always move it */ 1040 /* if @ss is on the dummy_root, we can always move it */
996 if (ss->root == &cgrp_dfl_root) 1041 if (ss->root == &cgrp_dfl_root)
997 continue; 1042 continue;
998 1043
999 /* if @ss has non-root cgroups attached to it, can't move */ 1044 /* if @ss has non-root cgroups attached to it, can't move */
1000 if (!list_empty(&ss->root->cgrp.children)) 1045 if (!list_empty(&ss->root->cgrp.children))
1001 return -EBUSY; 1046 return -EBUSY;
1002 1047
1003 /* can't move between two non-dummy roots either */ 1048 /* can't move between two non-dummy roots either */
1004 if (dst_root != &cgrp_dfl_root) 1049 if (dst_root != &cgrp_dfl_root)
1005 return -EBUSY; 1050 return -EBUSY;
1006 } 1051 }
1007 1052
1008 ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); 1053 ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask);
1009 if (ret) { 1054 if (ret) {
1010 if (dst_root != &cgrp_dfl_root) 1055 if (dst_root != &cgrp_dfl_root)
1011 return ret; 1056 return ret;
1012 1057
1013 /* 1058 /*
1014 * Rebinding back to the default root is not allowed to 1059 * Rebinding back to the default root is not allowed to
1015 * fail. Using both default and non-default roots should 1060 * fail. Using both default and non-default roots should
1016 * be rare. Moving subsystems back and forth even more so. 1061 * be rare. Moving subsystems back and forth even more so.
1017 * Just warn about it and continue. 1062 * Just warn about it and continue.
1018 */ 1063 */
1019 if (cgrp_dfl_root_visible) { 1064 if (cgrp_dfl_root_visible) {
1020 pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", 1065 pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n",
1021 ret, ss_mask); 1066 ret, ss_mask);
1022 pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); 1067 pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n");
1023 } 1068 }
1024 } 1069 }
1025 1070
1026 /* 1071 /*
1027 * Nothing can fail from this point on. Remove files for the 1072 * Nothing can fail from this point on. Remove files for the
1028 * removed subsystems and rebind each subsystem. 1073 * removed subsystems and rebind each subsystem.
1029 */ 1074 */
1030 mutex_unlock(&cgroup_mutex); 1075 mutex_unlock(&cgroup_mutex);
1031 for_each_subsys(ss, ssid) 1076 for_each_subsys(ss, ssid)
1032 if (ss_mask & (1 << ssid)) 1077 if (ss_mask & (1 << ssid))
1033 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); 1078 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1034 mutex_lock(&cgroup_mutex); 1079 mutex_lock(&cgroup_mutex);
1035 1080
1036 for_each_subsys(ss, ssid) { 1081 for_each_subsys(ss, ssid) {
1037 struct cgroup_root *src_root; 1082 struct cgroup_root *src_root;
1038 struct cgroup_subsys_state *css; 1083 struct cgroup_subsys_state *css;
1039 1084
1040 if (!(ss_mask & (1 << ssid))) 1085 if (!(ss_mask & (1 << ssid)))
1041 continue; 1086 continue;
1042 1087
1043 src_root = ss->root; 1088 src_root = ss->root;
1044 css = cgroup_css(&src_root->cgrp, ss); 1089 css = cgroup_css(&src_root->cgrp, ss);
1045 1090
1046 WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss)); 1091 WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
1047 1092
1048 RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL); 1093 RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
1049 rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css); 1094 rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
1050 ss->root = dst_root; 1095 ss->root = dst_root;
1051 css->cgroup = &dst_root->cgrp; 1096 css->cgroup = &dst_root->cgrp;
1052 1097
1053 src_root->subsys_mask &= ~(1 << ssid); 1098 src_root->subsys_mask &= ~(1 << ssid);
1054 src_root->cgrp.child_subsys_mask &= ~(1 << ssid); 1099 src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
1055 1100
1056 dst_root->subsys_mask |= 1 << ssid; 1101 dst_root->subsys_mask |= 1 << ssid;
1057 dst_root->cgrp.child_subsys_mask |= 1 << ssid; 1102 dst_root->cgrp.child_subsys_mask |= 1 << ssid;
1058 1103
1059 if (ss->bind) 1104 if (ss->bind)
1060 ss->bind(css); 1105 ss->bind(css);
1061 } 1106 }
1062 1107
1063 kernfs_activate(dst_root->cgrp.kn); 1108 kernfs_activate(dst_root->cgrp.kn);
1064 return 0; 1109 return 0;
1065 } 1110 }
1066 1111
1067 static int cgroup_show_options(struct seq_file *seq, 1112 static int cgroup_show_options(struct seq_file *seq,
1068 struct kernfs_root *kf_root) 1113 struct kernfs_root *kf_root)
1069 { 1114 {
1070 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1115 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1071 struct cgroup_subsys *ss; 1116 struct cgroup_subsys *ss;
1072 int ssid; 1117 int ssid;
1073 1118
1074 for_each_subsys(ss, ssid) 1119 for_each_subsys(ss, ssid)
1075 if (root->subsys_mask & (1 << ssid)) 1120 if (root->subsys_mask & (1 << ssid))
1076 seq_printf(seq, ",%s", ss->name); 1121 seq_printf(seq, ",%s", ss->name);
1077 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1122 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1078 seq_puts(seq, ",sane_behavior"); 1123 seq_puts(seq, ",sane_behavior");
1079 if (root->flags & CGRP_ROOT_NOPREFIX) 1124 if (root->flags & CGRP_ROOT_NOPREFIX)
1080 seq_puts(seq, ",noprefix"); 1125 seq_puts(seq, ",noprefix");
1081 if (root->flags & CGRP_ROOT_XATTR) 1126 if (root->flags & CGRP_ROOT_XATTR)
1082 seq_puts(seq, ",xattr"); 1127 seq_puts(seq, ",xattr");
1083 1128
1084 spin_lock(&release_agent_path_lock); 1129 spin_lock(&release_agent_path_lock);
1085 if (strlen(root->release_agent_path)) 1130 if (strlen(root->release_agent_path))
1086 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1131 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1087 spin_unlock(&release_agent_path_lock); 1132 spin_unlock(&release_agent_path_lock);
1088 1133
1089 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) 1134 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1090 seq_puts(seq, ",clone_children"); 1135 seq_puts(seq, ",clone_children");
1091 if (strlen(root->name)) 1136 if (strlen(root->name))
1092 seq_printf(seq, ",name=%s", root->name); 1137 seq_printf(seq, ",name=%s", root->name);
1093 return 0; 1138 return 0;
1094 } 1139 }
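(Illustrative aside, not part of this patch: for a v1 hierarchy mounted with the cpu controller, clone_children and name=sys, the code above would make the options field in /proc/mounts read something like "rw,cpu,clone_children,name=sys", with the leading "rw" supplied by the VFS.)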
1095 1140
1096 struct cgroup_sb_opts { 1141 struct cgroup_sb_opts {
1097 unsigned long subsys_mask; 1142 unsigned long subsys_mask;
1098 unsigned long flags; 1143 unsigned long flags;
1099 char *release_agent; 1144 char *release_agent;
1100 bool cpuset_clone_children; 1145 bool cpuset_clone_children;
1101 char *name; 1146 char *name;
1102 /* User explicitly requested empty subsystem */ 1147 /* User explicitly requested empty subsystem */
1103 bool none; 1148 bool none;
1104 }; 1149 };
1105 1150
1106 /* 1151 /*
1107 * Convert a hierarchy specifier into a bitmask of subsystems and 1152 * Convert a hierarchy specifier into a bitmask of subsystems and
1108 * flags. Call with cgroup_mutex held to protect the cgroup_subsys[] 1153 * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
1109 * array. This function takes refcounts on subsystems to be used, unless it 1154 * array. This function takes refcounts on subsystems to be used, unless it
1110 * returns error, in which case no refcounts are taken. 1155 * returns error, in which case no refcounts are taken.
1111 */ 1156 */
1112 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1157 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1113 { 1158 {
1114 char *token, *o = data; 1159 char *token, *o = data;
1115 bool all_ss = false, one_ss = false; 1160 bool all_ss = false, one_ss = false;
1116 unsigned long mask = (unsigned long)-1; 1161 unsigned long mask = (unsigned long)-1;
1117 struct cgroup_subsys *ss; 1162 struct cgroup_subsys *ss;
1118 int i; 1163 int i;
1119 1164
1120 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1165 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1121 1166
1122 #ifdef CONFIG_CPUSETS 1167 #ifdef CONFIG_CPUSETS
1123 mask = ~(1UL << cpuset_cgrp_id); 1168 mask = ~(1UL << cpuset_cgrp_id);
1124 #endif 1169 #endif
1125 1170
1126 memset(opts, 0, sizeof(*opts)); 1171 memset(opts, 0, sizeof(*opts));
1127 1172
1128 while ((token = strsep(&o, ",")) != NULL) { 1173 while ((token = strsep(&o, ",")) != NULL) {
1129 if (!*token) 1174 if (!*token)
1130 return -EINVAL; 1175 return -EINVAL;
1131 if (!strcmp(token, "none")) { 1176 if (!strcmp(token, "none")) {
1132 /* Explicitly have no subsystems */ 1177 /* Explicitly have no subsystems */
1133 opts->none = true; 1178 opts->none = true;
1134 continue; 1179 continue;
1135 } 1180 }
1136 if (!strcmp(token, "all")) { 1181 if (!strcmp(token, "all")) {
1137 /* Mutually exclusive option 'all' + subsystem name */ 1182 /* Mutually exclusive option 'all' + subsystem name */
1138 if (one_ss) 1183 if (one_ss)
1139 return -EINVAL; 1184 return -EINVAL;
1140 all_ss = true; 1185 all_ss = true;
1141 continue; 1186 continue;
1142 } 1187 }
1143 if (!strcmp(token, "__DEVEL__sane_behavior")) { 1188 if (!strcmp(token, "__DEVEL__sane_behavior")) {
1144 opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; 1189 opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
1145 continue; 1190 continue;
1146 } 1191 }
1147 if (!strcmp(token, "noprefix")) { 1192 if (!strcmp(token, "noprefix")) {
1148 opts->flags |= CGRP_ROOT_NOPREFIX; 1193 opts->flags |= CGRP_ROOT_NOPREFIX;
1149 continue; 1194 continue;
1150 } 1195 }
1151 if (!strcmp(token, "clone_children")) { 1196 if (!strcmp(token, "clone_children")) {
1152 opts->cpuset_clone_children = true; 1197 opts->cpuset_clone_children = true;
1153 continue; 1198 continue;
1154 } 1199 }
1155 if (!strcmp(token, "xattr")) { 1200 if (!strcmp(token, "xattr")) {
1156 opts->flags |= CGRP_ROOT_XATTR; 1201 opts->flags |= CGRP_ROOT_XATTR;
1157 continue; 1202 continue;
1158 } 1203 }
1159 if (!strncmp(token, "release_agent=", 14)) { 1204 if (!strncmp(token, "release_agent=", 14)) {
1160 /* Specifying two release agents is forbidden */ 1205 /* Specifying two release agents is forbidden */
1161 if (opts->release_agent) 1206 if (opts->release_agent)
1162 return -EINVAL; 1207 return -EINVAL;
1163 opts->release_agent = 1208 opts->release_agent =
1164 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); 1209 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1165 if (!opts->release_agent) 1210 if (!opts->release_agent)
1166 return -ENOMEM; 1211 return -ENOMEM;
1167 continue; 1212 continue;
1168 } 1213 }
1169 if (!strncmp(token, "name=", 5)) { 1214 if (!strncmp(token, "name=", 5)) {
1170 const char *name = token + 5; 1215 const char *name = token + 5;
1171 /* Can't specify an empty name */ 1216 /* Can't specify an empty name */
1172 if (!strlen(name)) 1217 if (!strlen(name))
1173 return -EINVAL; 1218 return -EINVAL;
1174 /* Must match [\w.-]+ */ 1219 /* Must match [\w.-]+ */
1175 for (i = 0; i < strlen(name); i++) { 1220 for (i = 0; i < strlen(name); i++) {
1176 char c = name[i]; 1221 char c = name[i];
1177 if (isalnum(c)) 1222 if (isalnum(c))
1178 continue; 1223 continue;
1179 if ((c == '.') || (c == '-') || (c == '_')) 1224 if ((c == '.') || (c == '-') || (c == '_'))
1180 continue; 1225 continue;
1181 return -EINVAL; 1226 return -EINVAL;
1182 } 1227 }
1183 /* Specifying two names is forbidden */ 1228 /* Specifying two names is forbidden */
1184 if (opts->name) 1229 if (opts->name)
1185 return -EINVAL; 1230 return -EINVAL;
1186 opts->name = kstrndup(name, 1231 opts->name = kstrndup(name,
1187 MAX_CGROUP_ROOT_NAMELEN - 1, 1232 MAX_CGROUP_ROOT_NAMELEN - 1,
1188 GFP_KERNEL); 1233 GFP_KERNEL);
1189 if (!opts->name) 1234 if (!opts->name)
1190 return -ENOMEM; 1235 return -ENOMEM;
1191 1236
1192 continue; 1237 continue;
1193 } 1238 }
1194 1239
1195 for_each_subsys(ss, i) { 1240 for_each_subsys(ss, i) {
1196 if (strcmp(token, ss->name)) 1241 if (strcmp(token, ss->name))
1197 continue; 1242 continue;
1198 if (ss->disabled) 1243 if (ss->disabled)
1199 continue; 1244 continue;
1200 1245
1201 /* Mutually exclusive option 'all' + subsystem name */ 1246 /* Mutually exclusive option 'all' + subsystem name */
1202 if (all_ss) 1247 if (all_ss)
1203 return -EINVAL; 1248 return -EINVAL;
1204 set_bit(i, &opts->subsys_mask); 1249 set_bit(i, &opts->subsys_mask);
1205 one_ss = true; 1250 one_ss = true;
1206 1251
1207 break; 1252 break;
1208 } 1253 }
1209 if (i == CGROUP_SUBSYS_COUNT) 1254 if (i == CGROUP_SUBSYS_COUNT)
1210 return -ENOENT; 1255 return -ENOENT;
1211 } 1256 }
1212 1257
1213 /* Consistency checks */ 1258 /* Consistency checks */
1214 1259
1215 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1260 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1216 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); 1261 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1217 1262
1218 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || 1263 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
1219 opts->cpuset_clone_children || opts->release_agent || 1264 opts->cpuset_clone_children || opts->release_agent ||
1220 opts->name) { 1265 opts->name) {
1221 pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); 1266 pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1222 return -EINVAL; 1267 return -EINVAL;
1223 } 1268 }
1224 } else { 1269 } else {
1225 /* 1270 /*
1226 * If the 'all' option was specified select all the 1271 * If the 'all' option was specified select all the
1227 * subsystems, otherwise if 'none', 'name=' and a subsystem 1272 * subsystems, otherwise if 'none', 'name=' and a subsystem
1228 * name options were not specified, let's default to 'all' 1273 * name options were not specified, let's default to 'all'
1229 */ 1274 */
1230 if (all_ss || (!one_ss && !opts->none && !opts->name)) 1275 if (all_ss || (!one_ss && !opts->none && !opts->name))
1231 for_each_subsys(ss, i) 1276 for_each_subsys(ss, i)
1232 if (!ss->disabled) 1277 if (!ss->disabled)
1233 set_bit(i, &opts->subsys_mask); 1278 set_bit(i, &opts->subsys_mask);
1234 1279
1235 /* 1280 /*
1236 * We either have to specify by name or by subsystems. (So 1281 * We either have to specify by name or by subsystems. (So
1237 * all empty hierarchies must have a name). 1282 * all empty hierarchies must have a name).
1238 */ 1283 */
1239 if (!opts->subsys_mask && !opts->name) 1284 if (!opts->subsys_mask && !opts->name)
1240 return -EINVAL; 1285 return -EINVAL;
1241 } 1286 }
1242 1287
1243 /* 1288 /*
1244 * Option noprefix was introduced just for backward compatibility 1289 * Option noprefix was introduced just for backward compatibility
1245 * with the old cpuset, so we allow noprefix only if mounting just 1290 * with the old cpuset, so we allow noprefix only if mounting just
1246 * the cpuset subsystem. 1291 * the cpuset subsystem.
1247 */ 1292 */
1248 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) 1293 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1249 return -EINVAL; 1294 return -EINVAL;
1250 1295
1251 1296
1252 /* Can't specify "none" and some subsystems */ 1297 /* Can't specify "none" and some subsystems */
1253 if (opts->subsys_mask && opts->none) 1298 if (opts->subsys_mask && opts->none)
1254 return -EINVAL; 1299 return -EINVAL;
1255 1300
1256 return 0; 1301 return 0;
1257 } 1302 }
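(Illustrative aside, not part of this patch: typical data strings this parser accepts include "cpu,cpuacct,name=mygrp" to bind two controllers on a named hierarchy, "none,name=systemd" for an empty, purely named hierarchy, and "all,xattr", which selects every enabled controller; "all" may not be combined with an explicit controller name, and "noprefix" is rejected if any controller other than cpuset is selected.)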
1258 1303
1259 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) 1304 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1260 { 1305 {
1261 int ret = 0; 1306 int ret = 0;
1262 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1307 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1263 struct cgroup_sb_opts opts; 1308 struct cgroup_sb_opts opts;
1264 unsigned long added_mask, removed_mask; 1309 unsigned long added_mask, removed_mask;
1265 1310
1266 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1311 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1267 pr_err("cgroup: sane_behavior: remount is not allowed\n"); 1312 pr_err("cgroup: sane_behavior: remount is not allowed\n");
1268 return -EINVAL; 1313 return -EINVAL;
1269 } 1314 }
1270 1315
1271 mutex_lock(&cgroup_tree_mutex); 1316 mutex_lock(&cgroup_tree_mutex);
1272 mutex_lock(&cgroup_mutex); 1317 mutex_lock(&cgroup_mutex);
1273 1318
1274 /* See what subsystems are wanted */ 1319 /* See what subsystems are wanted */
1275 ret = parse_cgroupfs_options(data, &opts); 1320 ret = parse_cgroupfs_options(data, &opts);
1276 if (ret) 1321 if (ret)
1277 goto out_unlock; 1322 goto out_unlock;
1278 1323
1279 if (opts.subsys_mask != root->subsys_mask || opts.release_agent) 1324 if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1280 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1325 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1281 task_tgid_nr(current), current->comm); 1326 task_tgid_nr(current), current->comm);
1282 1327
1283 added_mask = opts.subsys_mask & ~root->subsys_mask; 1328 added_mask = opts.subsys_mask & ~root->subsys_mask;
1284 removed_mask = root->subsys_mask & ~opts.subsys_mask; 1329 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1285 1330
1286 /* Don't allow flags or name to change at remount */ 1331 /* Don't allow flags or name to change at remount */
1287 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || 1332 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
1288 (opts.name && strcmp(opts.name, root->name))) { 1333 (opts.name && strcmp(opts.name, root->name))) {
1289 pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", 1334 pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
1290 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", 1335 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
1291 root->flags & CGRP_ROOT_OPTION_MASK, root->name); 1336 root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1292 ret = -EINVAL; 1337 ret = -EINVAL;
1293 goto out_unlock; 1338 goto out_unlock;
1294 } 1339 }
1295 1340
1296 /* remounting is not allowed for populated hierarchies */ 1341 /* remounting is not allowed for populated hierarchies */
1297 if (!list_empty(&root->cgrp.children)) { 1342 if (!list_empty(&root->cgrp.children)) {
1298 ret = -EBUSY; 1343 ret = -EBUSY;
1299 goto out_unlock; 1344 goto out_unlock;
1300 } 1345 }
1301 1346
1302 ret = rebind_subsystems(root, added_mask); 1347 ret = rebind_subsystems(root, added_mask);
1303 if (ret) 1348 if (ret)
1304 goto out_unlock; 1349 goto out_unlock;
1305 1350
1306 rebind_subsystems(&cgrp_dfl_root, removed_mask); 1351 rebind_subsystems(&cgrp_dfl_root, removed_mask);
1307 1352
1308 if (opts.release_agent) { 1353 if (opts.release_agent) {
1309 spin_lock(&release_agent_path_lock); 1354 spin_lock(&release_agent_path_lock);
1310 strcpy(root->release_agent_path, opts.release_agent); 1355 strcpy(root->release_agent_path, opts.release_agent);
1311 spin_unlock(&release_agent_path_lock); 1356 spin_unlock(&release_agent_path_lock);
1312 } 1357 }
1313 out_unlock: 1358 out_unlock:
1314 kfree(opts.release_agent); 1359 kfree(opts.release_agent);
1315 kfree(opts.name); 1360 kfree(opts.name);
1316 mutex_unlock(&cgroup_mutex); 1361 mutex_unlock(&cgroup_mutex);
1317 mutex_unlock(&cgroup_tree_mutex); 1362 mutex_unlock(&cgroup_tree_mutex);
1318 return ret; 1363 return ret;
1319 } 1364 }
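(Illustrative aside, not part of this patch: remounting an existing cpu-only hierarchy with "-o remount,cpu,cpuacct" computes added_mask = cpuacct and emits the deprecation warning above; the rebind succeeds only if cpuacct is still parked on the default root and the hierarchy has no child cgroups, and any remount of a __DEVEL__sane_behavior hierarchy is rejected up front.)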
1320 1365
1321 /* 1366 /*
1322 * To reduce the fork() overhead for systems that are not actually using 1367 * To reduce the fork() overhead for systems that are not actually using
1323 * their cgroups capability, we don't maintain the lists running through 1368 * their cgroups capability, we don't maintain the lists running through
1324 * each css_set to its tasks until we see the list actually used - in other 1369 * each css_set to its tasks until we see the list actually used - in other
1325 * words after the first mount. 1370 * words after the first mount.
1326 */ 1371 */
1327 static bool use_task_css_set_links __read_mostly; 1372 static bool use_task_css_set_links __read_mostly;
1328 1373
1329 static void cgroup_enable_task_cg_lists(void) 1374 static void cgroup_enable_task_cg_lists(void)
1330 { 1375 {
1331 struct task_struct *p, *g; 1376 struct task_struct *p, *g;
1332 1377
1333 down_write(&css_set_rwsem); 1378 down_write(&css_set_rwsem);
1334 1379
1335 if (use_task_css_set_links) 1380 if (use_task_css_set_links)
1336 goto out_unlock; 1381 goto out_unlock;
1337 1382
1338 use_task_css_set_links = true; 1383 use_task_css_set_links = true;
1339 1384
1340 /* 1385 /*
1341 * We need tasklist_lock because RCU is not safe against 1386 * We need tasklist_lock because RCU is not safe against
1342 * while_each_thread(). Besides, a forking task that has passed 1387 * while_each_thread(). Besides, a forking task that has passed
1343 * cgroup_post_fork() without seeing use_task_css_set_links = 1 1388 * cgroup_post_fork() without seeing use_task_css_set_links = 1
1344 * is not guaranteed to have its child immediately visible in the 1389 * is not guaranteed to have its child immediately visible in the
1345 * tasklist if we walk through it with RCU. 1390 * tasklist if we walk through it with RCU.
1346 */ 1391 */
1347 read_lock(&tasklist_lock); 1392 read_lock(&tasklist_lock);
1348 do_each_thread(g, p) { 1393 do_each_thread(g, p) {
1349 WARN_ON_ONCE(!list_empty(&p->cg_list) || 1394 WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1350 task_css_set(p) != &init_css_set); 1395 task_css_set(p) != &init_css_set);
1351 1396
1352 /* 1397 /*
1353 * We should check if the process is exiting, otherwise 1398 * We should check if the process is exiting, otherwise
1354 * it will race with cgroup_exit() in that the list 1399 * it will race with cgroup_exit() in that the list
1355 * entry won't be deleted though the process has exited. 1400 * entry won't be deleted though the process has exited.
1356 * Do it while holding siglock so that we don't end up 1401 * Do it while holding siglock so that we don't end up
1357 * racing against cgroup_exit(). 1402 * racing against cgroup_exit().
1358 */ 1403 */
1359 spin_lock_irq(&p->sighand->siglock); 1404 spin_lock_irq(&p->sighand->siglock);
1360 if (!(p->flags & PF_EXITING)) { 1405 if (!(p->flags & PF_EXITING)) {
1361 struct css_set *cset = task_css_set(p); 1406 struct css_set *cset = task_css_set(p);
1362 1407
1363 list_add(&p->cg_list, &cset->tasks); 1408 list_add(&p->cg_list, &cset->tasks);
1364 get_css_set(cset); 1409 get_css_set(cset);
1365 } 1410 }
1366 spin_unlock_irq(&p->sighand->siglock); 1411 spin_unlock_irq(&p->sighand->siglock);
1367 } while_each_thread(g, p); 1412 } while_each_thread(g, p);
1368 read_unlock(&tasklist_lock); 1413 read_unlock(&tasklist_lock);
1369 out_unlock: 1414 out_unlock:
1370 up_write(&css_set_rwsem); 1415 up_write(&css_set_rwsem);
1371 } 1416 }
1372 1417
1373 static void init_cgroup_housekeeping(struct cgroup *cgrp) 1418 static void init_cgroup_housekeeping(struct cgroup *cgrp)
1374 { 1419 {
1375 atomic_set(&cgrp->refcnt, 1); 1420 atomic_set(&cgrp->refcnt, 1);
1376 INIT_LIST_HEAD(&cgrp->sibling); 1421 INIT_LIST_HEAD(&cgrp->sibling);
1377 INIT_LIST_HEAD(&cgrp->children); 1422 INIT_LIST_HEAD(&cgrp->children);
1378 INIT_LIST_HEAD(&cgrp->cset_links); 1423 INIT_LIST_HEAD(&cgrp->cset_links);
1379 INIT_LIST_HEAD(&cgrp->release_list); 1424 INIT_LIST_HEAD(&cgrp->release_list);
1380 INIT_LIST_HEAD(&cgrp->pidlists); 1425 INIT_LIST_HEAD(&cgrp->pidlists);
1381 mutex_init(&cgrp->pidlist_mutex); 1426 mutex_init(&cgrp->pidlist_mutex);
1382 cgrp->dummy_css.cgroup = cgrp; 1427 cgrp->dummy_css.cgroup = cgrp;
1383 } 1428 }
1384 1429
1385 static void init_cgroup_root(struct cgroup_root *root, 1430 static void init_cgroup_root(struct cgroup_root *root,
1386 struct cgroup_sb_opts *opts) 1431 struct cgroup_sb_opts *opts)
1387 { 1432 {
1388 struct cgroup *cgrp = &root->cgrp; 1433 struct cgroup *cgrp = &root->cgrp;
1389 1434
1390 INIT_LIST_HEAD(&root->root_list); 1435 INIT_LIST_HEAD(&root->root_list);
1391 atomic_set(&root->nr_cgrps, 1); 1436 atomic_set(&root->nr_cgrps, 1);
1392 cgrp->root = root; 1437 cgrp->root = root;
1393 init_cgroup_housekeeping(cgrp); 1438 init_cgroup_housekeeping(cgrp);
1394 idr_init(&root->cgroup_idr); 1439 idr_init(&root->cgroup_idr);
1395 1440
1396 root->flags = opts->flags; 1441 root->flags = opts->flags;
1397 if (opts->release_agent) 1442 if (opts->release_agent)
1398 strcpy(root->release_agent_path, opts->release_agent); 1443 strcpy(root->release_agent_path, opts->release_agent);
1399 if (opts->name) 1444 if (opts->name)
1400 strcpy(root->name, opts->name); 1445 strcpy(root->name, opts->name);
1401 if (opts->cpuset_clone_children) 1446 if (opts->cpuset_clone_children)
1402 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); 1447 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1403 } 1448 }
1404 1449
1405 static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) 1450 static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1406 { 1451 {
1407 LIST_HEAD(tmp_links); 1452 LIST_HEAD(tmp_links);
1408 struct cgroup *root_cgrp = &root->cgrp; 1453 struct cgroup *root_cgrp = &root->cgrp;
1409 struct css_set *cset; 1454 struct css_set *cset;
1410 int i, ret; 1455 int i, ret;
1411 1456
1412 lockdep_assert_held(&cgroup_tree_mutex); 1457 lockdep_assert_held(&cgroup_tree_mutex);
1413 lockdep_assert_held(&cgroup_mutex); 1458 lockdep_assert_held(&cgroup_mutex);
1414 1459
1415 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); 1460 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
1416 if (ret < 0) 1461 if (ret < 0)
1417 goto out; 1462 goto out;
1418 root_cgrp->id = ret; 1463 root_cgrp->id = ret;
1419 1464
1420 /* 1465 /*
1421 * We're accessing css_set_count without locking css_set_rwsem here, 1466 * We're accessing css_set_count without locking css_set_rwsem here,
1422 * but that's OK - it can only be increased by someone holding 1467 * but that's OK - it can only be increased by someone holding
1423 * cgroup_lock, and that's us. The worst that can happen is that we 1468 * cgroup_lock, and that's us. The worst that can happen is that we
1424 * have some link structures left over 1469 * have some link structures left over
1425 */ 1470 */
1426 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); 1471 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1427 if (ret) 1472 if (ret)
1428 goto out; 1473 goto out;
1429 1474
1430 ret = cgroup_init_root_id(root); 1475 ret = cgroup_init_root_id(root);
1431 if (ret) 1476 if (ret)
1432 goto out; 1477 goto out;
1433 1478
1434 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, 1479 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
1435 KERNFS_ROOT_CREATE_DEACTIVATED, 1480 KERNFS_ROOT_CREATE_DEACTIVATED,
1436 root_cgrp); 1481 root_cgrp);
1437 if (IS_ERR(root->kf_root)) { 1482 if (IS_ERR(root->kf_root)) {
1438 ret = PTR_ERR(root->kf_root); 1483 ret = PTR_ERR(root->kf_root);
1439 goto exit_root_id; 1484 goto exit_root_id;
1440 } 1485 }
1441 root_cgrp->kn = root->kf_root->kn; 1486 root_cgrp->kn = root->kf_root->kn;
1442 1487
1443 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); 1488 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1444 if (ret) 1489 if (ret)
1445 goto destroy_root; 1490 goto destroy_root;
1446 1491
1447 ret = rebind_subsystems(root, ss_mask); 1492 ret = rebind_subsystems(root, ss_mask);
1448 if (ret) 1493 if (ret)
1449 goto destroy_root; 1494 goto destroy_root;
1450 1495
1451 /* 1496 /*
1452 * There must be no failure case after here, since rebinding takes 1497 * There must be no failure case after here, since rebinding takes
1453 * care of subsystems' refcounts, which are explicitly dropped in 1498 * care of subsystems' refcounts, which are explicitly dropped in
1454 * the failure exit path. 1499 * the failure exit path.
1455 */ 1500 */
1456 list_add(&root->root_list, &cgroup_roots); 1501 list_add(&root->root_list, &cgroup_roots);
1457 cgroup_root_count++; 1502 cgroup_root_count++;
1458 1503
1459 /* 1504 /*
1460 * Link the root cgroup in this hierarchy into all the css_set 1505 * Link the root cgroup in this hierarchy into all the css_set
1461 * objects. 1506 * objects.
1462 */ 1507 */
1463 down_write(&css_set_rwsem); 1508 down_write(&css_set_rwsem);
1464 hash_for_each(css_set_table, i, cset, hlist) 1509 hash_for_each(css_set_table, i, cset, hlist)
1465 link_css_set(&tmp_links, cset, root_cgrp); 1510 link_css_set(&tmp_links, cset, root_cgrp);
1466 up_write(&css_set_rwsem); 1511 up_write(&css_set_rwsem);
1467 1512
1468 BUG_ON(!list_empty(&root_cgrp->children)); 1513 BUG_ON(!list_empty(&root_cgrp->children));
1469 BUG_ON(atomic_read(&root->nr_cgrps) != 1); 1514 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1470 1515
1471 kernfs_activate(root_cgrp->kn); 1516 kernfs_activate(root_cgrp->kn);
1472 ret = 0; 1517 ret = 0;
1473 goto out; 1518 goto out;
1474 1519
1475 destroy_root: 1520 destroy_root:
1476 kernfs_destroy_root(root->kf_root); 1521 kernfs_destroy_root(root->kf_root);
1477 root->kf_root = NULL; 1522 root->kf_root = NULL;
1478 exit_root_id: 1523 exit_root_id:
1479 cgroup_exit_root_id(root); 1524 cgroup_exit_root_id(root);
1480 out: 1525 out:
1481 free_cgrp_cset_links(&tmp_links); 1526 free_cgrp_cset_links(&tmp_links);
1482 return ret; 1527 return ret;
1483 } 1528 }
1484 1529
1485 static struct dentry *cgroup_mount(struct file_system_type *fs_type, 1530 static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1486 int flags, const char *unused_dev_name, 1531 int flags, const char *unused_dev_name,
1487 void *data) 1532 void *data)
1488 { 1533 {
1489 struct cgroup_root *root; 1534 struct cgroup_root *root;
1490 struct cgroup_sb_opts opts; 1535 struct cgroup_sb_opts opts;
1491 struct dentry *dentry; 1536 struct dentry *dentry;
1492 int ret; 1537 int ret;
1493 bool new_sb; 1538 bool new_sb;
1494 1539
1495 /* 1540 /*
1496 * The first time anyone tries to mount a cgroup, enable the list 1541 * The first time anyone tries to mount a cgroup, enable the list
1497 * linking each css_set to its tasks and fix up all existing tasks. 1542 * linking each css_set to its tasks and fix up all existing tasks.
1498 */ 1543 */
1499 if (!use_task_css_set_links) 1544 if (!use_task_css_set_links)
1500 cgroup_enable_task_cg_lists(); 1545 cgroup_enable_task_cg_lists();
1501 1546
1502 mutex_lock(&cgroup_tree_mutex); 1547 mutex_lock(&cgroup_tree_mutex);
1503 mutex_lock(&cgroup_mutex); 1548 mutex_lock(&cgroup_mutex);
1504 1549
1505 /* First find the desired set of subsystems */ 1550 /* First find the desired set of subsystems */
1506 ret = parse_cgroupfs_options(data, &opts); 1551 ret = parse_cgroupfs_options(data, &opts);
1507 if (ret) 1552 if (ret)
1508 goto out_unlock; 1553 goto out_unlock;
1509 retry: 1554 retry:
1510 /* look for a matching existing root */ 1555 /* look for a matching existing root */
1511 if (!opts.subsys_mask && !opts.none && !opts.name) { 1556 if (!opts.subsys_mask && !opts.none && !opts.name) {
1512 cgrp_dfl_root_visible = true; 1557 cgrp_dfl_root_visible = true;
1513 root = &cgrp_dfl_root; 1558 root = &cgrp_dfl_root;
1514 cgroup_get(&root->cgrp); 1559 cgroup_get(&root->cgrp);
1515 ret = 0; 1560 ret = 0;
1516 goto out_unlock; 1561 goto out_unlock;
1517 } 1562 }
1518 1563
1519 for_each_root(root) { 1564 for_each_root(root) {
1520 bool name_match = false; 1565 bool name_match = false;
1521 1566
1522 if (root == &cgrp_dfl_root) 1567 if (root == &cgrp_dfl_root)
1523 continue; 1568 continue;
1524 1569
1525 /* 1570 /*
1526 * If we asked for a name then it must match. Also, if 1571 * If we asked for a name then it must match. Also, if
1527 * name matches but subsys_mask doesn't, we should fail. 1572 * name matches but subsys_mask doesn't, we should fail.
1528 * Remember whether name matched. 1573 * Remember whether name matched.
1529 */ 1574 */
1530 if (opts.name) { 1575 if (opts.name) {
1531 if (strcmp(opts.name, root->name)) 1576 if (strcmp(opts.name, root->name))
1532 continue; 1577 continue;
1533 name_match = true; 1578 name_match = true;
1534 } 1579 }
1535 1580
1536 /* 1581 /*
1537 * If we asked for subsystems (or explicitly for no 1582 * If we asked for subsystems (or explicitly for no
1538 * subsystems) then they must match. 1583 * subsystems) then they must match.
1539 */ 1584 */
1540 if ((opts.subsys_mask || opts.none) && 1585 if ((opts.subsys_mask || opts.none) &&
1541 (opts.subsys_mask != root->subsys_mask)) { 1586 (opts.subsys_mask != root->subsys_mask)) {
1542 if (!name_match) 1587 if (!name_match)
1543 continue; 1588 continue;
1544 ret = -EBUSY; 1589 ret = -EBUSY;
1545 goto out_unlock; 1590 goto out_unlock;
1546 } 1591 }
1547 1592
1548 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1593 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1549 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1594 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1550 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1595 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1551 ret = -EINVAL; 1596 ret = -EINVAL;
1552 goto out_unlock; 1597 goto out_unlock;
1553 } else { 1598 } else {
1554 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1599 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
1555 } 1600 }
1556 } 1601 }
1557 1602
1558 /* 1603 /*
1559 * A root's lifetime is governed by its root cgroup. Zero 1604 * A root's lifetime is governed by its root cgroup. Zero
1560 * ref indicates that the root is being destroyed. Wait for 1605 * ref indicates that the root is being destroyed. Wait for
1561 * destruction to complete so that the subsystems are free. 1606 * destruction to complete so that the subsystems are free.
1562 * We can use wait_queue for the wait but this path is 1607 * We can use wait_queue for the wait but this path is
1563 * super cold. Let's just sleep for a bit and retry. 1608 * super cold. Let's just sleep for a bit and retry.
1564 */ 1609 */
1565 if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { 1610 if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
1566 mutex_unlock(&cgroup_mutex); 1611 mutex_unlock(&cgroup_mutex);
1567 mutex_unlock(&cgroup_tree_mutex); 1612 mutex_unlock(&cgroup_tree_mutex);
1568 msleep(10); 1613 msleep(10);
1569 mutex_lock(&cgroup_tree_mutex); 1614 mutex_lock(&cgroup_tree_mutex);
1570 mutex_lock(&cgroup_mutex); 1615 mutex_lock(&cgroup_mutex);
1571 goto retry; 1616 goto retry;
1572 } 1617 }
1573 1618
1574 ret = 0; 1619 ret = 0;
1575 goto out_unlock; 1620 goto out_unlock;
1576 } 1621 }
1577 1622
1578 /* 1623 /*
1579 * No such thing, create a new one. name= matching without subsys 1624 * No such thing, create a new one. name= matching without subsys
1580 * specification is allowed for already existing hierarchies but we 1625 * specification is allowed for already existing hierarchies but we
1581 * can't create new one without subsys specification. 1626 * can't create new one without subsys specification.
1582 */ 1627 */
1583 if (!opts.subsys_mask && !opts.none) { 1628 if (!opts.subsys_mask && !opts.none) {
1584 ret = -EINVAL; 1629 ret = -EINVAL;
1585 goto out_unlock; 1630 goto out_unlock;
1586 } 1631 }
1587 1632
1588 root = kzalloc(sizeof(*root), GFP_KERNEL); 1633 root = kzalloc(sizeof(*root), GFP_KERNEL);
1589 if (!root) { 1634 if (!root) {
1590 ret = -ENOMEM; 1635 ret = -ENOMEM;
1591 goto out_unlock; 1636 goto out_unlock;
1592 } 1637 }
1593 1638
1594 init_cgroup_root(root, &opts); 1639 init_cgroup_root(root, &opts);
1595 1640
1596 ret = cgroup_setup_root(root, opts.subsys_mask); 1641 ret = cgroup_setup_root(root, opts.subsys_mask);
1597 if (ret) 1642 if (ret)
1598 cgroup_free_root(root); 1643 cgroup_free_root(root);
1599 1644
1600 out_unlock: 1645 out_unlock:
1601 mutex_unlock(&cgroup_mutex); 1646 mutex_unlock(&cgroup_mutex);
1602 mutex_unlock(&cgroup_tree_mutex); 1647 mutex_unlock(&cgroup_tree_mutex);
1603 1648
1604 kfree(opts.release_agent); 1649 kfree(opts.release_agent);
1605 kfree(opts.name); 1650 kfree(opts.name);
1606 1651
1607 if (ret) 1652 if (ret)
1608 return ERR_PTR(ret); 1653 return ERR_PTR(ret);
1609 1654
1610 dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb); 1655 dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb);
1611 if (IS_ERR(dentry) || !new_sb) 1656 if (IS_ERR(dentry) || !new_sb)
1612 cgroup_put(&root->cgrp); 1657 cgroup_put(&root->cgrp);
1613 return dentry; 1658 return dentry;
1614 } 1659 }
1615 1660
1616 static void cgroup_kill_sb(struct super_block *sb) 1661 static void cgroup_kill_sb(struct super_block *sb)
1617 { 1662 {
1618 struct kernfs_root *kf_root = kernfs_root_from_sb(sb); 1663 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1619 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1664 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1620 1665
1621 cgroup_put(&root->cgrp); 1666 cgroup_put(&root->cgrp);
1622 kernfs_kill_sb(sb); 1667 kernfs_kill_sb(sb);
1623 } 1668 }
1624 1669
1625 static struct file_system_type cgroup_fs_type = { 1670 static struct file_system_type cgroup_fs_type = {
1626 .name = "cgroup", 1671 .name = "cgroup",
1627 .mount = cgroup_mount, 1672 .mount = cgroup_mount,
1628 .kill_sb = cgroup_kill_sb, 1673 .kill_sb = cgroup_kill_sb,
1629 }; 1674 };
1630 1675
1631 static struct kobject *cgroup_kobj; 1676 static struct kobject *cgroup_kobj;
1632 1677
1633 /** 1678 /**
1634 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy 1679 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1635 * @task: target task 1680 * @task: target task
1636 * @buf: the buffer to write the path into 1681 * @buf: the buffer to write the path into
1637 * @buflen: the length of the buffer 1682 * @buflen: the length of the buffer
1638 * 1683 *
1639 * Determine @task's cgroup on the first (the one with the lowest non-zero 1684 * Determine @task's cgroup on the first (the one with the lowest non-zero
1640 * hierarchy_id) cgroup hierarchy and copy its path into @buf. This 1685 * hierarchy_id) cgroup hierarchy and copy its path into @buf. This
1641 * function grabs cgroup_mutex and shouldn't be used inside locks used by 1686 * function grabs cgroup_mutex and shouldn't be used inside locks used by
1642 * cgroup controller callbacks. 1687 * cgroup controller callbacks.
1643 * 1688 *
1644 * Return value is the same as kernfs_path(). 1689 * Return value is the same as kernfs_path().
1645 */ 1690 */
1646 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) 1691 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1647 { 1692 {
1648 struct cgroup_root *root; 1693 struct cgroup_root *root;
1649 struct cgroup *cgrp; 1694 struct cgroup *cgrp;
1650 int hierarchy_id = 1; 1695 int hierarchy_id = 1;
1651 char *path = NULL; 1696 char *path = NULL;
1652 1697
1653 mutex_lock(&cgroup_mutex); 1698 mutex_lock(&cgroup_mutex);
1654 down_read(&css_set_rwsem); 1699 down_read(&css_set_rwsem);
1655 1700
1656 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); 1701 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1657 1702
1658 if (root) { 1703 if (root) {
1659 cgrp = task_cgroup_from_root(task, root); 1704 cgrp = task_cgroup_from_root(task, root);
1660 path = cgroup_path(cgrp, buf, buflen); 1705 path = cgroup_path(cgrp, buf, buflen);
1661 } else { 1706 } else {
1662 /* if no hierarchy exists, everyone is in "/" */ 1707 /* if no hierarchy exists, everyone is in "/" */
1663 if (strlcpy(buf, "/", buflen) < buflen) 1708 if (strlcpy(buf, "/", buflen) < buflen)
1664 path = buf; 1709 path = buf;
1665 } 1710 }
1666 1711
1667 up_read(&css_set_rwsem); 1712 up_read(&css_set_rwsem);
1668 mutex_unlock(&cgroup_mutex); 1713 mutex_unlock(&cgroup_mutex);
1669 return path; 1714 return path;
1670 } 1715 }
1671 EXPORT_SYMBOL_GPL(task_cgroup_path); 1716 EXPORT_SYMBOL_GPL(task_cgroup_path);
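A minimal, hypothetical caller sketch (not part of this patch; buffer size and message are arbitrary):

	char *buf = kmalloc(PATH_MAX, GFP_KERNEL);

	if (buf) {
		/* may sleep: task_cgroup_path() takes cgroup_mutex */
		char *path = task_cgroup_path(current, buf, PATH_MAX);

		if (path)
			pr_info("first-hierarchy cgroup of %d: %s\n",
				task_pid_nr(current), path);
		kfree(buf);
	}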
1672 1717
1673 /* used to track tasks and other necessary states during migration */ 1718 /* used to track tasks and other necessary states during migration */
1674 struct cgroup_taskset { 1719 struct cgroup_taskset {
1675 /* the src and dst cset list running through cset->mg_node */ 1720 /* the src and dst cset list running through cset->mg_node */
1676 struct list_head src_csets; 1721 struct list_head src_csets;
1677 struct list_head dst_csets; 1722 struct list_head dst_csets;
1678 1723
1679 /* 1724 /*
1680 * Fields for cgroup_taskset_*() iteration. 1725 * Fields for cgroup_taskset_*() iteration.
1681 * 1726 *
1682 * Before migration is committed, the target migration tasks are on 1727 * Before migration is committed, the target migration tasks are on
1683 * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of 1728 * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
1684 * the csets on ->dst_csets. ->csets points to either ->src_csets 1729 * the csets on ->dst_csets. ->csets points to either ->src_csets
1685 * or ->dst_csets depending on whether migration is committed. 1730 * or ->dst_csets depending on whether migration is committed.
1686 * 1731 *
1687 * ->cur_cset and ->cur_task point to the current task position 1732 * ->cur_cset and ->cur_task point to the current task position
1688 * during iteration. 1733 * during iteration.
1689 */ 1734 */
1690 struct list_head *csets; 1735 struct list_head *csets;
1691 struct css_set *cur_cset; 1736 struct css_set *cur_cset;
1692 struct task_struct *cur_task; 1737 struct task_struct *cur_task;
1693 }; 1738 };
1694 1739
1695 /** 1740 /**
1696 * cgroup_taskset_first - reset taskset and return the first task 1741 * cgroup_taskset_first - reset taskset and return the first task
1697 * @tset: taskset of interest 1742 * @tset: taskset of interest
1698 * 1743 *
1699 * @tset iteration is initialized and the first task is returned. 1744 * @tset iteration is initialized and the first task is returned.
1700 */ 1745 */
1701 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) 1746 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1702 { 1747 {
1703 tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); 1748 tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
1704 tset->cur_task = NULL; 1749 tset->cur_task = NULL;
1705 1750
1706 return cgroup_taskset_next(tset); 1751 return cgroup_taskset_next(tset);
1707 } 1752 }
1708 1753
1709 /** 1754 /**
1710 * cgroup_taskset_next - iterate to the next task in taskset 1755 * cgroup_taskset_next - iterate to the next task in taskset
1711 * @tset: taskset of interest 1756 * @tset: taskset of interest
1712 * 1757 *
1713 * Return the next task in @tset. Iteration must have been initialized 1758 * Return the next task in @tset. Iteration must have been initialized
1714 * with cgroup_taskset_first(). 1759 * with cgroup_taskset_first().
1715 */ 1760 */
1716 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) 1761 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1717 { 1762 {
1718 struct css_set *cset = tset->cur_cset; 1763 struct css_set *cset = tset->cur_cset;
1719 struct task_struct *task = tset->cur_task; 1764 struct task_struct *task = tset->cur_task;
1720 1765
1721 while (&cset->mg_node != tset->csets) { 1766 while (&cset->mg_node != tset->csets) {
1722 if (!task) 1767 if (!task)
1723 task = list_first_entry(&cset->mg_tasks, 1768 task = list_first_entry(&cset->mg_tasks,
1724 struct task_struct, cg_list); 1769 struct task_struct, cg_list);
1725 else 1770 else
1726 task = list_next_entry(task, cg_list); 1771 task = list_next_entry(task, cg_list);
1727 1772
1728 if (&task->cg_list != &cset->mg_tasks) { 1773 if (&task->cg_list != &cset->mg_tasks) {
1729 tset->cur_cset = cset; 1774 tset->cur_cset = cset;
1730 tset->cur_task = task; 1775 tset->cur_task = task;
1731 return task; 1776 return task;
1732 } 1777 }
1733 1778
1734 cset = list_next_entry(cset, mg_node); 1779 cset = list_next_entry(cset, mg_node);
1735 task = NULL; 1780 task = NULL;
1736 } 1781 }
1737 1782
1738 return NULL; 1783 return NULL;
1739 } 1784 }
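For reference (illustrative only, not part of this patch), a controller method handed a taskset would typically walk it with the two helpers above:

	struct task_struct *task;

	/* visit every task being migrated in @tset */
	for (task = cgroup_taskset_first(tset); task;
	     task = cgroup_taskset_next(tset)) {
		/* per-task can_attach()/attach() work goes here */
	}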
1740 1785
1741 /** 1786 /**
1742 * cgroup_task_migrate - move a task from one cgroup to another. 1787 * cgroup_task_migrate - move a task from one cgroup to another.
1743 * @old_cgrp: the cgroup @tsk is being migrated from 1788 * @old_cgrp: the cgroup @tsk is being migrated from
1744 * @tsk: the task being migrated 1789 * @tsk: the task being migrated
1745 * @new_cset: the new css_set @tsk is being attached to 1790 * @new_cset: the new css_set @tsk is being attached to
1746 * 1791 *
1747 * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked. 1792 * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
1748 */ 1793 */
1749 static void cgroup_task_migrate(struct cgroup *old_cgrp, 1794 static void cgroup_task_migrate(struct cgroup *old_cgrp,
1750 struct task_struct *tsk, 1795 struct task_struct *tsk,
1751 struct css_set *new_cset) 1796 struct css_set *new_cset)
1752 { 1797 {
1753 struct css_set *old_cset; 1798 struct css_set *old_cset;
1754 1799
1755 lockdep_assert_held(&cgroup_mutex); 1800 lockdep_assert_held(&cgroup_mutex);
1756 lockdep_assert_held(&css_set_rwsem); 1801 lockdep_assert_held(&css_set_rwsem);
1757 1802
1758 /* 1803 /*
1759 * We are synchronized through threadgroup_lock() against PF_EXITING 1804 * We are synchronized through threadgroup_lock() against PF_EXITING
1760 * setting such that we can't race against cgroup_exit() changing the 1805 * setting such that we can't race against cgroup_exit() changing the
1761 * css_set to init_css_set and dropping the old one. 1806 * css_set to init_css_set and dropping the old one.
1762 */ 1807 */
1763 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1808 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1764 old_cset = task_css_set(tsk); 1809 old_cset = task_css_set(tsk);
1765 1810
1766 get_css_set(new_cset); 1811 get_css_set(new_cset);
1767 rcu_assign_pointer(tsk->cgroups, new_cset); 1812 rcu_assign_pointer(tsk->cgroups, new_cset);
1768 1813
1769 /* 1814 /*
1770 * Use move_tail so that cgroup_taskset_first() still returns the 1815 * Use move_tail so that cgroup_taskset_first() still returns the
1771 * leader after migration. This works because cgroup_migrate() 1816 * leader after migration. This works because cgroup_migrate()
1772 * ensures that the dst_cset of the leader is the first on the 1817 * ensures that the dst_cset of the leader is the first on the
1773 * tset's dst_csets list. 1818 * tset's dst_csets list.
1774 */ 1819 */
1775 list_move_tail(&tsk->cg_list, &new_cset->mg_tasks); 1820 list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
1776 1821
1777 /* 1822 /*
1778 * We just gained a reference on old_cset by taking it from the 1823 * We just gained a reference on old_cset by taking it from the
1779 * task. As trading it for new_cset is protected by cgroup_mutex, 1824 * task. As trading it for new_cset is protected by cgroup_mutex,
1780 * we're safe to drop it here; it will be freed under RCU. 1825 * we're safe to drop it here; it will be freed under RCU.
1781 */ 1826 */
1782 set_bit(CGRP_RELEASABLE, &old_cgrp->flags); 1827 set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
1783 put_css_set_locked(old_cset, false); 1828 put_css_set_locked(old_cset, false);
1784 } 1829 }
1785 1830
1786 /** 1831 /**
1787 * cgroup_migrate_finish - cleanup after attach 1832 * cgroup_migrate_finish - cleanup after attach
1788 * @preloaded_csets: list of preloaded css_sets 1833 * @preloaded_csets: list of preloaded css_sets
1789 * 1834 *
1790 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See 1835 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
1791 * those functions for details. 1836 * those functions for details.
1792 */ 1837 */
1793 static void cgroup_migrate_finish(struct list_head *preloaded_csets) 1838 static void cgroup_migrate_finish(struct list_head *preloaded_csets)
1794 { 1839 {
1795 struct css_set *cset, *tmp_cset; 1840 struct css_set *cset, *tmp_cset;
1796 1841
1797 lockdep_assert_held(&cgroup_mutex); 1842 lockdep_assert_held(&cgroup_mutex);
1798 1843
1799 down_write(&css_set_rwsem); 1844 down_write(&css_set_rwsem);
1800 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { 1845 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
1801 cset->mg_src_cgrp = NULL; 1846 cset->mg_src_cgrp = NULL;
1802 cset->mg_dst_cset = NULL; 1847 cset->mg_dst_cset = NULL;
1803 list_del_init(&cset->mg_preload_node); 1848 list_del_init(&cset->mg_preload_node);
1804 put_css_set_locked(cset, false); 1849 put_css_set_locked(cset, false);
1805 } 1850 }
1806 up_write(&css_set_rwsem); 1851 up_write(&css_set_rwsem);
1807 } 1852 }
1808 1853
1809 /** 1854 /**
1810 * cgroup_migrate_add_src - add a migration source css_set 1855 * cgroup_migrate_add_src - add a migration source css_set
1811 * @src_cset: the source css_set to add 1856 * @src_cset: the source css_set to add
1812 * @dst_cgrp: the destination cgroup 1857 * @dst_cgrp: the destination cgroup
1813 * @preloaded_csets: list of preloaded css_sets 1858 * @preloaded_csets: list of preloaded css_sets
1814 * 1859 *
1815 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin 1860 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
1816 * @src_cset and add it to @preloaded_csets, which should later be cleaned 1861 * @src_cset and add it to @preloaded_csets, which should later be cleaned
1817 * up by cgroup_migrate_finish(). 1862 * up by cgroup_migrate_finish().
1818 * 1863 *
1819 * This function may be called without holding threadgroup_lock even if the 1864 * This function may be called without holding threadgroup_lock even if the
1820 * target is a process. Threads may be created and destroyed but as long 1865 * target is a process. Threads may be created and destroyed but as long
1821 * as cgroup_mutex is not dropped, no new css_set can be put into play and 1866 * as cgroup_mutex is not dropped, no new css_set can be put into play and
1822 * the preloaded css_sets are guaranteed to cover all migrations. 1867 * the preloaded css_sets are guaranteed to cover all migrations.
1823 */ 1868 */
1824 static void cgroup_migrate_add_src(struct css_set *src_cset, 1869 static void cgroup_migrate_add_src(struct css_set *src_cset,
1825 struct cgroup *dst_cgrp, 1870 struct cgroup *dst_cgrp,
1826 struct list_head *preloaded_csets) 1871 struct list_head *preloaded_csets)
1827 { 1872 {
1828 struct cgroup *src_cgrp; 1873 struct cgroup *src_cgrp;
1829 1874
1830 lockdep_assert_held(&cgroup_mutex); 1875 lockdep_assert_held(&cgroup_mutex);
1831 lockdep_assert_held(&css_set_rwsem); 1876 lockdep_assert_held(&css_set_rwsem);
1832 1877
1833 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); 1878 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
1834 1879
1835 /* nothing to do if this cset already belongs to the cgroup */ 1880 /* nothing to do if this cset already belongs to the cgroup */
1836 if (src_cgrp == dst_cgrp) 1881 if (src_cgrp == dst_cgrp)
1837 return; 1882 return;
1838 1883
1839 if (!list_empty(&src_cset->mg_preload_node)) 1884 if (!list_empty(&src_cset->mg_preload_node))
1840 return; 1885 return;
1841 1886
1842 WARN_ON(src_cset->mg_src_cgrp); 1887 WARN_ON(src_cset->mg_src_cgrp);
1843 WARN_ON(!list_empty(&src_cset->mg_tasks)); 1888 WARN_ON(!list_empty(&src_cset->mg_tasks));
1844 WARN_ON(!list_empty(&src_cset->mg_node)); 1889 WARN_ON(!list_empty(&src_cset->mg_node));
1845 1890
1846 src_cset->mg_src_cgrp = src_cgrp; 1891 src_cset->mg_src_cgrp = src_cgrp;
1847 get_css_set(src_cset); 1892 get_css_set(src_cset);
1848 list_add(&src_cset->mg_preload_node, preloaded_csets); 1893 list_add(&src_cset->mg_preload_node, preloaded_csets);
1849 } 1894 }
1850 1895
1851 /** 1896 /**
1852 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration 1897 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
1853 * @dst_cgrp: the destination cgroup 1898 * @dst_cgrp: the destination cgroup
1854 * @preloaded_csets: list of preloaded source css_sets 1899 * @preloaded_csets: list of preloaded source css_sets
1855 * 1900 *
1856 * Tasks are about to be moved to @dst_cgrp and all the source css_sets 1901 * Tasks are about to be moved to @dst_cgrp and all the source css_sets
1857 * have been preloaded to @preloaded_csets. This function looks up and 1902 * have been preloaded to @preloaded_csets. This function looks up and
1858 * pins all destination css_sets, links each to its source, and puts them on 1903 * pins all destination css_sets, links each to its source, and puts them on
1859 * @preloaded_csets. 1904 * @preloaded_csets.
1860 * 1905 *
1861 * This function must be called after cgroup_migrate_add_src() has been 1906 * This function must be called after cgroup_migrate_add_src() has been
1862 * called on each migration source css_set. After migration is performed 1907 * called on each migration source css_set. After migration is performed
1863 * using cgroup_migrate(), cgroup_migrate_finish() must be called on 1908 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
1864 * @preloaded_csets. 1909 * @preloaded_csets.
1865 */ 1910 */
1866 static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, 1911 static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1867 struct list_head *preloaded_csets) 1912 struct list_head *preloaded_csets)
1868 { 1913 {
1869 LIST_HEAD(csets); 1914 LIST_HEAD(csets);
1870 struct css_set *src_cset; 1915 struct css_set *src_cset;
1871 1916
1872 lockdep_assert_held(&cgroup_mutex); 1917 lockdep_assert_held(&cgroup_mutex);
1873 1918
1874 /* look up the dst cset for each src cset and link it to src */ 1919 /* look up the dst cset for each src cset and link it to src */
1875 list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { 1920 list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) {
1876 struct css_set *dst_cset; 1921 struct css_set *dst_cset;
1877 1922
1878 dst_cset = find_css_set(src_cset, dst_cgrp); 1923 dst_cset = find_css_set(src_cset, dst_cgrp);
1879 if (!dst_cset) 1924 if (!dst_cset)
1880 goto err; 1925 goto err;
1881 1926
1882 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); 1927 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
1883 src_cset->mg_dst_cset = dst_cset; 1928 src_cset->mg_dst_cset = dst_cset;
1884 1929
1885 if (list_empty(&dst_cset->mg_preload_node)) 1930 if (list_empty(&dst_cset->mg_preload_node))
1886 list_add(&dst_cset->mg_preload_node, &csets); 1931 list_add(&dst_cset->mg_preload_node, &csets);
1887 else 1932 else
1888 put_css_set(dst_cset, false); 1933 put_css_set(dst_cset, false);
1889 } 1934 }
1890 1935
1891 list_splice(&csets, preloaded_csets); 1936 list_splice(&csets, preloaded_csets);
1892 return 0; 1937 return 0;
1893 err: 1938 err:
1894 cgroup_migrate_finish(&csets); 1939 cgroup_migrate_finish(&csets);
1895 return -ENOMEM; 1940 return -ENOMEM;
1896 } 1941 }
1897 1942
1898 /** 1943 /**
1899 * cgroup_migrate - migrate a process or task to a cgroup 1944 * cgroup_migrate - migrate a process or task to a cgroup
1900 * @cgrp: the destination cgroup 1945 * @cgrp: the destination cgroup
1901 * @leader: the leader of the process or the task to migrate 1946 * @leader: the leader of the process or the task to migrate
1902 * @threadgroup: whether @leader points to the whole process or a single task 1947 * @threadgroup: whether @leader points to the whole process or a single task
1903 * 1948 *
1904 * Migrate a process or task denoted by @leader to @cgrp. If migrating a 1949 * Migrate a process or task denoted by @leader to @cgrp. If migrating a
1905 * process, the caller must be holding threadgroup_lock of @leader. The 1950 * process, the caller must be holding threadgroup_lock of @leader. The
1906 * caller is also responsible for invoking cgroup_migrate_add_src() and 1951 * caller is also responsible for invoking cgroup_migrate_add_src() and
1907 * cgroup_migrate_prepare_dst() on the targets before invoking this 1952 * cgroup_migrate_prepare_dst() on the targets before invoking this
1908 * function and following up with cgroup_migrate_finish(). 1953 * function and following up with cgroup_migrate_finish().
1909 * 1954 *
1910 * As long as a controller's ->can_attach() doesn't fail, this function is 1955 * As long as a controller's ->can_attach() doesn't fail, this function is
1911 * guaranteed to succeed. This means that, excluding ->can_attach() 1956 * guaranteed to succeed. This means that, excluding ->can_attach()
1912 * failure, when migrating multiple targets, the success or failure can be 1957 * failure, when migrating multiple targets, the success or failure can be
1913 * decided for all targets by invoking cgroup_migrate_prepare_dst() before 1958 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
1914 * actually starting the migration. 1959 * actually starting the migration.
1915 */ 1960 */
1916 static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, 1961 static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1917 bool threadgroup) 1962 bool threadgroup)
1918 { 1963 {
1919 struct cgroup_taskset tset = { 1964 struct cgroup_taskset tset = {
1920 .src_csets = LIST_HEAD_INIT(tset.src_csets), 1965 .src_csets = LIST_HEAD_INIT(tset.src_csets),
1921 .dst_csets = LIST_HEAD_INIT(tset.dst_csets), 1966 .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
1922 .csets = &tset.src_csets, 1967 .csets = &tset.src_csets,
1923 }; 1968 };
1924 struct cgroup_subsys_state *css, *failed_css = NULL; 1969 struct cgroup_subsys_state *css, *failed_css = NULL;
1925 struct css_set *cset, *tmp_cset; 1970 struct css_set *cset, *tmp_cset;
1926 struct task_struct *task, *tmp_task; 1971 struct task_struct *task, *tmp_task;
1927 int i, ret; 1972 int i, ret;
1928 1973
1929 /* 1974 /*
1930 * Prevent freeing of tasks while we take a snapshot. Tasks that are 1975 * Prevent freeing of tasks while we take a snapshot. Tasks that are
1931 * already PF_EXITING could be freed from underneath us unless we 1976 * already PF_EXITING could be freed from underneath us unless we
1932 * take an rcu_read_lock. 1977 * take an rcu_read_lock.
1933 */ 1978 */
1934 down_write(&css_set_rwsem); 1979 down_write(&css_set_rwsem);
1935 rcu_read_lock(); 1980 rcu_read_lock();
1936 task = leader; 1981 task = leader;
1937 do { 1982 do {
1938 /* @task either already exited or can't exit until the end */ 1983 /* @task either already exited or can't exit until the end */
1939 if (task->flags & PF_EXITING) 1984 if (task->flags & PF_EXITING)
1940 goto next; 1985 goto next;
1941 1986
1942 /* leave @task alone if post_fork() hasn't linked it yet */ 1987 /* leave @task alone if post_fork() hasn't linked it yet */
1943 if (list_empty(&task->cg_list)) 1988 if (list_empty(&task->cg_list))
1944 goto next; 1989 goto next;
1945 1990
1946 cset = task_css_set(task); 1991 cset = task_css_set(task);
1947 if (!cset->mg_src_cgrp) 1992 if (!cset->mg_src_cgrp)
1948 goto next; 1993 goto next;
1949 1994
1950 /* 1995 /*
1951 * cgroup_taskset_first() must always return the leader. 1996 * cgroup_taskset_first() must always return the leader.
1952 * Take care to avoid disturbing the ordering. 1997 * Take care to avoid disturbing the ordering.
1953 */ 1998 */
1954 list_move_tail(&task->cg_list, &cset->mg_tasks); 1999 list_move_tail(&task->cg_list, &cset->mg_tasks);
1955 if (list_empty(&cset->mg_node)) 2000 if (list_empty(&cset->mg_node))
1956 list_add_tail(&cset->mg_node, &tset.src_csets); 2001 list_add_tail(&cset->mg_node, &tset.src_csets);
1957 if (list_empty(&cset->mg_dst_cset->mg_node)) 2002 if (list_empty(&cset->mg_dst_cset->mg_node))
1958 list_move_tail(&cset->mg_dst_cset->mg_node, 2003 list_move_tail(&cset->mg_dst_cset->mg_node,
1959 &tset.dst_csets); 2004 &tset.dst_csets);
1960 next: 2005 next:
1961 if (!threadgroup) 2006 if (!threadgroup)
1962 break; 2007 break;
1963 } while_each_thread(leader, task); 2008 } while_each_thread(leader, task);
1964 rcu_read_unlock(); 2009 rcu_read_unlock();
1965 up_write(&css_set_rwsem); 2010 up_write(&css_set_rwsem);
1966 2011
1967 /* methods shouldn't be called if no task is actually migrating */ 2012 /* methods shouldn't be called if no task is actually migrating */
1968 if (list_empty(&tset.src_csets)) 2013 if (list_empty(&tset.src_csets))
1969 return 0; 2014 return 0;
1970 2015
1971 /* check that we can legitimately attach to the cgroup */ 2016 /* check that we can legitimately attach to the cgroup */
1972 for_each_css(css, i, cgrp) { 2017 for_each_e_css(css, i, cgrp) {
1973 if (css->ss->can_attach) { 2018 if (css->ss->can_attach) {
1974 ret = css->ss->can_attach(css, &tset); 2019 ret = css->ss->can_attach(css, &tset);
1975 if (ret) { 2020 if (ret) {
1976 failed_css = css; 2021 failed_css = css;
1977 goto out_cancel_attach; 2022 goto out_cancel_attach;
1978 } 2023 }
1979 } 2024 }
1980 } 2025 }
1981 2026
1982 /* 2027 /*
1983 * Now that we're guaranteed success, proceed to move all tasks to 2028 * Now that we're guaranteed success, proceed to move all tasks to
1984 * the new cgroup. There are no failure cases after here, so this 2029 * the new cgroup. There are no failure cases after here, so this
1985 * is the commit point. 2030 * is the commit point.
1986 */ 2031 */
1987 down_write(&css_set_rwsem); 2032 down_write(&css_set_rwsem);
1988 list_for_each_entry(cset, &tset.src_csets, mg_node) { 2033 list_for_each_entry(cset, &tset.src_csets, mg_node) {
1989 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) 2034 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
1990 cgroup_task_migrate(cset->mg_src_cgrp, task, 2035 cgroup_task_migrate(cset->mg_src_cgrp, task,
1991 cset->mg_dst_cset); 2036 cset->mg_dst_cset);
1992 } 2037 }
1993 up_write(&css_set_rwsem); 2038 up_write(&css_set_rwsem);
1994 2039
1995 /* 2040 /*
1996 * Migration is committed, all target tasks are now on dst_csets. 2041 * Migration is committed, all target tasks are now on dst_csets.
1997 * Nothing is sensitive to fork() after this point. Notify 2042 * Nothing is sensitive to fork() after this point. Notify
1998 * controllers that migration is complete. 2043 * controllers that migration is complete.
1999 */ 2044 */
2000 tset.csets = &tset.dst_csets; 2045 tset.csets = &tset.dst_csets;
2001 2046
2002 for_each_css(css, i, cgrp) 2047 for_each_e_css(css, i, cgrp)
2003 if (css->ss->attach) 2048 if (css->ss->attach)
2004 css->ss->attach(css, &tset); 2049 css->ss->attach(css, &tset);
2005 2050
2006 ret = 0; 2051 ret = 0;
2007 goto out_release_tset; 2052 goto out_release_tset;
2008 2053
2009 out_cancel_attach: 2054 out_cancel_attach:
2010 for_each_css(css, i, cgrp) { 2055 for_each_e_css(css, i, cgrp) {
2011 if (css == failed_css) 2056 if (css == failed_css)
2012 break; 2057 break;
2013 if (css->ss->cancel_attach) 2058 if (css->ss->cancel_attach)
2014 css->ss->cancel_attach(css, &tset); 2059 css->ss->cancel_attach(css, &tset);
2015 } 2060 }
2016 out_release_tset: 2061 out_release_tset:
2017 down_write(&css_set_rwsem); 2062 down_write(&css_set_rwsem);
2018 list_splice_init(&tset.dst_csets, &tset.src_csets); 2063 list_splice_init(&tset.dst_csets, &tset.src_csets);
2019 list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) { 2064 list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
2020 list_splice_tail_init(&cset->mg_tasks, &cset->tasks); 2065 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2021 list_del_init(&cset->mg_node); 2066 list_del_init(&cset->mg_node);
2022 } 2067 }
2023 up_write(&css_set_rwsem); 2068 up_write(&css_set_rwsem);
2024 return ret; 2069 return ret;
2025 } 2070 }
2026 2071
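/*
 * Illustrative sketch (not part of this commit): how the migration
 * helpers above are meant to be sequenced for a single task.  The
 * function name example_migrate_task() is hypothetical; the in-tree
 * user of this exact pattern is cgroup_attach_task() below.  Assumes
 * the caller already holds cgroup_mutex (and threadgroup_lock of the
 * leader when migrating a whole process).
 */
static int __maybe_unused example_migrate_task(struct cgroup *dst_cgrp,
					       struct task_struct *task)
{
	LIST_HEAD(preloaded_csets);
	int ret;

	/* pin the source css_set; css_set_rwsem must be held */
	down_read(&css_set_rwsem);
	cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &preloaded_csets);
	up_read(&css_set_rwsem);

	/* look up and pin all destination css_sets, then commit */
	ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
	if (!ret)
		ret = cgroup_migrate(dst_cgrp, task, false);

	/* always undo the preloading, on both success and failure */
	cgroup_migrate_finish(&preloaded_csets);
	return ret;
}
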
2027 /** 2072 /**
2028 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup 2073 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
2029 * @dst_cgrp: the cgroup to attach to 2074 * @dst_cgrp: the cgroup to attach to
2030 * @leader: the task or the leader of the threadgroup to be attached 2075 * @leader: the task or the leader of the threadgroup to be attached
2031 * @threadgroup: attach the whole threadgroup? 2076 * @threadgroup: attach the whole threadgroup?
2032 * 2077 *
2033 * Call holding cgroup_mutex and threadgroup_lock of @leader. 2078 * Call holding cgroup_mutex and threadgroup_lock of @leader.
2034 */ 2079 */
2035 static int cgroup_attach_task(struct cgroup *dst_cgrp, 2080 static int cgroup_attach_task(struct cgroup *dst_cgrp,
2036 struct task_struct *leader, bool threadgroup) 2081 struct task_struct *leader, bool threadgroup)
2037 { 2082 {
2038 LIST_HEAD(preloaded_csets); 2083 LIST_HEAD(preloaded_csets);
2039 struct task_struct *task; 2084 struct task_struct *task;
2040 int ret; 2085 int ret;
2041 2086
2042 /* look up all src csets */ 2087 /* look up all src csets */
2043 down_read(&css_set_rwsem); 2088 down_read(&css_set_rwsem);
2044 rcu_read_lock(); 2089 rcu_read_lock();
2045 task = leader; 2090 task = leader;
2046 do { 2091 do {
2047 cgroup_migrate_add_src(task_css_set(task), dst_cgrp, 2092 cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
2048 &preloaded_csets); 2093 &preloaded_csets);
2049 if (!threadgroup) 2094 if (!threadgroup)
2050 break; 2095 break;
2051 } while_each_thread(leader, task); 2096 } while_each_thread(leader, task);
2052 rcu_read_unlock(); 2097 rcu_read_unlock();
2053 up_read(&css_set_rwsem); 2098 up_read(&css_set_rwsem);
2054 2099
2055 /* prepare dst csets and commit */ 2100 /* prepare dst csets and commit */
2056 ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); 2101 ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
2057 if (!ret) 2102 if (!ret)
2058 ret = cgroup_migrate(dst_cgrp, leader, threadgroup); 2103 ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
2059 2104
2060 cgroup_migrate_finish(&preloaded_csets); 2105 cgroup_migrate_finish(&preloaded_csets);
2061 return ret; 2106 return ret;
2062 } 2107 }
2063 2108
2064 /* 2109 /*
2065 * Find the task_struct of the task to attach by vpid and pass it along to the 2110 * Find the task_struct of the task to attach by vpid and pass it along to the
2066 * function to attach either it or all tasks in its threadgroup. Will lock 2111 * function to attach either it or all tasks in its threadgroup. Will lock
2067 * cgroup_mutex and threadgroup. 2112 * cgroup_mutex and threadgroup.
2068 */ 2113 */
2069 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2114 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2070 { 2115 {
2071 struct task_struct *tsk; 2116 struct task_struct *tsk;
2072 const struct cred *cred = current_cred(), *tcred; 2117 const struct cred *cred = current_cred(), *tcred;
2073 int ret; 2118 int ret;
2074 2119
2075 if (!cgroup_lock_live_group(cgrp)) 2120 if (!cgroup_lock_live_group(cgrp))
2076 return -ENODEV; 2121 return -ENODEV;
2077 2122
2078 retry_find_task: 2123 retry_find_task:
2079 rcu_read_lock(); 2124 rcu_read_lock();
2080 if (pid) { 2125 if (pid) {
2081 tsk = find_task_by_vpid(pid); 2126 tsk = find_task_by_vpid(pid);
2082 if (!tsk) { 2127 if (!tsk) {
2083 rcu_read_unlock(); 2128 rcu_read_unlock();
2084 ret = -ESRCH; 2129 ret = -ESRCH;
2085 goto out_unlock_cgroup; 2130 goto out_unlock_cgroup;
2086 } 2131 }
2087 /* 2132 /*
2088 * even if we're attaching all tasks in the thread group, we 2133 * even if we're attaching all tasks in the thread group, we
2089 * only need to check permissions on one of them. 2134 * only need to check permissions on one of them.
2090 */ 2135 */
2091 tcred = __task_cred(tsk); 2136 tcred = __task_cred(tsk);
2092 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && 2137 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
2093 !uid_eq(cred->euid, tcred->uid) && 2138 !uid_eq(cred->euid, tcred->uid) &&
2094 !uid_eq(cred->euid, tcred->suid)) { 2139 !uid_eq(cred->euid, tcred->suid)) {
2095 rcu_read_unlock(); 2140 rcu_read_unlock();
2096 ret = -EACCES; 2141 ret = -EACCES;
2097 goto out_unlock_cgroup; 2142 goto out_unlock_cgroup;
2098 } 2143 }
2099 } else 2144 } else
2100 tsk = current; 2145 tsk = current;
2101 2146
2102 if (threadgroup) 2147 if (threadgroup)
2103 tsk = tsk->group_leader; 2148 tsk = tsk->group_leader;
2104 2149
2105 /* 2150 /*
2106 * Workqueue threads may acquire PF_NO_SETAFFINITY and become 2151 * Workqueue threads may acquire PF_NO_SETAFFINITY and become
2107 * trapped in a cpuset, or an RT worker may be born in a cgroup 2152 * trapped in a cpuset, or an RT worker may be born in a cgroup
2108 * with no rt_runtime allocated. Just say no. 2153 * with no rt_runtime allocated. Just say no.
2109 */ 2154 */
2110 if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { 2155 if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
2111 ret = -EINVAL; 2156 ret = -EINVAL;
2112 rcu_read_unlock(); 2157 rcu_read_unlock();
2113 goto out_unlock_cgroup; 2158 goto out_unlock_cgroup;
2114 } 2159 }
2115 2160
2116 get_task_struct(tsk); 2161 get_task_struct(tsk);
2117 rcu_read_unlock(); 2162 rcu_read_unlock();
2118 2163
2119 threadgroup_lock(tsk); 2164 threadgroup_lock(tsk);
2120 if (threadgroup) { 2165 if (threadgroup) {
2121 if (!thread_group_leader(tsk)) { 2166 if (!thread_group_leader(tsk)) {
2122 /* 2167 /*
2123 * a race with de_thread from another thread's exec() 2168 * a race with de_thread from another thread's exec()
2124 * may strip us of our leadership, if this happens, 2169 * may strip us of our leadership, if this happens,
2125 * there is no choice but to throw this task away and 2170 * there is no choice but to throw this task away and
2126 * try again; this is 2171 * try again; this is
2127 * "double-double-toil-and-trouble-check locking". 2172 * "double-double-toil-and-trouble-check locking".
2128 */ 2173 */
2129 threadgroup_unlock(tsk); 2174 threadgroup_unlock(tsk);
2130 put_task_struct(tsk); 2175 put_task_struct(tsk);
2131 goto retry_find_task; 2176 goto retry_find_task;
2132 } 2177 }
2133 } 2178 }
2134 2179
2135 ret = cgroup_attach_task(cgrp, tsk, threadgroup); 2180 ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2136 2181
2137 threadgroup_unlock(tsk); 2182 threadgroup_unlock(tsk);
2138 2183
2139 put_task_struct(tsk); 2184 put_task_struct(tsk);
2140 out_unlock_cgroup: 2185 out_unlock_cgroup:
2141 mutex_unlock(&cgroup_mutex); 2186 mutex_unlock(&cgroup_mutex);
2142 return ret; 2187 return ret;
2143 } 2188 }
2144 2189
2145 /** 2190 /**
2146 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' 2191 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2147 * @from: attach to all cgroups of a given task 2192 * @from: attach to all cgroups of a given task
2148 * @tsk: the task to be attached 2193 * @tsk: the task to be attached
2149 */ 2194 */
2150 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) 2195 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2151 { 2196 {
2152 struct cgroup_root *root; 2197 struct cgroup_root *root;
2153 int retval = 0; 2198 int retval = 0;
2154 2199
2155 mutex_lock(&cgroup_mutex); 2200 mutex_lock(&cgroup_mutex);
2156 for_each_root(root) { 2201 for_each_root(root) {
2157 struct cgroup *from_cgrp; 2202 struct cgroup *from_cgrp;
2158 2203
2159 if (root == &cgrp_dfl_root) 2204 if (root == &cgrp_dfl_root)
2160 continue; 2205 continue;
2161 2206
2162 down_read(&css_set_rwsem); 2207 down_read(&css_set_rwsem);
2163 from_cgrp = task_cgroup_from_root(from, root); 2208 from_cgrp = task_cgroup_from_root(from, root);
2164 up_read(&css_set_rwsem); 2209 up_read(&css_set_rwsem);
2165 2210
2166 retval = cgroup_attach_task(from_cgrp, tsk, false); 2211 retval = cgroup_attach_task(from_cgrp, tsk, false);
2167 if (retval) 2212 if (retval)
2168 break; 2213 break;
2169 } 2214 }
2170 mutex_unlock(&cgroup_mutex); 2215 mutex_unlock(&cgroup_mutex);
2171 2216
2172 return retval; 2217 return retval;
2173 } 2218 }
2174 EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 2219 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
2175 2220
2176 static int cgroup_tasks_write(struct cgroup_subsys_state *css, 2221 static int cgroup_tasks_write(struct cgroup_subsys_state *css,
2177 struct cftype *cft, u64 pid) 2222 struct cftype *cft, u64 pid)
2178 { 2223 {
2179 return attach_task_by_pid(css->cgroup, pid, false); 2224 return attach_task_by_pid(css->cgroup, pid, false);
2180 } 2225 }
2181 2226
2182 static int cgroup_procs_write(struct cgroup_subsys_state *css, 2227 static int cgroup_procs_write(struct cgroup_subsys_state *css,
2183 struct cftype *cft, u64 tgid) 2228 struct cftype *cft, u64 tgid)
2184 { 2229 {
2185 return attach_task_by_pid(css->cgroup, tgid, true); 2230 return attach_task_by_pid(css->cgroup, tgid, true);
2186 } 2231 }
2187 2232
2188 static int cgroup_release_agent_write(struct cgroup_subsys_state *css, 2233 static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2189 struct cftype *cft, char *buffer) 2234 struct cftype *cft, char *buffer)
2190 { 2235 {
2191 struct cgroup_root *root = css->cgroup->root; 2236 struct cgroup_root *root = css->cgroup->root;
2192 2237
2193 BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); 2238 BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX);
2194 if (!cgroup_lock_live_group(css->cgroup)) 2239 if (!cgroup_lock_live_group(css->cgroup))
2195 return -ENODEV; 2240 return -ENODEV;
2196 spin_lock(&release_agent_path_lock); 2241 spin_lock(&release_agent_path_lock);
2197 strlcpy(root->release_agent_path, buffer, 2242 strlcpy(root->release_agent_path, buffer,
2198 sizeof(root->release_agent_path)); 2243 sizeof(root->release_agent_path));
2199 spin_unlock(&release_agent_path_lock); 2244 spin_unlock(&release_agent_path_lock);
2200 mutex_unlock(&cgroup_mutex); 2245 mutex_unlock(&cgroup_mutex);
2201 return 0; 2246 return 0;
2202 } 2247 }
2203 2248
2204 static int cgroup_release_agent_show(struct seq_file *seq, void *v) 2249 static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2205 { 2250 {
2206 struct cgroup *cgrp = seq_css(seq)->cgroup; 2251 struct cgroup *cgrp = seq_css(seq)->cgroup;
2207 2252
2208 if (!cgroup_lock_live_group(cgrp)) 2253 if (!cgroup_lock_live_group(cgrp))
2209 return -ENODEV; 2254 return -ENODEV;
2210 seq_puts(seq, cgrp->root->release_agent_path); 2255 seq_puts(seq, cgrp->root->release_agent_path);
2211 seq_putc(seq, '\n'); 2256 seq_putc(seq, '\n');
2212 mutex_unlock(&cgroup_mutex); 2257 mutex_unlock(&cgroup_mutex);
2213 return 0; 2258 return 0;
2214 } 2259 }
2215 2260
2216 static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) 2261 static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2217 { 2262 {
2218 struct cgroup *cgrp = seq_css(seq)->cgroup; 2263 struct cgroup *cgrp = seq_css(seq)->cgroup;
2219 2264
2220 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); 2265 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2221 return 0; 2266 return 0;
2222 } 2267 }
2223 2268
2224 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, 2269 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2225 size_t nbytes, loff_t off) 2270 size_t nbytes, loff_t off)
2226 { 2271 {
2227 struct cgroup *cgrp = of->kn->parent->priv; 2272 struct cgroup *cgrp = of->kn->parent->priv;
2228 struct cftype *cft = of->kn->priv; 2273 struct cftype *cft = of->kn->priv;
2229 struct cgroup_subsys_state *css; 2274 struct cgroup_subsys_state *css;
2230 int ret; 2275 int ret;
2231 2276
2232 /* 2277 /*
2233 * kernfs guarantees that a file isn't deleted with operations in 2278 * kernfs guarantees that a file isn't deleted with operations in
2234 * flight, which means that the matching css is and stays alive and 2279 * flight, which means that the matching css is and stays alive and
2235 * doesn't need to be pinned. The RCU locking is not necessary 2280 * doesn't need to be pinned. The RCU locking is not necessary
2236 * either. It's just for the convenience of using cgroup_css(). 2281 * either. It's just for the convenience of using cgroup_css().
2237 */ 2282 */
2238 rcu_read_lock(); 2283 rcu_read_lock();
2239 css = cgroup_css(cgrp, cft->ss); 2284 css = cgroup_css(cgrp, cft->ss);
2240 rcu_read_unlock(); 2285 rcu_read_unlock();
2241 2286
2242 if (cft->write_string) { 2287 if (cft->write_string) {
2243 ret = cft->write_string(css, cft, strstrip(buf)); 2288 ret = cft->write_string(css, cft, strstrip(buf));
2244 } else if (cft->write_u64) { 2289 } else if (cft->write_u64) {
2245 unsigned long long v; 2290 unsigned long long v;
2246 ret = kstrtoull(buf, 0, &v); 2291 ret = kstrtoull(buf, 0, &v);
2247 if (!ret) 2292 if (!ret)
2248 ret = cft->write_u64(css, cft, v); 2293 ret = cft->write_u64(css, cft, v);
2249 } else if (cft->write_s64) { 2294 } else if (cft->write_s64) {
2250 long long v; 2295 long long v;
2251 ret = kstrtoll(buf, 0, &v); 2296 ret = kstrtoll(buf, 0, &v);
2252 if (!ret) 2297 if (!ret)
2253 ret = cft->write_s64(css, cft, v); 2298 ret = cft->write_s64(css, cft, v);
2254 } else if (cft->trigger) { 2299 } else if (cft->trigger) {
2255 ret = cft->trigger(css, (unsigned int)cft->private); 2300 ret = cft->trigger(css, (unsigned int)cft->private);
2256 } else { 2301 } else {
2257 ret = -EINVAL; 2302 ret = -EINVAL;
2258 } 2303 }
2259 2304
2260 return ret ?: nbytes; 2305 return ret ?: nbytes;
2261 } 2306 }
2262 2307
2263 static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) 2308 static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2264 { 2309 {
2265 return seq_cft(seq)->seq_start(seq, ppos); 2310 return seq_cft(seq)->seq_start(seq, ppos);
2266 } 2311 }
2267 2312
2268 static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) 2313 static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2269 { 2314 {
2270 return seq_cft(seq)->seq_next(seq, v, ppos); 2315 return seq_cft(seq)->seq_next(seq, v, ppos);
2271 } 2316 }
2272 2317
2273 static void cgroup_seqfile_stop(struct seq_file *seq, void *v) 2318 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2274 { 2319 {
2275 seq_cft(seq)->seq_stop(seq, v); 2320 seq_cft(seq)->seq_stop(seq, v);
2276 } 2321 }
2277 2322
2278 static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2323 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2279 { 2324 {
2280 struct cftype *cft = seq_cft(m); 2325 struct cftype *cft = seq_cft(m);
2281 struct cgroup_subsys_state *css = seq_css(m); 2326 struct cgroup_subsys_state *css = seq_css(m);
2282 2327
2283 if (cft->seq_show) 2328 if (cft->seq_show)
2284 return cft->seq_show(m, arg); 2329 return cft->seq_show(m, arg);
2285 2330
2286 if (cft->read_u64) 2331 if (cft->read_u64)
2287 seq_printf(m, "%llu\n", cft->read_u64(css, cft)); 2332 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
2288 else if (cft->read_s64) 2333 else if (cft->read_s64)
2289 seq_printf(m, "%lld\n", cft->read_s64(css, cft)); 2334 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
2290 else 2335 else
2291 return -EINVAL; 2336 return -EINVAL;
2292 return 0; 2337 return 0;
2293 } 2338 }
2294 2339
2295 static struct kernfs_ops cgroup_kf_single_ops = { 2340 static struct kernfs_ops cgroup_kf_single_ops = {
2296 .atomic_write_len = PAGE_SIZE, 2341 .atomic_write_len = PAGE_SIZE,
2297 .write = cgroup_file_write, 2342 .write = cgroup_file_write,
2298 .seq_show = cgroup_seqfile_show, 2343 .seq_show = cgroup_seqfile_show,
2299 }; 2344 };
2300 2345
2301 static struct kernfs_ops cgroup_kf_ops = { 2346 static struct kernfs_ops cgroup_kf_ops = {
2302 .atomic_write_len = PAGE_SIZE, 2347 .atomic_write_len = PAGE_SIZE,
2303 .write = cgroup_file_write, 2348 .write = cgroup_file_write,
2304 .seq_start = cgroup_seqfile_start, 2349 .seq_start = cgroup_seqfile_start,
2305 .seq_next = cgroup_seqfile_next, 2350 .seq_next = cgroup_seqfile_next,
2306 .seq_stop = cgroup_seqfile_stop, 2351 .seq_stop = cgroup_seqfile_stop,
2307 .seq_show = cgroup_seqfile_show, 2352 .seq_show = cgroup_seqfile_show,
2308 }; 2353 };
2309 2354
2310 /* 2355 /*
2311 * cgroup_rename - Only allow simple rename of directories in place. 2356 * cgroup_rename - Only allow simple rename of directories in place.
2312 */ 2357 */
2313 static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, 2358 static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2314 const char *new_name_str) 2359 const char *new_name_str)
2315 { 2360 {
2316 struct cgroup *cgrp = kn->priv; 2361 struct cgroup *cgrp = kn->priv;
2317 int ret; 2362 int ret;
2318 2363
2319 if (kernfs_type(kn) != KERNFS_DIR) 2364 if (kernfs_type(kn) != KERNFS_DIR)
2320 return -ENOTDIR; 2365 return -ENOTDIR;
2321 if (kn->parent != new_parent) 2366 if (kn->parent != new_parent)
2322 return -EIO; 2367 return -EIO;
2323 2368
2324 /* 2369 /*
2325 * This isn't a proper migration and its usefulness is very 2370 * This isn't a proper migration and its usefulness is very
2326 * limited. Disallow if sane_behavior. 2371 * limited. Disallow if sane_behavior.
2327 */ 2372 */
2328 if (cgroup_sane_behavior(cgrp)) 2373 if (cgroup_sane_behavior(cgrp))
2329 return -EPERM; 2374 return -EPERM;
2330 2375
2331 /* 2376 /*
2332 * We're gonna grab cgroup_tree_mutex which nests outside kernfs 2377 * We're gonna grab cgroup_tree_mutex which nests outside kernfs
2333 * active_ref. kernfs_rename() doesn't require active_ref 2378 * active_ref. kernfs_rename() doesn't require active_ref
2334 * protection. Break them before grabbing cgroup_tree_mutex. 2379 * protection. Break them before grabbing cgroup_tree_mutex.
2335 */ 2380 */
2336 kernfs_break_active_protection(new_parent); 2381 kernfs_break_active_protection(new_parent);
2337 kernfs_break_active_protection(kn); 2382 kernfs_break_active_protection(kn);
2338 2383
2339 mutex_lock(&cgroup_tree_mutex); 2384 mutex_lock(&cgroup_tree_mutex);
2340 mutex_lock(&cgroup_mutex); 2385 mutex_lock(&cgroup_mutex);
2341 2386
2342 ret = kernfs_rename(kn, new_parent, new_name_str); 2387 ret = kernfs_rename(kn, new_parent, new_name_str);
2343 2388
2344 mutex_unlock(&cgroup_mutex); 2389 mutex_unlock(&cgroup_mutex);
2345 mutex_unlock(&cgroup_tree_mutex); 2390 mutex_unlock(&cgroup_tree_mutex);
2346 2391
2347 kernfs_unbreak_active_protection(kn); 2392 kernfs_unbreak_active_protection(kn);
2348 kernfs_unbreak_active_protection(new_parent); 2393 kernfs_unbreak_active_protection(new_parent);
2349 return ret; 2394 return ret;
2350 } 2395 }
2351 2396
2352 /* set uid and gid of cgroup dirs and files to that of the creator */ 2397 /* set uid and gid of cgroup dirs and files to that of the creator */
2353 static int cgroup_kn_set_ugid(struct kernfs_node *kn) 2398 static int cgroup_kn_set_ugid(struct kernfs_node *kn)
2354 { 2399 {
2355 struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, 2400 struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
2356 .ia_uid = current_fsuid(), 2401 .ia_uid = current_fsuid(),
2357 .ia_gid = current_fsgid(), }; 2402 .ia_gid = current_fsgid(), };
2358 2403
2359 if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && 2404 if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
2360 gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) 2405 gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
2361 return 0; 2406 return 0;
2362 2407
2363 return kernfs_setattr(kn, &iattr); 2408 return kernfs_setattr(kn, &iattr);
2364 } 2409 }
2365 2410
2366 static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) 2411 static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2367 { 2412 {
2368 char name[CGROUP_FILE_NAME_MAX]; 2413 char name[CGROUP_FILE_NAME_MAX];
2369 struct kernfs_node *kn; 2414 struct kernfs_node *kn;
2370 struct lock_class_key *key = NULL; 2415 struct lock_class_key *key = NULL;
2371 int ret; 2416 int ret;
2372 2417
2373 #ifdef CONFIG_DEBUG_LOCK_ALLOC 2418 #ifdef CONFIG_DEBUG_LOCK_ALLOC
2374 key = &cft->lockdep_key; 2419 key = &cft->lockdep_key;
2375 #endif 2420 #endif
2376 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), 2421 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
2377 cgroup_file_mode(cft), 0, cft->kf_ops, cft, 2422 cgroup_file_mode(cft), 0, cft->kf_ops, cft,
2378 NULL, false, key); 2423 NULL, false, key);
2379 if (IS_ERR(kn)) 2424 if (IS_ERR(kn))
2380 return PTR_ERR(kn); 2425 return PTR_ERR(kn);
2381 2426
2382 ret = cgroup_kn_set_ugid(kn); 2427 ret = cgroup_kn_set_ugid(kn);
2383 if (ret) 2428 if (ret)
2384 kernfs_remove(kn); 2429 kernfs_remove(kn);
2385 return ret; 2430 return ret;
2386 } 2431 }
2387 2432
2388 /** 2433 /**
2389 * cgroup_addrm_files - add or remove files in a cgroup directory 2434 * cgroup_addrm_files - add or remove files in a cgroup directory
2390 * @cgrp: the target cgroup 2435 * @cgrp: the target cgroup
2391 * @cfts: array of cftypes to be added 2436 * @cfts: array of cftypes to be added
2392 * @is_add: whether to add or remove 2437 * @is_add: whether to add or remove
2393 * 2438 *
2394 * Depending on @is_add, add or remove files defined by @cfts on @cgrp. 2439 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
2395 * For removals, this function never fails. If addition fails, this 2440 * For removals, this function never fails. If addition fails, this
2396 * function doesn't remove files already added. The caller is responsible 2441 * function doesn't remove files already added. The caller is responsible
2397 * for cleaning up. 2442 * for cleaning up.
2398 */ 2443 */
2399 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 2444 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2400 bool is_add) 2445 bool is_add)
2401 { 2446 {
2402 struct cftype *cft; 2447 struct cftype *cft;
2403 int ret; 2448 int ret;
2404 2449
2405 lockdep_assert_held(&cgroup_tree_mutex); 2450 lockdep_assert_held(&cgroup_tree_mutex);
2406 2451
2407 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2452 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2408 /* does cft->flags tell us to skip this file on @cgrp? */ 2453 /* does cft->flags tell us to skip this file on @cgrp? */
2409 if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) 2454 if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
2410 continue; 2455 continue;
2411 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) 2456 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2412 continue; 2457 continue;
2413 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2458 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2414 continue; 2459 continue;
2415 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) 2460 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2416 continue; 2461 continue;
2417 2462
2418 if (is_add) { 2463 if (is_add) {
2419 ret = cgroup_add_file(cgrp, cft); 2464 ret = cgroup_add_file(cgrp, cft);
2420 if (ret) { 2465 if (ret) {
2421 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", 2466 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
2422 cft->name, ret); 2467 cft->name, ret);
2423 return ret; 2468 return ret;
2424 } 2469 }
2425 } else { 2470 } else {
2426 cgroup_rm_file(cgrp, cft); 2471 cgroup_rm_file(cgrp, cft);
2427 } 2472 }
2428 } 2473 }
2429 return 0; 2474 return 0;
2430 } 2475 }
2431 2476
2432 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) 2477 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
2433 { 2478 {
2434 LIST_HEAD(pending); 2479 LIST_HEAD(pending);
2435 struct cgroup_subsys *ss = cfts[0].ss; 2480 struct cgroup_subsys *ss = cfts[0].ss;
2436 struct cgroup *root = &ss->root->cgrp; 2481 struct cgroup *root = &ss->root->cgrp;
2437 struct cgroup_subsys_state *css; 2482 struct cgroup_subsys_state *css;
2438 int ret = 0; 2483 int ret = 0;
2439 2484
2440 lockdep_assert_held(&cgroup_tree_mutex); 2485 lockdep_assert_held(&cgroup_tree_mutex);
2441 2486
2442 /* add/rm files for all cgroups created before */ 2487 /* add/rm files for all cgroups created before */
2443 css_for_each_descendant_pre(css, cgroup_css(root, ss)) { 2488 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
2444 struct cgroup *cgrp = css->cgroup; 2489 struct cgroup *cgrp = css->cgroup;
2445 2490
2446 if (cgroup_is_dead(cgrp)) 2491 if (cgroup_is_dead(cgrp))
2447 continue; 2492 continue;
2448 2493
2449 ret = cgroup_addrm_files(cgrp, cfts, is_add); 2494 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2450 if (ret) 2495 if (ret)
2451 break; 2496 break;
2452 } 2497 }
2453 2498
2454 if (is_add && !ret) 2499 if (is_add && !ret)
2455 kernfs_activate(root->kn); 2500 kernfs_activate(root->kn);
2456 return ret; 2501 return ret;
2457 } 2502 }
2458 2503
2459 static void cgroup_exit_cftypes(struct cftype *cfts) 2504 static void cgroup_exit_cftypes(struct cftype *cfts)
2460 { 2505 {
2461 struct cftype *cft; 2506 struct cftype *cft;
2462 2507
2463 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2508 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2464 /* free copy for custom atomic_write_len, see init_cftypes() */ 2509 /* free copy for custom atomic_write_len, see init_cftypes() */
2465 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) 2510 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
2466 kfree(cft->kf_ops); 2511 kfree(cft->kf_ops);
2467 cft->kf_ops = NULL; 2512 cft->kf_ops = NULL;
2468 cft->ss = NULL; 2513 cft->ss = NULL;
2469 } 2514 }
2470 } 2515 }
2471 2516
2472 static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2517 static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2473 { 2518 {
2474 struct cftype *cft; 2519 struct cftype *cft;
2475 2520
2476 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2521 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2477 struct kernfs_ops *kf_ops; 2522 struct kernfs_ops *kf_ops;
2478 2523
2479 WARN_ON(cft->ss || cft->kf_ops); 2524 WARN_ON(cft->ss || cft->kf_ops);
2480 2525
2481 if (cft->seq_start) 2526 if (cft->seq_start)
2482 kf_ops = &cgroup_kf_ops; 2527 kf_ops = &cgroup_kf_ops;
2483 else 2528 else
2484 kf_ops = &cgroup_kf_single_ops; 2529 kf_ops = &cgroup_kf_single_ops;
2485 2530
2486 /* 2531 /*
2487 * Ugh... if @cft wants a custom max_write_len, we need to 2532 * Ugh... if @cft wants a custom max_write_len, we need to
2488 * make a copy of kf_ops to set its atomic_write_len. 2533 * make a copy of kf_ops to set its atomic_write_len.
2489 */ 2534 */
2490 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { 2535 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
2491 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); 2536 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
2492 if (!kf_ops) { 2537 if (!kf_ops) {
2493 cgroup_exit_cftypes(cfts); 2538 cgroup_exit_cftypes(cfts);
2494 return -ENOMEM; 2539 return -ENOMEM;
2495 } 2540 }
2496 kf_ops->atomic_write_len = cft->max_write_len; 2541 kf_ops->atomic_write_len = cft->max_write_len;
2497 } 2542 }
2498 2543
2499 cft->kf_ops = kf_ops; 2544 cft->kf_ops = kf_ops;
2500 cft->ss = ss; 2545 cft->ss = ss;
2501 } 2546 }
2502 2547
2503 return 0; 2548 return 0;
2504 } 2549 }
2505 2550
2506 static int cgroup_rm_cftypes_locked(struct cftype *cfts) 2551 static int cgroup_rm_cftypes_locked(struct cftype *cfts)
2507 { 2552 {
2508 lockdep_assert_held(&cgroup_tree_mutex); 2553 lockdep_assert_held(&cgroup_tree_mutex);
2509 2554
2510 if (!cfts || !cfts[0].ss) 2555 if (!cfts || !cfts[0].ss)
2511 return -ENOENT; 2556 return -ENOENT;
2512 2557
2513 list_del(&cfts->node); 2558 list_del(&cfts->node);
2514 cgroup_apply_cftypes(cfts, false); 2559 cgroup_apply_cftypes(cfts, false);
2515 cgroup_exit_cftypes(cfts); 2560 cgroup_exit_cftypes(cfts);
2516 return 0; 2561 return 0;
2517 } 2562 }
2518 2563
2519 /** 2564 /**
2520 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem 2565 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2521 * @cfts: zero-length name terminated array of cftypes 2566 * @cfts: zero-length name terminated array of cftypes
2522 * 2567 *
2523 * Unregister @cfts. Files described by @cfts are removed from all 2568 * Unregister @cfts. Files described by @cfts are removed from all
2524 * existing cgroups and all future cgroups won't have them either. This 2569 * existing cgroups and all future cgroups won't have them either. This
2525 * function can be called anytime whether @cfts' subsys is attached or not. 2570 * function can be called anytime whether @cfts' subsys is attached or not.
2526 * 2571 *
2527 * Returns 0 on successful unregistration, -ENOENT if @cfts is not 2572 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2528 * registered. 2573 * registered.
2529 */ 2574 */
2530 int cgroup_rm_cftypes(struct cftype *cfts) 2575 int cgroup_rm_cftypes(struct cftype *cfts)
2531 { 2576 {
2532 int ret; 2577 int ret;
2533 2578
2534 mutex_lock(&cgroup_tree_mutex); 2579 mutex_lock(&cgroup_tree_mutex);
2535 ret = cgroup_rm_cftypes_locked(cfts); 2580 ret = cgroup_rm_cftypes_locked(cfts);
2536 mutex_unlock(&cgroup_tree_mutex); 2581 mutex_unlock(&cgroup_tree_mutex);
2537 return ret; 2582 return ret;
2538 } 2583 }
2539 2584
2540 /** 2585 /**
2541 * cgroup_add_cftypes - add an array of cftypes to a subsystem 2586 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2542 * @ss: target cgroup subsystem 2587 * @ss: target cgroup subsystem
2543 * @cfts: zero-length name terminated array of cftypes 2588 * @cfts: zero-length name terminated array of cftypes
2544 * 2589 *
2545 * Register @cfts to @ss. Files described by @cfts are created for all 2590 * Register @cfts to @ss. Files described by @cfts are created for all
2546 * existing cgroups to which @ss is attached and all future cgroups will 2591 * existing cgroups to which @ss is attached and all future cgroups will
2547 * have them too. This function can be called anytime whether @ss is 2592 * have them too. This function can be called anytime whether @ss is
2548 * attached or not. 2593 * attached or not.
2549 * 2594 *
2550 * Returns 0 on successful registration, -errno on failure. Note that this 2595 * Returns 0 on successful registration, -errno on failure. Note that this
2551 * function currently returns 0 as long as @cfts registration is successful 2596 * function currently returns 0 as long as @cfts registration is successful
2552 * even if some file creation attempts on existing cgroups fail. 2597 * even if some file creation attempts on existing cgroups fail.
2553 */ 2598 */
2554 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2599 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2555 { 2600 {
2556 int ret; 2601 int ret;
2557 2602
2558 if (!cfts || cfts[0].name[0] == '\0') 2603 if (!cfts || cfts[0].name[0] == '\0')
2559 return 0; 2604 return 0;
2560 2605
2561 ret = cgroup_init_cftypes(ss, cfts); 2606 ret = cgroup_init_cftypes(ss, cfts);
2562 if (ret) 2607 if (ret)
2563 return ret; 2608 return ret;
2564 2609
2565 mutex_lock(&cgroup_tree_mutex); 2610 mutex_lock(&cgroup_tree_mutex);
2566 2611
2567 list_add_tail(&cfts->node, &ss->cfts); 2612 list_add_tail(&cfts->node, &ss->cfts);
2568 ret = cgroup_apply_cftypes(cfts, true); 2613 ret = cgroup_apply_cftypes(cfts, true);
2569 if (ret) 2614 if (ret)
2570 cgroup_rm_cftypes_locked(cfts); 2615 cgroup_rm_cftypes_locked(cfts);
2571 2616
2572 mutex_unlock(&cgroup_tree_mutex); 2617 mutex_unlock(&cgroup_tree_mutex);
2573 return ret; 2618 return ret;
2574 } 2619 }
2575 2620
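/*
 * Illustrative sketch (not part of this commit): registering a cftype
 * array with cgroup_add_cftypes().  The "example" controller, its files
 * and example_subsys (a hypothetical struct cgroup_subsys) are made up;
 * only the cftype fields, flags and the registration call reflect the
 * interfaces above.
 */
static u64 example_weight_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	return 100;	/* would return the controller's per-css state */
}

static int example_weight_write(struct cgroup_subsys_state *css,
				struct cftype *cft, u64 val)
{
	return val <= 1000 ? 0 : -EINVAL;	/* validate, then store */
}

static struct cftype example_files[] = {
	{
		.name = "weight",
		.flags = CFTYPE_NOT_ON_ROOT,	/* skipped on the root cgroup */
		.read_u64 = example_weight_read,
		.write_u64 = example_weight_write,
	},
	{ }	/* terminating entry: name[0] == '\0' */
};

/* typically called from the controller's init path */
static int __init example_register_files(void)
{
	return cgroup_add_cftypes(&example_subsys, example_files);
}
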
2576 /** 2621 /**
2577 * cgroup_task_count - count the number of tasks in a cgroup. 2622 * cgroup_task_count - count the number of tasks in a cgroup.
2578 * @cgrp: the cgroup in question 2623 * @cgrp: the cgroup in question
2579 * 2624 *
2580 * Return the number of tasks in the cgroup. 2625 * Return the number of tasks in the cgroup.
2581 */ 2626 */
2582 static int cgroup_task_count(const struct cgroup *cgrp) 2627 static int cgroup_task_count(const struct cgroup *cgrp)
2583 { 2628 {
2584 int count = 0; 2629 int count = 0;
2585 struct cgrp_cset_link *link; 2630 struct cgrp_cset_link *link;
2586 2631
2587 down_read(&css_set_rwsem); 2632 down_read(&css_set_rwsem);
2588 list_for_each_entry(link, &cgrp->cset_links, cset_link) 2633 list_for_each_entry(link, &cgrp->cset_links, cset_link)
2589 count += atomic_read(&link->cset->refcount); 2634 count += atomic_read(&link->cset->refcount);
2590 up_read(&css_set_rwsem); 2635 up_read(&css_set_rwsem);
2591 return count; 2636 return count;
2592 } 2637 }
2593 2638
2594 /** 2639 /**
2595 * css_next_child - find the next child of a given css 2640 * css_next_child - find the next child of a given css
2596 * @pos_css: the current position (%NULL to initiate traversal) 2641 * @pos_css: the current position (%NULL to initiate traversal)
2597 * @parent_css: css whose children to walk 2642 * @parent_css: css whose children to walk
2598 * 2643 *
2599 * This function returns the next child of @parent_css and should be called 2644 * This function returns the next child of @parent_css and should be called
2600 * under either cgroup_mutex or RCU read lock. The only requirement is 2645 * under either cgroup_mutex or RCU read lock. The only requirement is
2601 * that @parent_css and @pos_css are accessible. The next sibling is 2646 * that @parent_css and @pos_css are accessible. The next sibling is
2602 * guaranteed to be returned regardless of their states. 2647 * guaranteed to be returned regardless of their states.
2603 */ 2648 */
2604 struct cgroup_subsys_state * 2649 struct cgroup_subsys_state *
2605 css_next_child(struct cgroup_subsys_state *pos_css, 2650 css_next_child(struct cgroup_subsys_state *pos_css,
2606 struct cgroup_subsys_state *parent_css) 2651 struct cgroup_subsys_state *parent_css)
2607 { 2652 {
2608 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; 2653 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
2609 struct cgroup *cgrp = parent_css->cgroup; 2654 struct cgroup *cgrp = parent_css->cgroup;
2610 struct cgroup *next; 2655 struct cgroup *next;
2611 2656
2612 cgroup_assert_mutexes_or_rcu_locked(); 2657 cgroup_assert_mutexes_or_rcu_locked();
2613 2658
2614 /* 2659 /*
2615 * @pos could already have been removed. Once a cgroup is removed, 2660 * @pos could already have been removed. Once a cgroup is removed,
2616 * its ->sibling.next is no longer updated when its next sibling 2661 * its ->sibling.next is no longer updated when its next sibling
2617 * changes. As CGRP_DEAD assertion is serialized and happens 2662 * changes. As CGRP_DEAD assertion is serialized and happens
2618 * before the cgroup is taken off the ->sibling list, if we see it 2663 * before the cgroup is taken off the ->sibling list, if we see it
2619 * unasserted, it's guaranteed that the next sibling hasn't 2664 * unasserted, it's guaranteed that the next sibling hasn't
2620 * finished its grace period even if it's already removed, and thus 2665 * finished its grace period even if it's already removed, and thus
2621 * safe to dereference from this RCU critical section. If 2666 * safe to dereference from this RCU critical section. If
2622 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed 2667 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
2623 * to be visible as %true here. 2668 * to be visible as %true here.
2624 * 2669 *
2625 * If @pos is dead, its next pointer can't be dereferenced; 2670 * If @pos is dead, its next pointer can't be dereferenced;
2626 * however, as each cgroup is given a monotonically increasing 2671 * however, as each cgroup is given a monotonically increasing
2627 * unique serial number and always appended to the sibling list, 2672 * unique serial number and always appended to the sibling list,
2628 * the next one can be found by walking the parent's children until 2673 * the next one can be found by walking the parent's children until
2629 * we see a cgroup with higher serial number than @pos's. While 2674 * we see a cgroup with higher serial number than @pos's. While
2630 * this path can be slower, it's taken only when either the current 2675 * this path can be slower, it's taken only when either the current
2631 * cgroup is removed or iteration and removal race. 2676 * cgroup is removed or iteration and removal race.
2632 */ 2677 */
2633 if (!pos) { 2678 if (!pos) {
2634 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); 2679 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
2635 } else if (likely(!cgroup_is_dead(pos))) { 2680 } else if (likely(!cgroup_is_dead(pos))) {
2636 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 2681 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
2637 } else { 2682 } else {
2638 list_for_each_entry_rcu(next, &cgrp->children, sibling) 2683 list_for_each_entry_rcu(next, &cgrp->children, sibling)
2639 if (next->serial_nr > pos->serial_nr) 2684 if (next->serial_nr > pos->serial_nr)
2640 break; 2685 break;
2641 } 2686 }
2642 2687
2643 if (&next->sibling == &cgrp->children) 2688 if (&next->sibling == &cgrp->children)
2644 return NULL; 2689 return NULL;
2645 2690
2646 return cgroup_css(next, parent_css->ss); 2691 return cgroup_css(next, parent_css->ss);
2647 } 2692 }
2648 2693
2649 /** 2694 /**
2650 * css_next_descendant_pre - find the next descendant for pre-order walk 2695 * css_next_descendant_pre - find the next descendant for pre-order walk
2651 * @pos: the current position (%NULL to initiate traversal) 2696 * @pos: the current position (%NULL to initiate traversal)
2652 * @root: css whose descendants to walk 2697 * @root: css whose descendants to walk
2653 * 2698 *
2654 * To be used by css_for_each_descendant_pre(). Find the next descendant 2699 * To be used by css_for_each_descendant_pre(). Find the next descendant
2655 * to visit for pre-order traversal of @root's descendants. @root is 2700 * to visit for pre-order traversal of @root's descendants. @root is
2656 * included in the iteration and the first node to be visited. 2701 * included in the iteration and the first node to be visited.
2657 * 2702 *
2658 * While this function requires cgroup_mutex or RCU read locking, it 2703 * While this function requires cgroup_mutex or RCU read locking, it
2659 * doesn't require the whole traversal to be contained in a single critical 2704 * doesn't require the whole traversal to be contained in a single critical
2660 * section. This function will return the correct next descendant as long 2705 * section. This function will return the correct next descendant as long
2661 * as both @pos and @root are accessible and @pos is a descendant of @root. 2706 * as both @pos and @root are accessible and @pos is a descendant of @root.
2662 */ 2707 */
2663 struct cgroup_subsys_state * 2708 struct cgroup_subsys_state *
2664 css_next_descendant_pre(struct cgroup_subsys_state *pos, 2709 css_next_descendant_pre(struct cgroup_subsys_state *pos,
2665 struct cgroup_subsys_state *root) 2710 struct cgroup_subsys_state *root)
2666 { 2711 {
2667 struct cgroup_subsys_state *next; 2712 struct cgroup_subsys_state *next;
2668 2713
2669 cgroup_assert_mutexes_or_rcu_locked(); 2714 cgroup_assert_mutexes_or_rcu_locked();
2670 2715
2671 /* if first iteration, visit @root */ 2716 /* if first iteration, visit @root */
2672 if (!pos) 2717 if (!pos)
2673 return root; 2718 return root;
2674 2719
2675 /* visit the first child if it exists */ 2720 /* visit the first child if it exists */
2676 next = css_next_child(NULL, pos); 2721 next = css_next_child(NULL, pos);
2677 if (next) 2722 if (next)
2678 return next; 2723 return next;
2679 2724
2680 /* no child, visit my or the closest ancestor's next sibling */ 2725 /* no child, visit my or the closest ancestor's next sibling */
2681 while (pos != root) { 2726 while (pos != root) {
2682 next = css_next_child(pos, css_parent(pos)); 2727 next = css_next_child(pos, css_parent(pos));
2683 if (next) 2728 if (next)
2684 return next; 2729 return next;
2685 pos = css_parent(pos); 2730 pos = css_parent(pos);
2686 } 2731 }
2687 2732
2688 return NULL; 2733 return NULL;
2689 } 2734 }
2690 2735
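/*
 * Illustrative sketch (not part of this commit): a typical pre-order
 * walk using css_for_each_descendant_pre(), which is built on
 * css_next_descendant_pre() above.  example_count_descendants() is a
 * hypothetical helper; the locking rule (cgroup_mutex or RCU read
 * lock) is the one documented above.
 */
static int __maybe_unused
example_count_descendants(struct cgroup_subsys_state *root)
{
	struct cgroup_subsys_state *pos;
	int n = 0;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root) {
		/* @root itself is the first position visited */
		n++;
	}
	rcu_read_unlock();
	return n;
}
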
2691 /** 2736 /**
2692 * css_rightmost_descendant - return the rightmost descendant of a css 2737 * css_rightmost_descendant - return the rightmost descendant of a css
2693 * @pos: css of interest 2738 * @pos: css of interest
2694 * 2739 *
2695 * Return the rightmost descendant of @pos. If there's no descendant, @pos 2740 * Return the rightmost descendant of @pos. If there's no descendant, @pos
2696 * is returned. This can be used during pre-order traversal to skip 2741 * is returned. This can be used during pre-order traversal to skip
2697 * the subtree of @pos. 2742 * the subtree of @pos.
2698 * 2743 *
2699 * While this function requires cgroup_mutex or RCU read locking, it 2744 * While this function requires cgroup_mutex or RCU read locking, it
2700 * doesn't require the whole traversal to be contained in a single critical 2745 * doesn't require the whole traversal to be contained in a single critical
2701 * section. This function will return the correct rightmost descendant as 2746 * section. This function will return the correct rightmost descendant as
2702 * long as @pos is accessible. 2747 * long as @pos is accessible.
2703 */ 2748 */
2704 struct cgroup_subsys_state * 2749 struct cgroup_subsys_state *
2705 css_rightmost_descendant(struct cgroup_subsys_state *pos) 2750 css_rightmost_descendant(struct cgroup_subsys_state *pos)
2706 { 2751 {
2707 struct cgroup_subsys_state *last, *tmp; 2752 struct cgroup_subsys_state *last, *tmp;
2708 2753
2709 cgroup_assert_mutexes_or_rcu_locked(); 2754 cgroup_assert_mutexes_or_rcu_locked();
2710 2755
2711 do { 2756 do {
2712 last = pos; 2757 last = pos;
2713 /* ->prev isn't RCU safe, walk ->next till the end */ 2758 /* ->prev isn't RCU safe, walk ->next till the end */
2714 pos = NULL; 2759 pos = NULL;
2715 css_for_each_child(tmp, last) 2760 css_for_each_child(tmp, last)
2716 pos = tmp; 2761 pos = tmp;
2717 } while (pos); 2762 } while (pos);
2718 2763
2719 return last; 2764 return last;
2720 } 2765 }
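One way css_rightmost_descendant() gets used, sketched hypothetically: pruning a branch in the middle of a pre-order walk by fast-forwarding @pos, so the next iteration resumes outside the skipped subtree. should_skip() and visit() are invented helpers standing in for controller-specific logic.

/* Hypothetical sketch: skip @pos's whole subtree during a pre-order walk. */
static void example_walk_with_pruning(struct cgroup_subsys_state *root)
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root) {
		if (should_skip(pos))			/* hypothetical predicate */
			pos = css_rightmost_descendant(pos);
		else
			visit(pos);			/* hypothetical per-css work */
	}
	rcu_read_unlock();
}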
2721 2766
2722 static struct cgroup_subsys_state * 2767 static struct cgroup_subsys_state *
2723 css_leftmost_descendant(struct cgroup_subsys_state *pos) 2768 css_leftmost_descendant(struct cgroup_subsys_state *pos)
2724 { 2769 {
2725 struct cgroup_subsys_state *last; 2770 struct cgroup_subsys_state *last;
2726 2771
2727 do { 2772 do {
2728 last = pos; 2773 last = pos;
2729 pos = css_next_child(NULL, pos); 2774 pos = css_next_child(NULL, pos);
2730 } while (pos); 2775 } while (pos);
2731 2776
2732 return last; 2777 return last;
2733 } 2778 }
2734 2779
2735 /** 2780 /**
2736 * css_next_descendant_post - find the next descendant for post-order walk 2781 * css_next_descendant_post - find the next descendant for post-order walk
2737 * @pos: the current position (%NULL to initiate traversal) 2782 * @pos: the current position (%NULL to initiate traversal)
2738 * @root: css whose descendants to walk 2783 * @root: css whose descendants to walk
2739 * 2784 *
2740 * To be used by css_for_each_descendant_post(). Find the next descendant 2785 * To be used by css_for_each_descendant_post(). Find the next descendant
2741 * to visit for post-order traversal of @root's descendants. @root is 2786 * to visit for post-order traversal of @root's descendants. @root is
2742 * included in the iteration and the last node to be visited. 2787 * included in the iteration and the last node to be visited.
2743 * 2788 *
2744 * While this function requires cgroup_mutex or RCU read locking, it 2789 * While this function requires cgroup_mutex or RCU read locking, it
2745 * doesn't require the whole traversal to be contained in a single critical 2790 * doesn't require the whole traversal to be contained in a single critical
2746 * section. This function will return the correct next descendant as long 2791 * section. This function will return the correct next descendant as long
2747 * as both @pos and @root are accessible and @pos is a descendant of 2792 * as both @pos and @root are accessible and @pos is a descendant of
2748 * @root. 2793 * @root.
2749 */ 2794 */
2750 struct cgroup_subsys_state * 2795 struct cgroup_subsys_state *
2751 css_next_descendant_post(struct cgroup_subsys_state *pos, 2796 css_next_descendant_post(struct cgroup_subsys_state *pos,
2752 struct cgroup_subsys_state *root) 2797 struct cgroup_subsys_state *root)
2753 { 2798 {
2754 struct cgroup_subsys_state *next; 2799 struct cgroup_subsys_state *next;
2755 2800
2756 cgroup_assert_mutexes_or_rcu_locked(); 2801 cgroup_assert_mutexes_or_rcu_locked();
2757 2802
2758 /* if first iteration, visit leftmost descendant which may be @root */ 2803 /* if first iteration, visit leftmost descendant which may be @root */
2759 if (!pos) 2804 if (!pos)
2760 return css_leftmost_descendant(root); 2805 return css_leftmost_descendant(root);
2761 2806
2762 /* if we visited @root, we're done */ 2807 /* if we visited @root, we're done */
2763 if (pos == root) 2808 if (pos == root)
2764 return NULL; 2809 return NULL;
2765 2810
2766 /* if there's an unvisited sibling, visit its leftmost descendant */ 2811 /* if there's an unvisited sibling, visit its leftmost descendant */
2767 next = css_next_child(pos, css_parent(pos)); 2812 next = css_next_child(pos, css_parent(pos));
2768 if (next) 2813 if (next)
2769 return css_leftmost_descendant(next); 2814 return css_leftmost_descendant(next);
2770 2815
2771 /* no sibling left, visit parent */ 2816 /* no sibling left, visit parent */
2772 return css_parent(pos); 2817 return css_parent(pos);
2773 } 2818 }
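For contrast with the pre-order variant, a hedged sketch of the post-order walk: every child is visited before its parent and @root comes last, which is the natural order for teardown-style work. example_release_state() is a hypothetical, non-sleeping helper.

/* Hypothetical sketch: children are always processed before their parent. */
static void example_teardown_subtree(struct cgroup_subsys_state *root)
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_post(pos, root)
		example_release_state(pos);	/* @root is visited last */
	rcu_read_unlock();
}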
2774 2819
2775 /** 2820 /**
2776 * css_advance_task_iter - advance a task iterator to the next css_set 2821 * css_advance_task_iter - advance a task iterator to the next css_set
2777 * @it: the iterator to advance 2822 * @it: the iterator to advance
2778 * 2823 *
2779 * Advance @it to the next css_set to walk. 2824 * Advance @it to the next css_set to walk.
2780 */ 2825 */
2781 static void css_advance_task_iter(struct css_task_iter *it) 2826 static void css_advance_task_iter(struct css_task_iter *it)
2782 { 2827 {
2783 struct list_head *l = it->cset_link; 2828 struct list_head *l = it->cset_link;
2784 struct cgrp_cset_link *link; 2829 struct cgrp_cset_link *link;
2785 struct css_set *cset; 2830 struct css_set *cset;
2786 2831
2787 /* Advance to the next non-empty css_set */ 2832 /* Advance to the next non-empty css_set */
2788 do { 2833 do {
2789 l = l->next; 2834 l = l->next;
2790 if (l == &it->origin_css->cgroup->cset_links) { 2835 if (l == &it->origin_css->cgroup->cset_links) {
2791 it->cset_link = NULL; 2836 it->cset_link = NULL;
2792 return; 2837 return;
2793 } 2838 }
2794 link = list_entry(l, struct cgrp_cset_link, cset_link); 2839 link = list_entry(l, struct cgrp_cset_link, cset_link);
2795 cset = link->cset; 2840 cset = link->cset;
2796 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); 2841 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
2797 2842
2798 it->cset_link = l; 2843 it->cset_link = l;
2799 2844
2800 if (!list_empty(&cset->tasks)) 2845 if (!list_empty(&cset->tasks))
2801 it->task = cset->tasks.next; 2846 it->task = cset->tasks.next;
2802 else 2847 else
2803 it->task = cset->mg_tasks.next; 2848 it->task = cset->mg_tasks.next;
2804 } 2849 }
2805 2850
2806 /** 2851 /**
2807 * css_task_iter_start - initiate task iteration 2852 * css_task_iter_start - initiate task iteration
2808 * @css: the css to walk tasks of 2853 * @css: the css to walk tasks of
2809 * @it: the task iterator to use 2854 * @it: the task iterator to use
2810 * 2855 *
2811 * Initiate iteration through the tasks of @css. The caller can call 2856 * Initiate iteration through the tasks of @css. The caller can call
2812 * css_task_iter_next() to walk through the tasks until the function 2857 * css_task_iter_next() to walk through the tasks until the function
2813 * returns NULL. On completion of iteration, css_task_iter_end() must be 2858 * returns NULL. On completion of iteration, css_task_iter_end() must be
2814 * called. 2859 * called.
2815 * 2860 *
2816 * Note that this function acquires a lock which is released when the 2861 * Note that this function acquires a lock which is released when the
2817 * iteration finishes. The caller can't sleep while iteration is in 2862 * iteration finishes. The caller can't sleep while iteration is in
2818 * progress. 2863 * progress.
2819 */ 2864 */
2820 void css_task_iter_start(struct cgroup_subsys_state *css, 2865 void css_task_iter_start(struct cgroup_subsys_state *css,
2821 struct css_task_iter *it) 2866 struct css_task_iter *it)
2822 __acquires(css_set_rwsem) 2867 __acquires(css_set_rwsem)
2823 { 2868 {
2824 /* no one should try to iterate before mounting cgroups */ 2869 /* no one should try to iterate before mounting cgroups */
2825 WARN_ON_ONCE(!use_task_css_set_links); 2870 WARN_ON_ONCE(!use_task_css_set_links);
2826 2871
2827 down_read(&css_set_rwsem); 2872 down_read(&css_set_rwsem);
2828 2873
2829 it->origin_css = css; 2874 it->origin_css = css;
2830 it->cset_link = &css->cgroup->cset_links; 2875 it->cset_link = &css->cgroup->cset_links;
2831 2876
2832 css_advance_task_iter(it); 2877 css_advance_task_iter(it);
2833 } 2878 }
2834 2879
2835 /** 2880 /**
2836 * css_task_iter_next - return the next task for the iterator 2881 * css_task_iter_next - return the next task for the iterator
2837 * @it: the task iterator being iterated 2882 * @it: the task iterator being iterated
2838 * 2883 *
2839 * The "next" function for task iteration. @it should have been 2884 * The "next" function for task iteration. @it should have been
2840 * initialized via css_task_iter_start(). Returns NULL when the iteration 2885 * initialized via css_task_iter_start(). Returns NULL when the iteration
2841 * reaches the end. 2886 * reaches the end.
2842 */ 2887 */
2843 struct task_struct *css_task_iter_next(struct css_task_iter *it) 2888 struct task_struct *css_task_iter_next(struct css_task_iter *it)
2844 { 2889 {
2845 struct task_struct *res; 2890 struct task_struct *res;
2846 struct list_head *l = it->task; 2891 struct list_head *l = it->task;
2847 struct cgrp_cset_link *link = list_entry(it->cset_link, 2892 struct cgrp_cset_link *link = list_entry(it->cset_link,
2848 struct cgrp_cset_link, cset_link); 2893 struct cgrp_cset_link, cset_link);
2849 2894
2850 /* If the iterator's cset_link is NULL, we have no tasks */ 2895 /* If the iterator's cset_link is NULL, we have no tasks */
2851 if (!it->cset_link) 2896 if (!it->cset_link)
2852 return NULL; 2897 return NULL;
2853 res = list_entry(l, struct task_struct, cg_list); 2898 res = list_entry(l, struct task_struct, cg_list);
2854 2899
2855 /* 2900 /*
2856 * Advance iterator to find next entry. cset->tasks is consumed 2901 * Advance iterator to find next entry. cset->tasks is consumed
2857 * first and then ->mg_tasks. After ->mg_tasks, we move on to the 2902 * first and then ->mg_tasks. After ->mg_tasks, we move on to the
2858 * next cset. 2903 * next cset.
2859 */ 2904 */
2860 l = l->next; 2905 l = l->next;
2861 2906
2862 if (l == &link->cset->tasks) 2907 if (l == &link->cset->tasks)
2863 l = link->cset->mg_tasks.next; 2908 l = link->cset->mg_tasks.next;
2864 2909
2865 if (l == &link->cset->mg_tasks) 2910 if (l == &link->cset->mg_tasks)
2866 css_advance_task_iter(it); 2911 css_advance_task_iter(it);
2867 else 2912 else
2868 it->task = l; 2913 it->task = l;
2869 2914
2870 return res; 2915 return res;
2871 } 2916 }
2872 2917
2873 /** 2918 /**
2874 * css_task_iter_end - finish task iteration 2919 * css_task_iter_end - finish task iteration
2875 * @it: the task iterator to finish 2920 * @it: the task iterator to finish
2876 * 2921 *
2877 * Finish task iteration started by css_task_iter_start(). 2922 * Finish task iteration started by css_task_iter_start().
2878 */ 2923 */
2879 void css_task_iter_end(struct css_task_iter *it) 2924 void css_task_iter_end(struct css_task_iter *it)
2880 __releases(css_set_rwsem) 2925 __releases(css_set_rwsem)
2881 { 2926 {
2882 up_read(&css_set_rwsem); 2927 up_read(&css_set_rwsem);
2883 } 2928 }
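Putting the three iterator entry points together, a minimal usage sketch with invented names: because css_set_rwsem stays read-held from start to end, the loop body must not sleep.

/* Hypothetical sketch: count the tasks attached to @css. */
static int example_count_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;
	int nr_tasks = 0;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		nr_tasks++;
	css_task_iter_end(&it);

	return nr_tasks;
}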
2884 2929
2885 /** 2930 /**
2886 * cgroup_transfer_tasks - move tasks from one cgroup to another 2931 * cgroup_transfer_tasks - move tasks from one cgroup to another
2887 * @to: cgroup to which the tasks will be moved 2932 * @to: cgroup to which the tasks will be moved
2888 * @from: cgroup in which the tasks currently reside 2933 * @from: cgroup in which the tasks currently reside
2889 * 2934 *
2890 * Locking rules between cgroup_post_fork() and the migration path 2935 * Locking rules between cgroup_post_fork() and the migration path
2891 * guarantee that a child forked while its parent is being migrated is 2936 * guarantee that a child forked while its parent is being migrated is
2892 * either visible in the source cgroup after the parent's migration 2937 * either visible in the source cgroup after the parent's migration
2893 * completes or placed into the target cgroup. No task 2938 * completes or placed into the target cgroup. No task
2894 * can slip out of migration through forking. 2939 * can slip out of migration through forking.
2895 */ 2940 */
2896 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) 2941 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
2897 { 2942 {
2898 LIST_HEAD(preloaded_csets); 2943 LIST_HEAD(preloaded_csets);
2899 struct cgrp_cset_link *link; 2944 struct cgrp_cset_link *link;
2900 struct css_task_iter it; 2945 struct css_task_iter it;
2901 struct task_struct *task; 2946 struct task_struct *task;
2902 int ret; 2947 int ret;
2903 2948
2904 mutex_lock(&cgroup_mutex); 2949 mutex_lock(&cgroup_mutex);
2905 2950
2906 /* all tasks in @from are being moved, all csets are source */ 2951 /* all tasks in @from are being moved, all csets are source */
2907 down_read(&css_set_rwsem); 2952 down_read(&css_set_rwsem);
2908 list_for_each_entry(link, &from->cset_links, cset_link) 2953 list_for_each_entry(link, &from->cset_links, cset_link)
2909 cgroup_migrate_add_src(link->cset, to, &preloaded_csets); 2954 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
2910 up_read(&css_set_rwsem); 2955 up_read(&css_set_rwsem);
2911 2956
2912 ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); 2957 ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
2913 if (ret) 2958 if (ret)
2914 goto out_err; 2959 goto out_err;
2915 2960
2916 /* 2961 /*
2917 * Migrate tasks one-by-one until @from is empty. This fails iff 2962 * Migrate tasks one-by-one until @from is empty. This fails iff
2918 * ->can_attach() fails. 2963 * ->can_attach() fails.
2919 */ 2964 */
2920 do { 2965 do {
2921 css_task_iter_start(&from->dummy_css, &it); 2966 css_task_iter_start(&from->dummy_css, &it);
2922 task = css_task_iter_next(&it); 2967 task = css_task_iter_next(&it);
2923 if (task) 2968 if (task)
2924 get_task_struct(task); 2969 get_task_struct(task);
2925 css_task_iter_end(&it); 2970 css_task_iter_end(&it);
2926 2971
2927 if (task) { 2972 if (task) {
2928 ret = cgroup_migrate(to, task, false); 2973 ret = cgroup_migrate(to, task, false);
2929 put_task_struct(task); 2974 put_task_struct(task);
2930 } 2975 }
2931 } while (task && !ret); 2976 } while (task && !ret);
2932 out_err: 2977 out_err:
2933 cgroup_migrate_finish(&preloaded_csets); 2978 cgroup_migrate_finish(&preloaded_csets);
2934 mutex_unlock(&cgroup_mutex); 2979 mutex_unlock(&cgroup_mutex);
2935 return ret; 2980 return ret;
2936 } 2981 }
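A hedged caller sketch, not an existing call site: a controller reacting to a resource going away could drain a cgroup into its parent with the helper above.

/* Hypothetical sketch: move every task in @cgrp one level up. */
static void example_drain_to_parent(struct cgroup *cgrp)
{
	if (cgrp->parent)
		WARN_ON_ONCE(cgroup_transfer_tasks(cgrp->parent, cgrp));
}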
2937 2982
2938 /* 2983 /*
2939 * Stuff for reading the 'tasks'/'procs' files. 2984 * Stuff for reading the 'tasks'/'procs' files.
2940 * 2985 *
2941 * Reading this file can return large amounts of data if a cgroup has 2986 * Reading this file can return large amounts of data if a cgroup has
2942 * *lots* of attached tasks. So it may need several calls to read(), 2987 * *lots* of attached tasks. So it may need several calls to read(),
2943 * but we cannot guarantee that the information we produce is correct 2988 * but we cannot guarantee that the information we produce is correct
2944 * unless we produce it entirely atomically. 2989 * unless we produce it entirely atomically.
2945 * 2990 *
2946 */ 2991 */
2947 2992
2948 /* which pidlist file are we talking about? */ 2993 /* which pidlist file are we talking about? */
2949 enum cgroup_filetype { 2994 enum cgroup_filetype {
2950 CGROUP_FILE_PROCS, 2995 CGROUP_FILE_PROCS,
2951 CGROUP_FILE_TASKS, 2996 CGROUP_FILE_TASKS,
2952 }; 2997 };
2953 2998
2954 /* 2999 /*
2955 * A pidlist is a list of pids that virtually represents the contents of one 3000 * A pidlist is a list of pids that virtually represents the contents of one
2956 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, 3001 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
2957 * a pair (one each for procs, tasks) for each pid namespace that's relevant 3002 * a pair (one each for procs, tasks) for each pid namespace that's relevant
2958 * to the cgroup. 3003 * to the cgroup.
2959 */ 3004 */
2960 struct cgroup_pidlist { 3005 struct cgroup_pidlist {
2961 /* 3006 /*
2962 * used to find which pidlist is wanted. doesn't change as long as 3007 * used to find which pidlist is wanted. doesn't change as long as
2963 * this particular list stays in the list. 3008 * this particular list stays in the list.
2964 */ 3009 */
2965 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; 3010 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
2966 /* array of xids */ 3011 /* array of xids */
2967 pid_t *list; 3012 pid_t *list;
2968 /* how many elements the above list has */ 3013 /* how many elements the above list has */
2969 int length; 3014 int length;
2970 /* each of these stored in a list by its cgroup */ 3015 /* each of these stored in a list by its cgroup */
2971 struct list_head links; 3016 struct list_head links;
2972 /* pointer to the cgroup we belong to, for list removal purposes */ 3017 /* pointer to the cgroup we belong to, for list removal purposes */
2973 struct cgroup *owner; 3018 struct cgroup *owner;
2974 /* for delayed destruction */ 3019 /* for delayed destruction */
2975 struct delayed_work destroy_dwork; 3020 struct delayed_work destroy_dwork;
2976 }; 3021 };
2977 3022
2978 /* 3023 /*
2979 * The following two functions "fix" the issue where there are more pids 3024 * The following two functions "fix" the issue where there are more pids
2980 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. 3025 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
2981 * TODO: replace with a kernel-wide solution to this problem 3026 * TODO: replace with a kernel-wide solution to this problem
2982 */ 3027 */
2983 #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) 3028 #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
2984 static void *pidlist_allocate(int count) 3029 static void *pidlist_allocate(int count)
2985 { 3030 {
2986 if (PIDLIST_TOO_LARGE(count)) 3031 if (PIDLIST_TOO_LARGE(count))
2987 return vmalloc(count * sizeof(pid_t)); 3032 return vmalloc(count * sizeof(pid_t));
2988 else 3033 else
2989 return kmalloc(count * sizeof(pid_t), GFP_KERNEL); 3034 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
2990 } 3035 }
2991 3036
2992 static void pidlist_free(void *p) 3037 static void pidlist_free(void *p)
2993 { 3038 {
2994 if (is_vmalloc_addr(p)) 3039 if (is_vmalloc_addr(p))
2995 vfree(p); 3040 vfree(p);
2996 else 3041 else
2997 kfree(p); 3042 kfree(p);
2998 } 3043 }
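A standalone userspace model of the size cut-off, assuming 4 KiB pages and a 32-bit pid_t (neither is guaranteed by the kernel code above): the vmalloc fallback engages once a pidlist would span more than two pages, i.e. beyond 2 * 4096 / 4 = 2048 entries.

#include <assert.h>
#include <stddef.h>

#define EXAMPLE_PAGE_SIZE	4096	/* assumption: 4 KiB pages */
#define EXAMPLE_TOO_LARGE(c)	((c) * sizeof(int) > (EXAMPLE_PAGE_SIZE * 2))

int main(void)
{
	assert(!EXAMPLE_TOO_LARGE(2048));	/* still two pages: kmalloc path */
	assert(EXAMPLE_TOO_LARGE(2049));	/* one entry more: vmalloc path */
	return 0;
}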
2999 3044
3000 /* 3045 /*
3001 * Used to destroy all pidlists lingering and waiting for the destroy timer. None 3046 * Used to destroy all pidlists lingering and waiting for the destroy timer. None
3002 * should be left afterwards. 3047 * should be left afterwards.
3003 */ 3048 */
3004 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp) 3049 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
3005 { 3050 {
3006 struct cgroup_pidlist *l, *tmp_l; 3051 struct cgroup_pidlist *l, *tmp_l;
3007 3052
3008 mutex_lock(&cgrp->pidlist_mutex); 3053 mutex_lock(&cgrp->pidlist_mutex);
3009 list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) 3054 list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
3010 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); 3055 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
3011 mutex_unlock(&cgrp->pidlist_mutex); 3056 mutex_unlock(&cgrp->pidlist_mutex);
3012 3057
3013 flush_workqueue(cgroup_pidlist_destroy_wq); 3058 flush_workqueue(cgroup_pidlist_destroy_wq);
3014 BUG_ON(!list_empty(&cgrp->pidlists)); 3059 BUG_ON(!list_empty(&cgrp->pidlists));
3015 } 3060 }
3016 3061
3017 static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) 3062 static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
3018 { 3063 {
3019 struct delayed_work *dwork = to_delayed_work(work); 3064 struct delayed_work *dwork = to_delayed_work(work);
3020 struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, 3065 struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
3021 destroy_dwork); 3066 destroy_dwork);
3022 struct cgroup_pidlist *tofree = NULL; 3067 struct cgroup_pidlist *tofree = NULL;
3023 3068
3024 mutex_lock(&l->owner->pidlist_mutex); 3069 mutex_lock(&l->owner->pidlist_mutex);
3025 3070
3026 /* 3071 /*
3027 * Destroy iff we didn't get queued again. The state won't change 3072 * Destroy iff we didn't get queued again. The state won't change
3028 * as destroy_dwork can only be queued while locked. 3073 * as destroy_dwork can only be queued while locked.
3029 */ 3074 */
3030 if (!delayed_work_pending(dwork)) { 3075 if (!delayed_work_pending(dwork)) {
3031 list_del(&l->links); 3076 list_del(&l->links);
3032 pidlist_free(l->list); 3077 pidlist_free(l->list);
3033 put_pid_ns(l->key.ns); 3078 put_pid_ns(l->key.ns);
3034 tofree = l; 3079 tofree = l;
3035 } 3080 }
3036 3081
3037 mutex_unlock(&l->owner->pidlist_mutex); 3082 mutex_unlock(&l->owner->pidlist_mutex);
3038 kfree(tofree); 3083 kfree(tofree);
3039 } 3084 }
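The pending check above pairs with a simple keep-alive idiom, sketched hypothetically here: a user that still needs the pidlist re-arms the same delayed work under pidlist_mutex, so the destroy handler backs off. The seq_file stop path later in this file does exactly this with CGROUP_PIDLIST_DESTROY_DELAY.

/* Hypothetical sketch: postpone destruction of a pidlist still in use. */
static void example_keep_pidlist_cached(struct cgroup_pidlist *l)
{
	mutex_lock(&l->owner->pidlist_mutex);
	mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
			 CGROUP_PIDLIST_DESTROY_DELAY);
	mutex_unlock(&l->owner->pidlist_mutex);
}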
3040 3085
3041 /* 3086 /*
3042 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries 3087 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3043 * Returns the number of unique elements. 3088 * Returns the number of unique elements.
3044 */ 3089 */
3045 static int pidlist_uniq(pid_t *list, int length) 3090 static int pidlist_uniq(pid_t *list, int length)
3046 { 3091 {
3047 int src, dest = 1; 3092 int src, dest = 1;
3048 3093
3049 /* 3094 /*
3050 * we presume the 0th element is unique, so src starts at 1. trivial 3095 * we presume the 0th element is unique, so src starts at 1. trivial
3051 * edge cases first; no work needs to be done for either 3096 * edge cases first; no work needs to be done for either
3052 */ 3097 */
3053 if (length == 0 || length == 1) 3098 if (length == 0 || length == 1)
3054 return length; 3099 return length;
3055 /* src and dest walk down the list; dest counts unique elements */ 3100 /* src and dest walk down the list; dest counts unique elements */
3056 for (src = 1; src < length; src++) { 3101 for (src = 1; src < length; src++) {
3057 /* find next unique element */ 3102 /* find next unique element */
3058 while (list[src] == list[src-1]) { 3103 while (list[src] == list[src-1]) {
3059 src++; 3104 src++;
3060 if (src == length) 3105 if (src == length)
3061 goto after; 3106 goto after;
3062 } 3107 }
3063 /* dest always points to where the next unique element goes */ 3108 /* dest always points to where the next unique element goes */
3064 list[dest] = list[src]; 3109 list[dest] = list[src];
3065 dest++; 3110 dest++;
3066 } 3111 }
3067 after: 3112 after:
3068 return dest; 3113 return dest;
3069 } 3114 }
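A standalone userspace model of the same in-place deduplication, mirroring rather than reusing the function above; the pid values are invented for the demonstration.

#include <assert.h>

/* In-place dedup of a sorted array; returns the new length. */
static int uniq(int *list, int length)
{
	int src, dest = 1;

	if (length <= 1)
		return length;
	for (src = 1; src < length; src++) {
		while (list[src] == list[src - 1]) {
			if (++src == length)
				goto after;
		}
		list[dest++] = list[src];
	}
after:
	return dest;
}

int main(void)
{
	int pids[] = { 3, 3, 5, 7, 7, 7, 9 };

	assert(uniq(pids, 7) == 4);	/* 3, 5, 7, 9 remain */
	return 0;
}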
3070 3115
3071 /* 3116 /*
3072 * The two pid files - tasks and cgroup.procs - guarantee that the result 3117 * The two pid files - tasks and cgroup.procs - guarantee that the result
3073 * is sorted, which forced this whole pidlist fiasco. As pid order is 3118 * is sorted, which forced this whole pidlist fiasco. As pid order is
3074 * different per namespace, each namespace needs a differently sorted list, 3119 * different per namespace, each namespace needs a differently sorted list,
3075 * making it impossible to use, for example, a single rbtree of member tasks 3120 * making it impossible to use, for example, a single rbtree of member tasks
3076 * sorted by task pointer. As pidlists can be fairly large, allocating one 3121 * sorted by task pointer. As pidlists can be fairly large, allocating one
3077 * per open file is dangerous, so cgroup had to implement a shared pool of 3122 * per open file is dangerous, so cgroup had to implement a shared pool of
3078 * pidlists keyed by cgroup and namespace. 3123 * pidlists keyed by cgroup and namespace.
3079 * 3124 *
3080 * All this extra complexity was caused by the original implementation 3125 * All this extra complexity was caused by the original implementation
3081 * committing to an entirely unnecessary property. In the long term, we 3126 * committing to an entirely unnecessary property. In the long term, we
3082 * want to do away with it. Explicitly scramble sort order if 3127 * want to do away with it. Explicitly scramble sort order if
3083 * sane_behavior so that no such expectation exists in the new interface. 3128 * sane_behavior so that no such expectation exists in the new interface.
3084 * 3129 *
3085 * Scrambling is done by swapping every two consecutive bits, which is 3130 * Scrambling is done by swapping every two consecutive bits, which is
3086 * a non-identity one-to-one mapping that disturbs sort order sufficiently. 3131 * a non-identity one-to-one mapping that disturbs sort order sufficiently.
3087 */ 3132 */
3088 static pid_t pid_fry(pid_t pid) 3133 static pid_t pid_fry(pid_t pid)
3089 { 3134 {
3090 unsigned a = pid & 0x55555555; 3135 unsigned a = pid & 0x55555555;
3091 unsigned b = pid & 0xAAAAAAAA; 3136 unsigned b = pid & 0xAAAAAAAA;
3092 3137
3093 return (a << 1) | (b >> 1); 3138 return (a << 1) | (b >> 1);
3094 } 3139 }
3095 3140
3096 static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) 3141 static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
3097 { 3142 {
3098 if (cgroup_sane_behavior(cgrp)) 3143 if (cgroup_sane_behavior(cgrp))
3099 return pid_fry(pid); 3144 return pid_fry(pid);
3100 else 3145 else
3101 return pid; 3146 return pid;
3102 } 3147 }
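A standalone userspace model of the scrambling, with invented pid values: swapping adjacent bit pairs is its own inverse, so applying the transform twice restores the original pid while a single application visibly breaks numeric ordering.

#include <assert.h>
#include <stdio.h>

static unsigned int fry(unsigned int pid)
{
	unsigned int a = pid & 0x55555555;	/* even-position bits */
	unsigned int b = pid & 0xAAAAAAAA;	/* odd-position bits */

	return (a << 1) | (b >> 1);
}

int main(void)
{
	unsigned int pids[] = { 1, 2, 3, 100, 4096 };

	for (unsigned int i = 0; i < sizeof(pids) / sizeof(pids[0]); i++) {
		printf("%u -> %u\n", pids[i], fry(pids[i]));
		assert(fry(fry(pids[i])) == pids[i]);	/* self-inverse */
	}
	return 0;
}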
3103 3148
3104 static int cmppid(const void *a, const void *b) 3149 static int cmppid(const void *a, const void *b)
3105 { 3150 {
3106 return *(pid_t *)a - *(pid_t *)b; 3151 return *(pid_t *)a - *(pid_t *)b;
3107 } 3152 }
3108 3153
3109 static int fried_cmppid(const void *a, const void *b) 3154 static int fried_cmppid(const void *a, const void *b)
3110 { 3155 {
3111 return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b); 3156 return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
3112 } 3157 }
3113 3158
3114 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, 3159 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3115 enum cgroup_filetype type) 3160 enum cgroup_filetype type)
3116 { 3161 {
3117 struct cgroup_pidlist *l; 3162 struct cgroup_pidlist *l;
3118 /* don't need task_nsproxy() if we're looking at ourself */ 3163 /* don't need task_nsproxy() if we're looking at ourself */
3119 struct pid_namespace *ns = task_active_pid_ns(current); 3164 struct pid_namespace *ns = task_active_pid_ns(current);
3120 3165
3121 lockdep_assert_held(&cgrp->pidlist_mutex); 3166 lockdep_assert_held(&cgrp->pidlist_mutex);
3122 3167
3123 list_for_each_entry(l, &cgrp->pidlists, links) 3168 list_for_each_entry(l, &cgrp->pidlists, links)
3124 if (l->key.type == type && l->key.ns == ns) 3169 if (l->key.type == type && l->key.ns == ns)
3125 return l; 3170 return l;
3126 return NULL; 3171 return NULL;
3127 } 3172 }
3128 3173
3129 /* 3174 /*
3130 * find the appropriate pidlist for our purpose (given procs vs tasks) 3175 * find the appropriate pidlist for our purpose (given procs vs tasks)
3131 * returns with the lock on that pidlist already held, and takes care 3176 * returns with the lock on that pidlist already held, and takes care
3132 * of the use count, or returns NULL with no locks held if we're out of 3177 * of the use count, or returns NULL with no locks held if we're out of
3133 * memory. 3178 * memory.
3134 */ 3179 */
3135 static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, 3180 static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
3136 enum cgroup_filetype type) 3181 enum cgroup_filetype type)
3137 { 3182 {
3138 struct cgroup_pidlist *l; 3183 struct cgroup_pidlist *l;
3139 3184
3140 lockdep_assert_held(&cgrp->pidlist_mutex); 3185 lockdep_assert_held(&cgrp->pidlist_mutex);
3141 3186
3142 l = cgroup_pidlist_find(cgrp, type); 3187 l = cgroup_pidlist_find(cgrp, type);
3143 if (l) 3188 if (l)
3144 return l; 3189 return l;
3145 3190
3146 /* entry not found; create a new one */ 3191 /* entry not found; create a new one */
3147 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 3192 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3148 if (!l) 3193 if (!l)
3149 return l; 3194 return l;
3150 3195
3151 INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); 3196 INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
3152 l->key.type = type; 3197 l->key.type = type;
3153 /* don't need task_nsproxy() if we're looking at ourself */ 3198 /* don't need task_nsproxy() if we're looking at ourself */
3154 l->key.ns = get_pid_ns(task_active_pid_ns(current)); 3199 l->key.ns = get_pid_ns(task_active_pid_ns(current));
3155 l->owner = cgrp; 3200 l->owner = cgrp;
3156 list_add(&l->links, &cgrp->pidlists); 3201 list_add(&l->links, &cgrp->pidlists);
3157 return l; 3202 return l;
3158 } 3203 }
3159 3204
3160 /* 3205 /*
3161 * Load a cgroup's pidarray with either procs' tgids or tasks' pids 3206 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
3162 */ 3207 */
3163 static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, 3208 static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3164 struct cgroup_pidlist **lp) 3209 struct cgroup_pidlist **lp)
3165 { 3210 {
3166 pid_t *array; 3211 pid_t *array;
3167 int length; 3212 int length;
3168 int pid, n = 0; /* used for populating the array */ 3213 int pid, n = 0; /* used for populating the array */
3169 struct css_task_iter it; 3214 struct css_task_iter it;
3170 struct task_struct *tsk; 3215 struct task_struct *tsk;
3171 struct cgroup_pidlist *l; 3216 struct cgroup_pidlist *l;
3172 3217
3173 lockdep_assert_held(&cgrp->pidlist_mutex); 3218 lockdep_assert_held(&cgrp->pidlist_mutex);
3174 3219
3175 /* 3220 /*
3176 * If cgroup gets more users after we read count, we won't have 3221 * If cgroup gets more users after we read count, we won't have
3177 * enough space - tough. This race is indistinguishable to the 3222 * enough space - tough. This race is indistinguishable to the
3178 * caller from the case that the additional cgroup users didn't 3223 * caller from the case that the additional cgroup users didn't
3179 * show up until sometime later on. 3224 * show up until sometime later on.
3180 */ 3225 */
3181 length = cgroup_task_count(cgrp); 3226 length = cgroup_task_count(cgrp);
3182 array = pidlist_allocate(length); 3227 array = pidlist_allocate(length);
3183 if (!array) 3228 if (!array)
3184 return -ENOMEM; 3229 return -ENOMEM;
3185 /* now, populate the array */ 3230 /* now, populate the array */
3186 css_task_iter_start(&cgrp->dummy_css, &it); 3231 css_task_iter_start(&cgrp->dummy_css, &it);
3187 while ((tsk = css_task_iter_next(&it))) { 3232 while ((tsk = css_task_iter_next(&it))) {
3188 if (unlikely(n == length)) 3233 if (unlikely(n == length))
3189 break; 3234 break;
3190 /* get tgid or pid for procs or tasks file respectively */ 3235 /* get tgid or pid for procs or tasks file respectively */
3191 if (type == CGROUP_FILE_PROCS) 3236 if (type == CGROUP_FILE_PROCS)
3192 pid = task_tgid_vnr(tsk); 3237 pid = task_tgid_vnr(tsk);
3193 else 3238 else
3194 pid = task_pid_vnr(tsk); 3239 pid = task_pid_vnr(tsk);
3195 if (pid > 0) /* make sure to only use valid results */ 3240 if (pid > 0) /* make sure to only use valid results */
3196 array[n++] = pid; 3241 array[n++] = pid;
3197 } 3242 }
3198 css_task_iter_end(&it); 3243 css_task_iter_end(&it);
3199 length = n; 3244 length = n;
3200 /* now sort & (if procs) strip out duplicates */ 3245 /* now sort & (if procs) strip out duplicates */
3201 if (cgroup_sane_behavior(cgrp)) 3246 if (cgroup_sane_behavior(cgrp))
3202 sort(array, length, sizeof(pid_t), fried_cmppid, NULL); 3247 sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
3203 else 3248 else
3204 sort(array, length, sizeof(pid_t), cmppid, NULL); 3249 sort(array, length, sizeof(pid_t), cmppid, NULL);
3205 if (type == CGROUP_FILE_PROCS) 3250 if (type == CGROUP_FILE_PROCS)
3206 length = pidlist_uniq(array, length); 3251 length = pidlist_uniq(array, length);
3207 3252
3208 l = cgroup_pidlist_find_create(cgrp, type); 3253 l = cgroup_pidlist_find_create(cgrp, type);
3209 if (!l) { 3254 if (!l) {
3210 mutex_unlock(&cgrp->pidlist_mutex); 3255 mutex_unlock(&cgrp->pidlist_mutex);
3211 pidlist_free(array); 3256 pidlist_free(array);
3212 return -ENOMEM; 3257 return -ENOMEM;
3213 } 3258 }
3214 3259
3215 /* store array, freeing old if necessary */ 3260 /* store array, freeing old if necessary */
3216 pidlist_free(l->list); 3261 pidlist_free(l->list);
3217 l->list = array; 3262 l->list = array;
3218 l->length = length; 3263 l->length = length;
3219 *lp = l; 3264 *lp = l;
3220 return 0; 3265 return 0;
3221 } 3266 }
3222 3267
3223 /** 3268 /**
3224 * cgroupstats_build - build and fill cgroupstats 3269 * cgroupstats_build - build and fill cgroupstats
3225 * @stats: cgroupstats to fill information into 3270 * @stats: cgroupstats to fill information into
3226 * @dentry: A dentry entry belonging to the cgroup for which stats have 3271 * @dentry: A dentry entry belonging to the cgroup for which stats have
3227 * been requested. 3272 * been requested.
3228 * 3273 *
3229 * Build and fill cgroupstats so that taskstats can export it to user 3274 * Build and fill cgroupstats so that taskstats can export it to user
3230 * space. 3275 * space.
3231 */ 3276 */
3232 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) 3277 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3233 { 3278 {
3234 struct kernfs_node *kn = kernfs_node_from_dentry(dentry); 3279 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
3235 struct cgroup *cgrp; 3280 struct cgroup *cgrp;
3236 struct css_task_iter it; 3281 struct css_task_iter it;
3237 struct task_struct *tsk; 3282 struct task_struct *tsk;
3238 3283
3239 /* it should be a kernfs_node belonging to cgroupfs and be a directory */ 3284 /* it should be a kernfs_node belonging to cgroupfs and be a directory */
3240 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || 3285 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
3241 kernfs_type(kn) != KERNFS_DIR) 3286 kernfs_type(kn) != KERNFS_DIR)
3242 return -EINVAL; 3287 return -EINVAL;
3243 3288
3244 mutex_lock(&cgroup_mutex); 3289 mutex_lock(&cgroup_mutex);
3245 3290
3246 /* 3291 /*
3247 * We aren't being called from kernfs and there's no guarantee on 3292 * We aren't being called from kernfs and there's no guarantee on
3248 * @kn->priv's validity. For this and css_tryget_from_dir(), 3293 * @kn->priv's validity. For this and css_tryget_from_dir(),
3249 * @kn->priv is RCU safe. Let's do the RCU dancing. 3294 * @kn->priv is RCU safe. Let's do the RCU dancing.
3250 */ 3295 */
3251 rcu_read_lock(); 3296 rcu_read_lock();
3252 cgrp = rcu_dereference(kn->priv); 3297 cgrp = rcu_dereference(kn->priv);
3253 if (!cgrp || cgroup_is_dead(cgrp)) { 3298 if (!cgrp || cgroup_is_dead(cgrp)) {
3254 rcu_read_unlock(); 3299 rcu_read_unlock();
3255 mutex_unlock(&cgroup_mutex); 3300 mutex_unlock(&cgroup_mutex);
3256 return -ENOENT; 3301 return -ENOENT;
3257 } 3302 }
3258 rcu_read_unlock(); 3303 rcu_read_unlock();
3259 3304
3260 css_task_iter_start(&cgrp->dummy_css, &it); 3305 css_task_iter_start(&cgrp->dummy_css, &it);
3261 while ((tsk = css_task_iter_next(&it))) { 3306 while ((tsk = css_task_iter_next(&it))) {
3262 switch (tsk->state) { 3307 switch (tsk->state) {
3263 case TASK_RUNNING: 3308 case TASK_RUNNING:
3264 stats->nr_running++; 3309 stats->nr_running++;
3265 break; 3310 break;
3266 case TASK_INTERRUPTIBLE: 3311 case TASK_INTERRUPTIBLE:
3267 stats->nr_sleeping++; 3312 stats->nr_sleeping++;
3268 break; 3313 break;
3269 case TASK_UNINTERRUPTIBLE: 3314 case TASK_UNINTERRUPTIBLE:
3270 stats->nr_uninterruptible++; 3315 stats->nr_uninterruptible++;
3271 break; 3316 break;
3272 case TASK_STOPPED: 3317 case TASK_STOPPED:
3273 stats->nr_stopped++; 3318 stats->nr_stopped++;
3274 break; 3319 break;
3275 default: 3320 default:
3276 if (delayacct_is_task_waiting_on_io(tsk)) 3321 if (delayacct_is_task_waiting_on_io(tsk))
3277 stats->nr_io_wait++; 3322 stats->nr_io_wait++;
3278 break; 3323 break;
3279 } 3324 }
3280 } 3325 }
3281 css_task_iter_end(&it); 3326 css_task_iter_end(&it);
3282 3327
3283 mutex_unlock(&cgroup_mutex); 3328 mutex_unlock(&cgroup_mutex);
3284 return 0; 3329 return 0;
3285 } 3330 }
3286 3331
3287 3332
3288 /* 3333 /*
3289 * seq_file methods for the tasks/procs files. The seq_file position is the 3334 * seq_file methods for the tasks/procs files. The seq_file position is the
3290 * next pid to display; the seq_file iterator is a pointer to the pid 3335 * next pid to display; the seq_file iterator is a pointer to the pid
3291 * in the cgroup->l->list array. 3336 * in the cgroup->l->list array.
3292 */ 3337 */
3293 3338
3294 static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) 3339 static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3295 { 3340 {
3296 /* 3341 /*
3297 * Initially we receive a position value that corresponds to 3342 * Initially we receive a position value that corresponds to
3298 * one more than the last pid shown (or 0 on the first call or 3343 * one more than the last pid shown (or 0 on the first call or
3299 * after a seek to the start). Use a binary-search to find the 3344 * after a seek to the start). Use a binary-search to find the
3300 * next pid to display, if any 3345 * next pid to display, if any
3301 */ 3346 */
3302 struct kernfs_open_file *of = s->private; 3347 struct kernfs_open_file *of = s->private;
3303 struct cgroup *cgrp = seq_css(s)->cgroup; 3348 struct cgroup *cgrp = seq_css(s)->cgroup;
3304 struct cgroup_pidlist *l; 3349 struct cgroup_pidlist *l;
3305 enum cgroup_filetype type = seq_cft(s)->private; 3350 enum cgroup_filetype type = seq_cft(s)->private;
3306 int index = 0, pid = *pos; 3351 int index = 0, pid = *pos;
3307 int *iter, ret; 3352 int *iter, ret;
3308 3353
3309 mutex_lock(&cgrp->pidlist_mutex); 3354 mutex_lock(&cgrp->pidlist_mutex);
3310 3355
3311 /* 3356 /*
3312 * !NULL @of->priv indicates that this isn't the first start() 3357 * !NULL @of->priv indicates that this isn't the first start()
3313 * after open. If the matching pidlist is around, we can use that. 3358 * after open. If the matching pidlist is around, we can use that.
3314 * Look for it. Note that @of->priv can't be used directly. It 3359 * Look for it. Note that @of->priv can't be used directly. It
3315 * could already have been destroyed. 3360 * could already have been destroyed.
3316 */ 3361 */
3317 if (of->priv) 3362 if (of->priv)
3318 of->priv = cgroup_pidlist_find(cgrp, type); 3363 of->priv = cgroup_pidlist_find(cgrp, type);
3319 3364
3320 /* 3365 /*
3321 * Either this is the first start() after open or the matching 3366 * Either this is the first start() after open or the matching
3322 * pidlist has been destroyed in between. Create a new one. 3367 * pidlist has been destroyed in between. Create a new one.
3323 */ 3368 */
3324 if (!of->priv) { 3369 if (!of->priv) {
3325 ret = pidlist_array_load(cgrp, type, 3370 ret = pidlist_array_load(cgrp, type,
3326 (struct cgroup_pidlist **)&of->priv); 3371 (struct cgroup_pidlist **)&of->priv);
3327 if (ret) 3372 if (ret)
3328 return ERR_PTR(ret); 3373 return ERR_PTR(ret);
3329 } 3374 }
3330 l = of->priv; 3375 l = of->priv;
3331 3376
3332 if (pid) { 3377 if (pid) {
3333 int end = l->length; 3378 int end = l->length;
3334 3379
3335 while (index < end) { 3380 while (index < end) {
3336 int mid = (index + end) / 2; 3381 int mid = (index + end) / 2;
3337 if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) { 3382 if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
3338 index = mid; 3383 index = mid;
3339 break; 3384 break;
3340 } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid) 3385 } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
3341 index = mid + 1; 3386 index = mid + 1;
3342 else 3387 else
3343 end = mid; 3388 end = mid;
3344 } 3389 }
3345 } 3390 }
3346 /* If we're off the end of the array, we're done */ 3391 /* If we're off the end of the array, we're done */
3347 if (index >= l->length) 3392 if (index >= l->length)
3348 return NULL; 3393 return NULL;
3349 /* Update the abstract position to be the actual pid that we found */ 3394 /* Update the abstract position to be the actual pid that we found */
3350 iter = l->list + index; 3395 iter = l->list + index;
3351 *pos = cgroup_pid_fry(cgrp, *iter); 3396 *pos = cgroup_pid_fry(cgrp, *iter);
3352 return iter; 3397 return iter;
3353 } 3398 }
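A standalone userspace model of the resume step, ignoring the pid_fry() scrambling for clarity and using invented pid values: the loop is a binary search for the first entry greater than or equal to the saved seq_file position, which copes with the previously shown pid having exited in the meantime.

#include <assert.h>

/* Return the index of the first pid >= @pos, or @length when past the end. */
static int resume_index(const int *list, int length, int pos)
{
	int index = 0, end = length;

	while (index < end) {
		int mid = (index + end) / 2;

		if (list[mid] == pos) {
			index = mid;
			break;
		} else if (list[mid] <= pos) {
			index = mid + 1;
		} else {
			end = mid;
		}
	}
	return index;
}

int main(void)
{
	int pids[] = { 2, 4, 8, 16 };

	assert(resume_index(pids, 4, 4) == 1);	/* pid 4 is still present */
	assert(resume_index(pids, 4, 5) == 2);	/* pid 4 vanished, resume at 8 */
	assert(resume_index(pids, 4, 99) == 4);	/* off the end, iteration done */
	return 0;
}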
3354 3399
3355 static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3400 static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3356 { 3401 {
3357 struct kernfs_open_file *of = s->private; 3402 struct kernfs_open_file *of = s->private;
3358 struct cgroup_pidlist *l = of->priv; 3403 struct cgroup_pidlist *l = of->priv;
3359 3404
3360 if (l) 3405 if (l)
3361 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 3406 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
3362 CGROUP_PIDLIST_DESTROY_DELAY); 3407 CGROUP_PIDLIST_DESTROY_DELAY);
3363 mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); 3408 mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
3364 } 3409 }
3365 3410
3366 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3411 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3367 { 3412 {
3368 struct kernfs_open_file *of = s->private; 3413 struct kernfs_open_file *of = s->private;
3369 struct cgroup_pidlist *l = of->priv; 3414 struct cgroup_pidlist *l = of->priv;
3370 pid_t *p = v; 3415 pid_t *p = v;
3371 pid_t *end = l->list + l->length; 3416 pid_t *end = l->list + l->length;
3372 /* 3417 /*
3373 * Advance to the next pid in the array. If this goes off the 3418 * Advance to the next pid in the array. If this goes off the
3374 * end, we're done 3419 * end, we're done
3375 */ 3420 */
3376 p++; 3421 p++;
3377 if (p >= end) { 3422 if (p >= end) {
3378 return NULL; 3423 return NULL;
3379 } else { 3424 } else {
3380 *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p); 3425 *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
3381 return p; 3426 return p;
3382 } 3427 }
3383 } 3428 }
3384 3429
3385 static int cgroup_pidlist_show(struct seq_file *s, void *v) 3430 static int cgroup_pidlist_show(struct seq_file *s, void *v)
3386 { 3431 {
3387 return seq_printf(s, "%d\n", *(int *)v); 3432 return seq_printf(s, "%d\n", *(int *)v);
3388 } 3433 }
3389 3434
3390 /* 3435 /*
3391 * seq_operations functions for iterating on pidlists through seq_file - 3436 * seq_operations functions for iterating on pidlists through seq_file -
3392 * independent of whether it's tasks or procs 3437 * independent of whether it's tasks or procs
3393 */ 3438 */
3394 static const struct seq_operations cgroup_pidlist_seq_operations = { 3439 static const struct seq_operations cgroup_pidlist_seq_operations = {
3395 .start = cgroup_pidlist_start, 3440 .start = cgroup_pidlist_start,
3396 .stop = cgroup_pidlist_stop, 3441 .stop = cgroup_pidlist_stop,
3397 .next = cgroup_pidlist_next, 3442 .next = cgroup_pidlist_next,
3398 .show = cgroup_pidlist_show, 3443 .show = cgroup_pidlist_show,
3399 }; 3444 };
3400 3445
3401 static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, 3446 static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3402 struct cftype *cft) 3447 struct cftype *cft)
3403 { 3448 {
3404 return notify_on_release(css->cgroup); 3449 return notify_on_release(css->cgroup);
3405 } 3450 }
3406 3451
3407 static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, 3452 static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3408 struct cftype *cft, u64 val) 3453 struct cftype *cft, u64 val)
3409 { 3454 {
3410 clear_bit(CGRP_RELEASABLE, &css->cgroup->flags); 3455 clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
3411 if (val) 3456 if (val)
3412 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); 3457 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3413 else 3458 else
3414 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); 3459 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3415 return 0; 3460 return 0;
3416 } 3461 }
3417 3462
3418 static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, 3463 static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
3419 struct cftype *cft) 3464 struct cftype *cft)
3420 { 3465 {
3421 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); 3466 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3422 } 3467 }
3423 3468
3424 static int cgroup_clone_children_write(struct cgroup_subsys_state *css, 3469 static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
3425 struct cftype *cft, u64 val) 3470 struct cftype *cft, u64 val)
3426 { 3471 {
3427 if (val) 3472 if (val)
3428 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); 3473 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3429 else 3474 else
3430 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); 3475 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3431 return 0; 3476 return 0;
3432 } 3477 }
3433 3478
3434 static struct cftype cgroup_base_files[] = { 3479 static struct cftype cgroup_base_files[] = {
3435 { 3480 {
3436 .name = "cgroup.procs", 3481 .name = "cgroup.procs",
3437 .seq_start = cgroup_pidlist_start, 3482 .seq_start = cgroup_pidlist_start,
3438 .seq_next = cgroup_pidlist_next, 3483 .seq_next = cgroup_pidlist_next,
3439 .seq_stop = cgroup_pidlist_stop, 3484 .seq_stop = cgroup_pidlist_stop,
3440 .seq_show = cgroup_pidlist_show, 3485 .seq_show = cgroup_pidlist_show,
3441 .private = CGROUP_FILE_PROCS, 3486 .private = CGROUP_FILE_PROCS,
3442 .write_u64 = cgroup_procs_write, 3487 .write_u64 = cgroup_procs_write,
3443 .mode = S_IRUGO | S_IWUSR, 3488 .mode = S_IRUGO | S_IWUSR,
3444 }, 3489 },
3445 { 3490 {
3446 .name = "cgroup.clone_children", 3491 .name = "cgroup.clone_children",
3447 .flags = CFTYPE_INSANE, 3492 .flags = CFTYPE_INSANE,
3448 .read_u64 = cgroup_clone_children_read, 3493 .read_u64 = cgroup_clone_children_read,
3449 .write_u64 = cgroup_clone_children_write, 3494 .write_u64 = cgroup_clone_children_write,
3450 }, 3495 },
3451 { 3496 {
3452 .name = "cgroup.sane_behavior", 3497 .name = "cgroup.sane_behavior",
3453 .flags = CFTYPE_ONLY_ON_ROOT, 3498 .flags = CFTYPE_ONLY_ON_ROOT,
3454 .seq_show = cgroup_sane_behavior_show, 3499 .seq_show = cgroup_sane_behavior_show,
3455 }, 3500 },
3456 3501
3457 /* 3502 /*
3458 * Historical crazy stuff. These don't have "cgroup." prefix and 3503 * Historical crazy stuff. These don't have "cgroup." prefix and
3459 * don't exist if sane_behavior. If you're depending on these, be 3504 * don't exist if sane_behavior. If you're depending on these, be
3460 * prepared to be burned. 3505 * prepared to be burned.
3461 */ 3506 */
3462 { 3507 {
3463 .name = "tasks", 3508 .name = "tasks",
3464 .flags = CFTYPE_INSANE, /* use "procs" instead */ 3509 .flags = CFTYPE_INSANE, /* use "procs" instead */
3465 .seq_start = cgroup_pidlist_start, 3510 .seq_start = cgroup_pidlist_start,
3466 .seq_next = cgroup_pidlist_next, 3511 .seq_next = cgroup_pidlist_next,
3467 .seq_stop = cgroup_pidlist_stop, 3512 .seq_stop = cgroup_pidlist_stop,
3468 .seq_show = cgroup_pidlist_show, 3513 .seq_show = cgroup_pidlist_show,
3469 .private = CGROUP_FILE_TASKS, 3514 .private = CGROUP_FILE_TASKS,
3470 .write_u64 = cgroup_tasks_write, 3515 .write_u64 = cgroup_tasks_write,
3471 .mode = S_IRUGO | S_IWUSR, 3516 .mode = S_IRUGO | S_IWUSR,
3472 }, 3517 },
3473 { 3518 {
3474 .name = "notify_on_release", 3519 .name = "notify_on_release",
3475 .flags = CFTYPE_INSANE, 3520 .flags = CFTYPE_INSANE,
3476 .read_u64 = cgroup_read_notify_on_release, 3521 .read_u64 = cgroup_read_notify_on_release,
3477 .write_u64 = cgroup_write_notify_on_release, 3522 .write_u64 = cgroup_write_notify_on_release,
3478 }, 3523 },
3479 { 3524 {
3480 .name = "release_agent", 3525 .name = "release_agent",
3481 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 3526 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3482 .seq_show = cgroup_release_agent_show, 3527 .seq_show = cgroup_release_agent_show,
3483 .write_string = cgroup_release_agent_write, 3528 .write_string = cgroup_release_agent_write,
3484 .max_write_len = PATH_MAX - 1, 3529 .max_write_len = PATH_MAX - 1,
3485 }, 3530 },
3486 { } /* terminate */ 3531 { } /* terminate */
3487 }; 3532 };
3488 3533
3489 /** 3534 /**
3490 * cgroup_populate_dir - create subsys files in a cgroup directory 3535 * cgroup_populate_dir - create subsys files in a cgroup directory
3491 * @cgrp: target cgroup 3536 * @cgrp: target cgroup
3492 * @subsys_mask: mask of the subsystem ids whose files should be added 3537 * @subsys_mask: mask of the subsystem ids whose files should be added
3493 * 3538 *
3494 * On failure, no file is added. 3539 * On failure, no file is added.
3495 */ 3540 */
3496 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) 3541 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
3497 { 3542 {
3498 struct cgroup_subsys *ss; 3543 struct cgroup_subsys *ss;
3499 int i, ret = 0; 3544 int i, ret = 0;
3500 3545
3501 /* process cftsets of each subsystem */ 3546 /* process cftsets of each subsystem */
3502 for_each_subsys(ss, i) { 3547 for_each_subsys(ss, i) {
3503 struct cftype *cfts; 3548 struct cftype *cfts;
3504 3549
3505 if (!test_bit(i, &subsys_mask)) 3550 if (!test_bit(i, &subsys_mask))
3506 continue; 3551 continue;
3507 3552
3508 list_for_each_entry(cfts, &ss->cfts, node) { 3553 list_for_each_entry(cfts, &ss->cfts, node) {
3509 ret = cgroup_addrm_files(cgrp, cfts, true); 3554 ret = cgroup_addrm_files(cgrp, cfts, true);
3510 if (ret < 0) 3555 if (ret < 0)
3511 goto err; 3556 goto err;
3512 } 3557 }
3513 } 3558 }
3514 return 0; 3559 return 0;
3515 err: 3560 err:
3516 cgroup_clear_dir(cgrp, subsys_mask); 3561 cgroup_clear_dir(cgrp, subsys_mask);
3517 return ret; 3562 return ret;
3518 } 3563 }
3519 3564
3520 /* 3565 /*
3521 * css destruction is a four-stage process. 3566 * css destruction is a four-stage process.
3522 * 3567 *
3523 * 1. Destruction starts. Killing of the percpu_ref is initiated. 3568 * 1. Destruction starts. Killing of the percpu_ref is initiated.
3524 * Implemented in kill_css(). 3569 * Implemented in kill_css().
3525 * 3570 *
3526 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs 3571 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
3527 * and thus css_tryget() is guaranteed to fail, the css can be offlined 3572 * and thus css_tryget() is guaranteed to fail, the css can be offlined
3528 * by invoking offline_css(). After offlining, the base ref is put. 3573 * by invoking offline_css(). After offlining, the base ref is put.
3529 * Implemented in css_killed_work_fn(). 3574 * Implemented in css_killed_work_fn().
3530 * 3575 *
3531 * 3. When the percpu_ref reaches zero, the only possible remaining 3576 * 3. When the percpu_ref reaches zero, the only possible remaining
3532 * accessors are inside RCU read sections. css_release() schedules the 3577 * accessors are inside RCU read sections. css_release() schedules the
3533 * RCU callback. 3578 * RCU callback.
3534 * 3579 *
3535 * 4. After the grace period, the css can be freed. Implemented in 3580 * 4. After the grace period, the css can be freed. Implemented in
3536 * css_free_work_fn(). 3581 * css_free_work_fn().
3537 * 3582 *
3538 * It is actually hairier because both steps 2 and 4 require process context 3583 * It is actually hairier because both steps 2 and 4 require process context
3539 * and thus involve punting to css->destroy_work, adding two additional 3584 * and thus involve punting to css->destroy_work, adding two additional
3540 * steps to the already complex sequence. 3585 * steps to the already complex sequence.
3541 */ 3586 */
3542 static void css_free_work_fn(struct work_struct *work) 3587 static void css_free_work_fn(struct work_struct *work)
3543 { 3588 {
3544 struct cgroup_subsys_state *css = 3589 struct cgroup_subsys_state *css =
3545 container_of(work, struct cgroup_subsys_state, destroy_work); 3590 container_of(work, struct cgroup_subsys_state, destroy_work);
3546 struct cgroup *cgrp = css->cgroup; 3591 struct cgroup *cgrp = css->cgroup;
3547 3592
3548 if (css->parent) 3593 if (css->parent)
3549 css_put(css->parent); 3594 css_put(css->parent);
3550 3595
3551 css->ss->css_free(css); 3596 css->ss->css_free(css);
3552 cgroup_put(cgrp); 3597 cgroup_put(cgrp);
3553 } 3598 }
3554 3599
3555 static void css_free_rcu_fn(struct rcu_head *rcu_head) 3600 static void css_free_rcu_fn(struct rcu_head *rcu_head)
3556 { 3601 {
3557 struct cgroup_subsys_state *css = 3602 struct cgroup_subsys_state *css =
3558 container_of(rcu_head, struct cgroup_subsys_state, rcu_head); 3603 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
3559 3604
3560 INIT_WORK(&css->destroy_work, css_free_work_fn); 3605 INIT_WORK(&css->destroy_work, css_free_work_fn);
3561 queue_work(cgroup_destroy_wq, &css->destroy_work); 3606 queue_work(cgroup_destroy_wq, &css->destroy_work);
3562 } 3607 }
3563 3608
3564 static void css_release(struct percpu_ref *ref) 3609 static void css_release(struct percpu_ref *ref)
3565 { 3610 {
3566 struct cgroup_subsys_state *css = 3611 struct cgroup_subsys_state *css =
3567 container_of(ref, struct cgroup_subsys_state, refcnt); 3612 container_of(ref, struct cgroup_subsys_state, refcnt);
3568 3613
3569 RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); 3614 RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL);
3570 call_rcu(&css->rcu_head, css_free_rcu_fn); 3615 call_rcu(&css->rcu_head, css_free_rcu_fn);
3571 } 3616 }
3572 3617
3573 static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, 3618 static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
3574 struct cgroup *cgrp) 3619 struct cgroup *cgrp)
3575 { 3620 {
3576 css->cgroup = cgrp; 3621 css->cgroup = cgrp;
3577 css->ss = ss; 3622 css->ss = ss;
3578 css->flags = 0; 3623 css->flags = 0;
3579 3624
3580 if (cgrp->parent) 3625 if (cgrp->parent)
3581 css->parent = cgroup_css(cgrp->parent, ss); 3626 css->parent = cgroup_css(cgrp->parent, ss);
3582 else 3627 else
3583 css->flags |= CSS_ROOT; 3628 css->flags |= CSS_ROOT;
3584 3629
3585 BUG_ON(cgroup_css(cgrp, ss)); 3630 BUG_ON(cgroup_css(cgrp, ss));
3586 } 3631 }
3587 3632
3588 /* invoke ->css_online() on a new CSS and mark it online if successful */ 3633 /* invoke ->css_online() on a new CSS and mark it online if successful */
3589 static int online_css(struct cgroup_subsys_state *css) 3634 static int online_css(struct cgroup_subsys_state *css)
3590 { 3635 {
3591 struct cgroup_subsys *ss = css->ss; 3636 struct cgroup_subsys *ss = css->ss;
3592 int ret = 0; 3637 int ret = 0;
3593 3638
3594 lockdep_assert_held(&cgroup_tree_mutex); 3639 lockdep_assert_held(&cgroup_tree_mutex);
3595 lockdep_assert_held(&cgroup_mutex); 3640 lockdep_assert_held(&cgroup_mutex);
3596 3641
3597 if (ss->css_online) 3642 if (ss->css_online)
3598 ret = ss->css_online(css); 3643 ret = ss->css_online(css);
3599 if (!ret) { 3644 if (!ret) {
3600 css->flags |= CSS_ONLINE; 3645 css->flags |= CSS_ONLINE;
3601 css->cgroup->nr_css++; 3646 css->cgroup->nr_css++;
3602 rcu_assign_pointer(css->cgroup->subsys[ss->id], css); 3647 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
3603 } 3648 }
3604 return ret; 3649 return ret;
3605 } 3650 }
3606 3651
3607 /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ 3652 /* if the CSS is online, invoke ->css_offline() on it and mark it offline */
3608 static void offline_css(struct cgroup_subsys_state *css) 3653 static void offline_css(struct cgroup_subsys_state *css)
3609 { 3654 {
3610 struct cgroup_subsys *ss = css->ss; 3655 struct cgroup_subsys *ss = css->ss;
3611 3656
3612 lockdep_assert_held(&cgroup_tree_mutex); 3657 lockdep_assert_held(&cgroup_tree_mutex);
3613 lockdep_assert_held(&cgroup_mutex); 3658 lockdep_assert_held(&cgroup_mutex);
3614 3659
3615 if (!(css->flags & CSS_ONLINE)) 3660 if (!(css->flags & CSS_ONLINE))
3616 return; 3661 return;
3617 3662
3618 if (ss->css_offline) 3663 if (ss->css_offline)
3619 ss->css_offline(css); 3664 ss->css_offline(css);
3620 3665
3621 css->flags &= ~CSS_ONLINE; 3666 css->flags &= ~CSS_ONLINE;
3622 css->cgroup->nr_css--; 3667 css->cgroup->nr_css--;
3623 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); 3668 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css);
3624 } 3669 }
3625 3670
3626 /** 3671 /**
3627 * create_css - create a cgroup_subsys_state 3672 * create_css - create a cgroup_subsys_state
3628 * @cgrp: the cgroup new css will be associated with 3673 * @cgrp: the cgroup new css will be associated with
3629 * @ss: the subsys of new css 3674 * @ss: the subsys of new css
3630 * 3675 *
3631 * Create a new css associated with @cgrp - @ss pair. On success, the new 3676 * Create a new css associated with @cgrp - @ss pair. On success, the new
3632 * css is online and installed in @cgrp with all interface files created. 3677 * css is online and installed in @cgrp with all interface files created.
3633 * Returns 0 on success, -errno on failure. 3678 * Returns 0 on success, -errno on failure.
3634 */ 3679 */
3635 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) 3680 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
3636 { 3681 {
3637 struct cgroup *parent = cgrp->parent; 3682 struct cgroup *parent = cgrp->parent;
3638 struct cgroup_subsys_state *css; 3683 struct cgroup_subsys_state *css;
3639 int err; 3684 int err;
3640 3685
3641 lockdep_assert_held(&cgroup_mutex); 3686 lockdep_assert_held(&cgroup_mutex);
3642 3687
3643 css = ss->css_alloc(cgroup_css(parent, ss)); 3688 css = ss->css_alloc(cgroup_css(parent, ss));
3644 if (IS_ERR(css)) 3689 if (IS_ERR(css))
3645 return PTR_ERR(css); 3690 return PTR_ERR(css);
3646 3691
3647 err = percpu_ref_init(&css->refcnt, css_release); 3692 err = percpu_ref_init(&css->refcnt, css_release);
3648 if (err) 3693 if (err)
3649 goto err_free_css; 3694 goto err_free_css;
3650 3695
3651 init_css(css, ss, cgrp); 3696 init_css(css, ss, cgrp);
3652 3697
3653 err = cgroup_populate_dir(cgrp, 1 << ss->id); 3698 err = cgroup_populate_dir(cgrp, 1 << ss->id);
3654 if (err) 3699 if (err)
3655 goto err_free_percpu_ref; 3700 goto err_free_percpu_ref;
3656 3701
3657 err = online_css(css); 3702 err = online_css(css);
3658 if (err) 3703 if (err)
3659 goto err_clear_dir; 3704 goto err_clear_dir;
3660 3705
3661 cgroup_get(cgrp); 3706 cgroup_get(cgrp);
3662 css_get(css->parent); 3707 css_get(css->parent);
3663 3708
3664 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 3709 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
3665 parent->parent) { 3710 parent->parent) {
3666 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", 3711 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
3667 current->comm, current->pid, ss->name); 3712 current->comm, current->pid, ss->name);
3668 if (!strcmp(ss->name, "memory")) 3713 if (!strcmp(ss->name, "memory"))
3669 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); 3714 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
3670 ss->warned_broken_hierarchy = true; 3715 ss->warned_broken_hierarchy = true;
3671 } 3716 }
3672 3717
3673 return 0; 3718 return 0;
3674 3719
3675 err_clear_dir: 3720 err_clear_dir:
3676 cgroup_clear_dir(css->cgroup, 1 << css->ss->id); 3721 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
3677 err_free_percpu_ref: 3722 err_free_percpu_ref:
3678 percpu_ref_cancel_init(&css->refcnt); 3723 percpu_ref_cancel_init(&css->refcnt);
3679 err_free_css: 3724 err_free_css:
3680 ss->css_free(css); 3725 ss->css_free(css);
3681 return err; 3726 return err;
3682 } 3727 }
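For context, a hedged sketch of the ->css_alloc()/->css_free() pair that create_css() drives; "foo" is a hypothetical controller, not part of this patch, and init_css() above fills in ->cgroup, ->ss and ->parent after allocation:

#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/slab.h>

/* hypothetical controller state embedding its css */
struct foo_cgroup {
	struct cgroup_subsys_state css;
	/* controller-private fields would follow */
};

static struct cgroup_subsys_state *
foo_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct foo_cgroup *foo = kzalloc(sizeof(*foo), GFP_KERNEL);

	if (!foo)
		return ERR_PTR(-ENOMEM);
	return &foo->css;
}

static void foo_css_free(struct cgroup_subsys_state *css)
{
	kfree(container_of(css, struct foo_cgroup, css));
}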
3683 3728
3684 /** 3729 /**
3685 * cgroup_create - create a cgroup 3730 * cgroup_create - create a cgroup
3686 * @parent: cgroup that will be parent of the new cgroup 3731 * @parent: cgroup that will be parent of the new cgroup
3687 * @name: name of the new cgroup 3732 * @name: name of the new cgroup
3688 * @mode: mode to set on new cgroup 3733 * @mode: mode to set on new cgroup
3689 */ 3734 */
3690 static long cgroup_create(struct cgroup *parent, const char *name, 3735 static long cgroup_create(struct cgroup *parent, const char *name,
3691 umode_t mode) 3736 umode_t mode)
3692 { 3737 {
3693 struct cgroup *cgrp; 3738 struct cgroup *cgrp;
3694 struct cgroup_root *root = parent->root; 3739 struct cgroup_root *root = parent->root;
3695 int ssid, err; 3740 int ssid, err;
3696 struct cgroup_subsys *ss; 3741 struct cgroup_subsys *ss;
3697 struct kernfs_node *kn; 3742 struct kernfs_node *kn;
3698 3743
3699 /* 3744 /*
3700 * XXX: The default hierarchy isn't fully implemented yet. Block 3745 * XXX: The default hierarchy isn't fully implemented yet. Block
3701 * !root cgroup creation on it for now. 3746 * !root cgroup creation on it for now.
3702 */ 3747 */
3703 if (root == &cgrp_dfl_root) 3748 if (root == &cgrp_dfl_root)
3704 return -EINVAL; 3749 return -EINVAL;
3705 3750
3706 /* allocate the cgroup and its ID, 0 is reserved for the root */ 3751 /* allocate the cgroup and its ID, 0 is reserved for the root */
3707 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 3752 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
3708 if (!cgrp) 3753 if (!cgrp)
3709 return -ENOMEM; 3754 return -ENOMEM;
3710 3755
3711 mutex_lock(&cgroup_tree_mutex); 3756 mutex_lock(&cgroup_tree_mutex);
3712 3757
3713 /* 3758 /*
3714 * Only live parents can have children. Note that the liveness 3759 * Only live parents can have children. Note that the liveness
3715 * check isn't strictly necessary because cgroup_mkdir() and 3760 * check isn't strictly necessary because cgroup_mkdir() and
3716 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it 3761 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
3717 * anyway so that locking is contained inside cgroup proper and we 3762 * anyway so that locking is contained inside cgroup proper and we
3718 * don't get nasty surprises if we ever grow another caller. 3763 * don't get nasty surprises if we ever grow another caller.
3719 */ 3764 */
3720 if (!cgroup_lock_live_group(parent)) { 3765 if (!cgroup_lock_live_group(parent)) {
3721 err = -ENODEV; 3766 err = -ENODEV;
3722 goto err_unlock_tree; 3767 goto err_unlock_tree;
3723 } 3768 }
3724 3769
3725 /* 3770 /*
3726 * Temporarily set the pointer to NULL, so idr_find() won't return 3771 * Temporarily set the pointer to NULL, so idr_find() won't return
3727 * a half-baked cgroup. 3772 * a half-baked cgroup.
3728 */ 3773 */
3729 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); 3774 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
3730 if (cgrp->id < 0) { 3775 if (cgrp->id < 0) {
3731 err = -ENOMEM; 3776 err = -ENOMEM;
3732 goto err_unlock; 3777 goto err_unlock;
3733 } 3778 }
3734 3779
3735 init_cgroup_housekeeping(cgrp); 3780 init_cgroup_housekeeping(cgrp);
3736 3781
3737 cgrp->parent = parent; 3782 cgrp->parent = parent;
3738 cgrp->dummy_css.parent = &parent->dummy_css; 3783 cgrp->dummy_css.parent = &parent->dummy_css;
3739 cgrp->root = parent->root; 3784 cgrp->root = parent->root;
3740 3785
3741 if (notify_on_release(parent)) 3786 if (notify_on_release(parent))
3742 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3787 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3743 3788
3744 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 3789 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
3745 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 3790 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3746 3791
3747 /* create the directory */ 3792 /* create the directory */
3748 kn = kernfs_create_dir(parent->kn, name, mode, cgrp); 3793 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
3749 if (IS_ERR(kn)) { 3794 if (IS_ERR(kn)) {
3750 err = PTR_ERR(kn); 3795 err = PTR_ERR(kn);
3751 goto err_free_id; 3796 goto err_free_id;
3752 } 3797 }
3753 cgrp->kn = kn; 3798 cgrp->kn = kn;
3754 3799
3755 /* 3800 /*
3756 * This extra ref will be put in cgroup_free_fn() and guarantees 3801 * This extra ref will be put in cgroup_free_fn() and guarantees
3757 * that @cgrp->kn is always accessible. 3802 * that @cgrp->kn is always accessible.
3758 */ 3803 */
3759 kernfs_get(kn); 3804 kernfs_get(kn);
3760 3805
3761 cgrp->serial_nr = cgroup_serial_nr_next++; 3806 cgrp->serial_nr = cgroup_serial_nr_next++;
3762 3807
3763 /* allocation complete, commit to creation */ 3808 /* allocation complete, commit to creation */
3764 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 3809 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
3765 atomic_inc(&root->nr_cgrps); 3810 atomic_inc(&root->nr_cgrps);
3766 cgroup_get(parent); 3811 cgroup_get(parent);
3767 3812
3768 /* 3813 /*
3769 * @cgrp is now fully operational. If something fails after this 3814 * @cgrp is now fully operational. If something fails after this
3770 * point, it'll be released via the normal destruction path. 3815 * point, it'll be released via the normal destruction path.
3771 */ 3816 */
3772 idr_replace(&root->cgroup_idr, cgrp, cgrp->id); 3817 idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
3773 3818
3774 err = cgroup_kn_set_ugid(kn); 3819 err = cgroup_kn_set_ugid(kn);
3775 if (err) 3820 if (err)
3776 goto err_destroy; 3821 goto err_destroy;
3777 3822
3778 err = cgroup_addrm_files(cgrp, cgroup_base_files, true); 3823 err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
3779 if (err) 3824 if (err)
3780 goto err_destroy; 3825 goto err_destroy;
3781 3826
3782 /* let's create and online css's */ 3827 /* let's create and online css's */
3783 for_each_subsys(ss, ssid) { 3828 for_each_subsys(ss, ssid) {
3784 if (parent->child_subsys_mask & (1 << ssid)) { 3829 if (parent->child_subsys_mask & (1 << ssid)) {
3785 err = create_css(cgrp, ss); 3830 err = create_css(cgrp, ss);
3786 if (err) 3831 if (err)
3787 goto err_destroy; 3832 goto err_destroy;
3788 } 3833 }
3789 } 3834 }
3790 3835
3791 cgrp->child_subsys_mask = parent->child_subsys_mask; 3836 cgrp->child_subsys_mask = parent->child_subsys_mask;
3792 3837
3793 kernfs_activate(kn); 3838 kernfs_activate(kn);
3794 3839
3795 mutex_unlock(&cgroup_mutex); 3840 mutex_unlock(&cgroup_mutex);
3796 mutex_unlock(&cgroup_tree_mutex); 3841 mutex_unlock(&cgroup_tree_mutex);
3797 3842
3798 return 0; 3843 return 0;
3799 3844
3800 err_free_id: 3845 err_free_id:
3801 idr_remove(&root->cgroup_idr, cgrp->id); 3846 idr_remove(&root->cgroup_idr, cgrp->id);
3802 err_unlock: 3847 err_unlock:
3803 mutex_unlock(&cgroup_mutex); 3848 mutex_unlock(&cgroup_mutex);
3804 err_unlock_tree: 3849 err_unlock_tree:
3805 mutex_unlock(&cgroup_tree_mutex); 3850 mutex_unlock(&cgroup_tree_mutex);
3806 kfree(cgrp); 3851 kfree(cgrp);
3807 return err; 3852 return err;
3808 3853
3809 err_destroy: 3854 err_destroy:
3810 cgroup_destroy_locked(cgrp); 3855 cgroup_destroy_locked(cgrp);
3811 mutex_unlock(&cgroup_mutex); 3856 mutex_unlock(&cgroup_mutex);
3812 mutex_unlock(&cgroup_tree_mutex); 3857 mutex_unlock(&cgroup_tree_mutex);
3813 return err; 3858 return err;
3814 } 3859 }
3815 3860
3816 static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, 3861 static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3817 umode_t mode) 3862 umode_t mode)
3818 { 3863 {
3819 struct cgroup *parent = parent_kn->priv; 3864 struct cgroup *parent = parent_kn->priv;
3820 int ret; 3865 int ret;
3821 3866
3822 /* 3867 /*
3823 * cgroup_create() grabs cgroup_tree_mutex which nests outside 3868 * cgroup_create() grabs cgroup_tree_mutex which nests outside
3824 * kernfs active_ref and cgroup_create() already synchronizes 3869 * kernfs active_ref and cgroup_create() already synchronizes
3825 * properly against removal through cgroup_lock_live_group(). 3870 * properly against removal through cgroup_lock_live_group().
3826 * Break it before calling cgroup_create(). 3871 * Break it before calling cgroup_create().
3827 */ 3872 */
3828 cgroup_get(parent); 3873 cgroup_get(parent);
3829 kernfs_break_active_protection(parent_kn); 3874 kernfs_break_active_protection(parent_kn);
3830 3875
3831 ret = cgroup_create(parent, name, mode); 3876 ret = cgroup_create(parent, name, mode);
3832 3877
3833 kernfs_unbreak_active_protection(parent_kn); 3878 kernfs_unbreak_active_protection(parent_kn);
3834 cgroup_put(parent); 3879 cgroup_put(parent);
3835 return ret; 3880 return ret;
3836 } 3881 }
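From userspace, cgroup_mkdir() is reached through an ordinary mkdir(2) on a mounted hierarchy. A hedged sketch; the mount point and controller name are assumptions, not part of this patch:

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>

int main(void)
{
	/* assumes the memory hierarchy is mounted at the conventional path */
	if (mkdir("/sys/fs/cgroup/memory/example", 0755) == -1)
		perror("mkdir");
	return 0;
}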
3837 3882
3838 /* 3883 /*
3839 * This is called when the refcnt of a css is confirmed to be killed. 3884 * This is called when the refcnt of a css is confirmed to be killed.
3840 * css_tryget() is now guaranteed to fail. 3885 * css_tryget() is now guaranteed to fail.
3841 */ 3886 */
3842 static void css_killed_work_fn(struct work_struct *work) 3887 static void css_killed_work_fn(struct work_struct *work)
3843 { 3888 {
3844 struct cgroup_subsys_state *css = 3889 struct cgroup_subsys_state *css =
3845 container_of(work, struct cgroup_subsys_state, destroy_work); 3890 container_of(work, struct cgroup_subsys_state, destroy_work);
3846 struct cgroup *cgrp = css->cgroup; 3891 struct cgroup *cgrp = css->cgroup;
3847 3892
3848 mutex_lock(&cgroup_tree_mutex); 3893 mutex_lock(&cgroup_tree_mutex);
3849 mutex_lock(&cgroup_mutex); 3894 mutex_lock(&cgroup_mutex);
3850 3895
3851 /* 3896 /*
3852 * css_tryget() is guaranteed to fail now. Tell subsystems to 3897 * css_tryget() is guaranteed to fail now. Tell subsystems to
3853 * initiate destruction. 3898 * initiate destruction.
3854 */ 3899 */
3855 offline_css(css); 3900 offline_css(css);
3856 3901
3857 /* 3902 /*
3858 * If @cgrp is marked dead, it's waiting for refs of all css's to 3903 * If @cgrp is marked dead, it's waiting for refs of all css's to
3859 * be disabled before proceeding to the second phase of cgroup 3904 * be disabled before proceeding to the second phase of cgroup
3860 * destruction. If we are the last one, kick it off. 3905 * destruction. If we are the last one, kick it off.
3861 */ 3906 */
3862 if (!cgrp->nr_css && cgroup_is_dead(cgrp)) 3907 if (!cgrp->nr_css && cgroup_is_dead(cgrp))
3863 cgroup_destroy_css_killed(cgrp); 3908 cgroup_destroy_css_killed(cgrp);
3864 3909
3865 mutex_unlock(&cgroup_mutex); 3910 mutex_unlock(&cgroup_mutex);
3866 mutex_unlock(&cgroup_tree_mutex); 3911 mutex_unlock(&cgroup_tree_mutex);
3867 3912
3868 /* 3913 /*
3869 * Put the css refs from kill_css(). Each css holds an extra 3914 * Put the css refs from kill_css(). Each css holds an extra
3870 * reference to the cgroup's dentry and cgroup removal proceeds 3915 * reference to the cgroup's dentry and cgroup removal proceeds
3871 * regardless of css refs. On the last put of each css, whenever 3916 * regardless of css refs. On the last put of each css, whenever
3872 * that may be, the extra dentry ref is put so that dentry 3917 * that may be, the extra dentry ref is put so that dentry
3873 * destruction happens only after all css's are released. 3918 * destruction happens only after all css's are released.
3874 */ 3919 */
3875 css_put(css); 3920 css_put(css);
3876 } 3921 }
3877 3922
3878 /* css kill confirmation processing requires process context, bounce */ 3923 /* css kill confirmation processing requires process context, bounce */
3879 static void css_killed_ref_fn(struct percpu_ref *ref) 3924 static void css_killed_ref_fn(struct percpu_ref *ref)
3880 { 3925 {
3881 struct cgroup_subsys_state *css = 3926 struct cgroup_subsys_state *css =
3882 container_of(ref, struct cgroup_subsys_state, refcnt); 3927 container_of(ref, struct cgroup_subsys_state, refcnt);
3883 3928
3884 INIT_WORK(&css->destroy_work, css_killed_work_fn); 3929 INIT_WORK(&css->destroy_work, css_killed_work_fn);
3885 queue_work(cgroup_destroy_wq, &css->destroy_work); 3930 queue_work(cgroup_destroy_wq, &css->destroy_work);
3886 } 3931 }
3887 3932
3888 /** 3933 /**
3889 * kill_css - destroy a css 3934 * kill_css - destroy a css
3890 * @css: css to destroy 3935 * @css: css to destroy
3891 * 3936 *
3892 * This function initiates destruction of @css by removing cgroup interface 3937 * This function initiates destruction of @css by removing cgroup interface
3893 * files and putting its base reference. ->css_offline() will be invoked 3938 * files and putting its base reference. ->css_offline() will be invoked
3894 * asynchronously once css_tryget() is guaranteed to fail and when the 3939 * asynchronously once css_tryget() is guaranteed to fail and when the
3895 * reference count reaches zero, @css will be released. 3940 * reference count reaches zero, @css will be released.
3896 */ 3941 */
3897 static void kill_css(struct cgroup_subsys_state *css) 3942 static void kill_css(struct cgroup_subsys_state *css)
3898 { 3943 {
3899 lockdep_assert_held(&cgroup_tree_mutex); 3944 lockdep_assert_held(&cgroup_tree_mutex);
3900 3945
3901 /* 3946 /*
3902 * This must happen before css is disassociated with its cgroup. 3947 * This must happen before css is disassociated with its cgroup.
3903 * See seq_css() for details. 3948 * See seq_css() for details.
3904 */ 3949 */
3905 cgroup_clear_dir(css->cgroup, 1 << css->ss->id); 3950 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
3906 3951
3907 /* 3952 /*
3908 * Killing would put the base ref, but we need to keep it alive 3953 * Killing would put the base ref, but we need to keep it alive
3909 * until after ->css_offline(). 3954 * until after ->css_offline().
3910 */ 3955 */
3911 css_get(css); 3956 css_get(css);
3912 3957
3913 /* 3958 /*
3914 * cgroup core guarantees that, by the time ->css_offline() is 3959 * cgroup core guarantees that, by the time ->css_offline() is
3915 * invoked, no new css reference will be given out via 3960 * invoked, no new css reference will be given out via
3916 * css_tryget(). We can't simply call percpu_ref_kill() and 3961 * css_tryget(). We can't simply call percpu_ref_kill() and
3917 * proceed to offlining css's because percpu_ref_kill() doesn't 3962 * proceed to offlining css's because percpu_ref_kill() doesn't
3918 * guarantee that the ref is seen as killed on all CPUs on return. 3963 * guarantee that the ref is seen as killed on all CPUs on return.
3919 * 3964 *
3920 * Use percpu_ref_kill_and_confirm() to get notifications as each 3965 * Use percpu_ref_kill_and_confirm() to get notifications as each
3921 * css is confirmed to be seen as killed on all CPUs. 3966 * css is confirmed to be seen as killed on all CPUs.
3922 */ 3967 */
3923 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); 3968 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
3924 } 3969 }
3925 3970
3926 /** 3971 /**
3927 * cgroup_destroy_locked - the first stage of cgroup destruction 3972 * cgroup_destroy_locked - the first stage of cgroup destruction
3928 * @cgrp: cgroup to be destroyed 3973 * @cgrp: cgroup to be destroyed
3929 * 3974 *
3930 * css's make use of percpu refcnts whose killing latency shouldn't be 3975 * css's make use of percpu refcnts whose killing latency shouldn't be
3931 * exposed to userland and are RCU protected. Also, cgroup core needs to 3976 * exposed to userland and are RCU protected. Also, cgroup core needs to
3932 * guarantee that css_tryget() won't succeed by the time ->css_offline() is 3977 * guarantee that css_tryget() won't succeed by the time ->css_offline() is
3933 * invoked. To satisfy all the requirements, destruction is implemented in 3978 * invoked. To satisfy all the requirements, destruction is implemented in
3934 * the following two steps. 3979 * the following two steps.
3935 * 3980 *
3936 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all 3981 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
3937 * userland visible parts and start killing the percpu refcnts of 3982 * userland visible parts and start killing the percpu refcnts of
3938 * css's. Set up so that the next stage will be kicked off once all 3983 * css's. Set up so that the next stage will be kicked off once all
3939 * the percpu refcnts are confirmed to be killed. 3984 * the percpu refcnts are confirmed to be killed.
3940 * 3985 *
3941 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the 3986 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
3942 * rest of destruction. Once all cgroup references are gone, the 3987 * rest of destruction. Once all cgroup references are gone, the
3943 * cgroup is RCU-freed. 3988 * cgroup is RCU-freed.
3944 * 3989 *
3945 * This function implements s1. After this step, @cgrp is gone as far as 3990 * This function implements s1. After this step, @cgrp is gone as far as
3946 * the userland is concerned and a new cgroup with the same name may be 3991 * the userland is concerned and a new cgroup with the same name may be
3947 * created. As cgroup doesn't care about the names internally, this 3992 * created. As cgroup doesn't care about the names internally, this
3948 * doesn't cause any problem. 3993 * doesn't cause any problem.
3949 */ 3994 */
3950 static int cgroup_destroy_locked(struct cgroup *cgrp) 3995 static int cgroup_destroy_locked(struct cgroup *cgrp)
3951 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 3996 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
3952 { 3997 {
3953 struct cgroup *child; 3998 struct cgroup *child;
3954 struct cgroup_subsys_state *css; 3999 struct cgroup_subsys_state *css;
3955 bool empty; 4000 bool empty;
3956 int ssid; 4001 int ssid;
3957 4002
3958 lockdep_assert_held(&cgroup_tree_mutex); 4003 lockdep_assert_held(&cgroup_tree_mutex);
3959 lockdep_assert_held(&cgroup_mutex); 4004 lockdep_assert_held(&cgroup_mutex);
3960 4005
3961 /* 4006 /*
3962 * css_set_rwsem synchronizes access to ->cset_links and prevents 4007 * css_set_rwsem synchronizes access to ->cset_links and prevents
3963 * @cgrp from being removed while put_css_set() is in progress. 4008 * @cgrp from being removed while put_css_set() is in progress.
3964 */ 4009 */
3965 down_read(&css_set_rwsem); 4010 down_read(&css_set_rwsem);
3966 empty = list_empty(&cgrp->cset_links); 4011 empty = list_empty(&cgrp->cset_links);
3967 up_read(&css_set_rwsem); 4012 up_read(&css_set_rwsem);
3968 if (!empty) 4013 if (!empty)
3969 return -EBUSY; 4014 return -EBUSY;
3970 4015
3971 /* 4016 /*
3972 * Make sure there are no live children. We can't test ->children 4017 * Make sure there are no live children. We can't test ->children
3973 * emptiness as dead children linger on it while being destroyed; 4018 * emptiness as dead children linger on it while being destroyed;
3974 * otherwise, "rmdir parent/child parent" may fail with -EBUSY. 4019 * otherwise, "rmdir parent/child parent" may fail with -EBUSY.
3975 */ 4020 */
3976 empty = true; 4021 empty = true;
3977 rcu_read_lock(); 4022 rcu_read_lock();
3978 list_for_each_entry_rcu(child, &cgrp->children, sibling) { 4023 list_for_each_entry_rcu(child, &cgrp->children, sibling) {
3979 empty = cgroup_is_dead(child); 4024 empty = cgroup_is_dead(child);
3980 if (!empty) 4025 if (!empty)
3981 break; 4026 break;
3982 } 4027 }
3983 rcu_read_unlock(); 4028 rcu_read_unlock();
3984 if (!empty) 4029 if (!empty)
3985 return -EBUSY; 4030 return -EBUSY;
3986 4031
3987 /* 4032 /*
3988 * Mark @cgrp dead. This prevents further task migration and child 4033 * Mark @cgrp dead. This prevents further task migration and child
3989 * creation by disabling cgroup_lock_live_group(). Note that 4034 * creation by disabling cgroup_lock_live_group(). Note that
3990 * CGRP_DEAD assertion is depended upon by css_next_child() to 4035 * CGRP_DEAD assertion is depended upon by css_next_child() to
3991 * resume iteration after dropping RCU read lock. See 4036 * resume iteration after dropping RCU read lock. See
3992 * css_next_child() for details. 4037 * css_next_child() for details.
3993 */ 4038 */
3994 set_bit(CGRP_DEAD, &cgrp->flags); 4039 set_bit(CGRP_DEAD, &cgrp->flags);
3995 4040
3996 /* 4041 /*
3997 * Initiate massacre of all css's. cgroup_destroy_css_killed() 4042 * Initiate massacre of all css's. cgroup_destroy_css_killed()
3998 * will be invoked to perform the rest of destruction once the 4043 * will be invoked to perform the rest of destruction once the
3999 * percpu refs of all css's are confirmed to be killed. This 4044 * percpu refs of all css's are confirmed to be killed. This
4000 * involves removing the subsystem's files, so drop cgroup_mutex. 4045 * involves removing the subsystem's files, so drop cgroup_mutex.
4001 */ 4046 */
4002 mutex_unlock(&cgroup_mutex); 4047 mutex_unlock(&cgroup_mutex);
4003 for_each_css(css, ssid, cgrp) 4048 for_each_css(css, ssid, cgrp)
4004 kill_css(css); 4049 kill_css(css);
4005 mutex_lock(&cgroup_mutex); 4050 mutex_lock(&cgroup_mutex);
4006 4051
4007 /* CGRP_DEAD is set, remove from ->release_list for the last time */ 4052 /* CGRP_DEAD is set, remove from ->release_list for the last time */
4008 raw_spin_lock(&release_list_lock); 4053 raw_spin_lock(&release_list_lock);
4009 if (!list_empty(&cgrp->release_list)) 4054 if (!list_empty(&cgrp->release_list))
4010 list_del_init(&cgrp->release_list); 4055 list_del_init(&cgrp->release_list);
4011 raw_spin_unlock(&release_list_lock); 4056 raw_spin_unlock(&release_list_lock);
4012 4057
4013 /* 4058 /*
4014 * If @cgrp has css's attached, the second stage of cgroup 4059 * If @cgrp has css's attached, the second stage of cgroup
4015 * destruction is kicked off from css_killed_work_fn() after the 4060 * destruction is kicked off from css_killed_work_fn() after the
4016 * refs of all attached css's are killed. If @cgrp doesn't have 4061 * refs of all attached css's are killed. If @cgrp doesn't have
4017 * any css, we kick it off here. 4062 * any css, we kick it off here.
4018 */ 4063 */
4019 if (!cgrp->nr_css) 4064 if (!cgrp->nr_css)
4020 cgroup_destroy_css_killed(cgrp); 4065 cgroup_destroy_css_killed(cgrp);
4021 4066
4022 /* remove @cgrp directory along with the base files */ 4067 /* remove @cgrp directory along with the base files */
4023 mutex_unlock(&cgroup_mutex); 4068 mutex_unlock(&cgroup_mutex);
4024 4069
4025 /* 4070 /*
4026 * There are two control paths which try to determine cgroup from 4071 * There are two control paths which try to determine cgroup from
4027 * dentry without going through kernfs - cgroupstats_build() and 4072 * dentry without going through kernfs - cgroupstats_build() and
4028 * css_tryget_from_dir(). Those are supported by RCU protecting 4073 * css_tryget_from_dir(). Those are supported by RCU protecting
4029 * clearing of cgrp->kn->priv backpointer, which should happen 4074 * clearing of cgrp->kn->priv backpointer, which should happen
4030 * after all files under it have been removed. 4075 * after all files under it have been removed.
4031 */ 4076 */
4032 kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ 4077 kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */
4033 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); 4078 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
4034 4079
4035 mutex_lock(&cgroup_mutex); 4080 mutex_lock(&cgroup_mutex);
4036 4081
4037 return 0; 4082 return 0;
4038 }; 4083 };
4039 4084
4040 /** 4085 /**
4041 * cgroup_destroy_css_killed - the second step of cgroup destruction 4086 * cgroup_destroy_css_killed - the second step of cgroup destruction
4042 * @cgrp: the cgroup whose css's have been killed 4087 * @cgrp: the cgroup whose css's have been killed
4043 * 4088 *
4044 * This function is invoked from a work item for a cgroup which is being 4089 * This function is invoked from a work item for a cgroup which is being
4045 * destroyed after all css's are offlined and performs the rest of 4090 * destroyed after all css's are offlined and performs the rest of
4046 * destruction. This is the second step of destruction described in the 4091 * destruction. This is the second step of destruction described in the
4047 * comment above cgroup_destroy_locked(). 4092 * comment above cgroup_destroy_locked().
4048 */ 4093 */
4049 static void cgroup_destroy_css_killed(struct cgroup *cgrp) 4094 static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4050 { 4095 {
4051 struct cgroup *parent = cgrp->parent; 4096 struct cgroup *parent = cgrp->parent;
4052 4097
4053 lockdep_assert_held(&cgroup_tree_mutex); 4098 lockdep_assert_held(&cgroup_tree_mutex);
4054 lockdep_assert_held(&cgroup_mutex); 4099 lockdep_assert_held(&cgroup_mutex);
4055 4100
4056 /* delete this cgroup from parent->children */ 4101 /* delete this cgroup from parent->children */
4057 list_del_rcu(&cgrp->sibling); 4102 list_del_rcu(&cgrp->sibling);
4058 4103
4059 cgroup_put(cgrp); 4104 cgroup_put(cgrp);
4060 4105
4061 set_bit(CGRP_RELEASABLE, &parent->flags); 4106 set_bit(CGRP_RELEASABLE, &parent->flags);
4062 check_for_release(parent); 4107 check_for_release(parent);
4063 } 4108 }
4064 4109
4065 static int cgroup_rmdir(struct kernfs_node *kn) 4110 static int cgroup_rmdir(struct kernfs_node *kn)
4066 { 4111 {
4067 struct cgroup *cgrp = kn->priv; 4112 struct cgroup *cgrp = kn->priv;
4068 int ret = 0; 4113 int ret = 0;
4069 4114
4070 /* 4115 /*
4071 * This is self-destruction but @kn can't be removed while this 4116 * This is self-destruction but @kn can't be removed while this
4072 * callback is in progress. Let's break active protection. Once 4117 * callback is in progress. Let's break active protection. Once
4073 * the protection is broken, @cgrp can be destroyed at any point. 4118 * the protection is broken, @cgrp can be destroyed at any point.
4074 * Pin it so that it stays accessible. 4119 * Pin it so that it stays accessible.
4075 */ 4120 */
4076 cgroup_get(cgrp); 4121 cgroup_get(cgrp);
4077 kernfs_break_active_protection(kn); 4122 kernfs_break_active_protection(kn);
4078 4123
4079 mutex_lock(&cgroup_tree_mutex); 4124 mutex_lock(&cgroup_tree_mutex);
4080 mutex_lock(&cgroup_mutex); 4125 mutex_lock(&cgroup_mutex);
4081 4126
4082 /* 4127 /*
4083 * @cgrp might already have been destroyed while we're trying to 4128 * @cgrp might already have been destroyed while we're trying to
4084 * grab the mutexes. 4129 * grab the mutexes.
4085 */ 4130 */
4086 if (!cgroup_is_dead(cgrp)) 4131 if (!cgroup_is_dead(cgrp))
4087 ret = cgroup_destroy_locked(cgrp); 4132 ret = cgroup_destroy_locked(cgrp);
4088 4133
4089 mutex_unlock(&cgroup_mutex); 4134 mutex_unlock(&cgroup_mutex);
4090 mutex_unlock(&cgroup_tree_mutex); 4135 mutex_unlock(&cgroup_tree_mutex);
4091 4136
4092 kernfs_unbreak_active_protection(kn); 4137 kernfs_unbreak_active_protection(kn);
4093 cgroup_put(cgrp); 4138 cgroup_put(cgrp);
4094 return ret; 4139 return ret;
4095 } 4140 }
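Correspondingly, rmdir(2) ends up in cgroup_rmdir() and cgroup_destroy_locked() above, which returns -EBUSY while the group still has tasks or live children. A hedged sketch with an assumed path:

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	if (rmdir("/sys/fs/cgroup/memory/example") == -1 && errno == EBUSY)
		fprintf(stderr, "cgroup still has tasks or child cgroups\n");
	return 0;
}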
4096 4141
4097 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { 4142 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4098 .remount_fs = cgroup_remount, 4143 .remount_fs = cgroup_remount,
4099 .show_options = cgroup_show_options, 4144 .show_options = cgroup_show_options,
4100 .mkdir = cgroup_mkdir, 4145 .mkdir = cgroup_mkdir,
4101 .rmdir = cgroup_rmdir, 4146 .rmdir = cgroup_rmdir,
4102 .rename = cgroup_rename, 4147 .rename = cgroup_rename,
4103 }; 4148 };
4104 4149
4105 static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4150 static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4106 { 4151 {
4107 struct cgroup_subsys_state *css; 4152 struct cgroup_subsys_state *css;
4108 4153
4109 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4154 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4110 4155
4111 mutex_lock(&cgroup_tree_mutex); 4156 mutex_lock(&cgroup_tree_mutex);
4112 mutex_lock(&cgroup_mutex); 4157 mutex_lock(&cgroup_mutex);
4113 4158
4114 INIT_LIST_HEAD(&ss->cfts); 4159 INIT_LIST_HEAD(&ss->cfts);
4115 4160
4116 /* Create the root cgroup state for this subsystem */ 4161 /* Create the root cgroup state for this subsystem */
4117 ss->root = &cgrp_dfl_root; 4162 ss->root = &cgrp_dfl_root;
4118 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); 4163 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4119 /* We don't handle early failures gracefully */ 4164 /* We don't handle early failures gracefully */
4120 BUG_ON(IS_ERR(css)); 4165 BUG_ON(IS_ERR(css));
4121 init_css(css, ss, &cgrp_dfl_root.cgrp); 4166 init_css(css, ss, &cgrp_dfl_root.cgrp);
4122 4167
4123 /* Update the init_css_set to contain a subsys 4168 /* Update the init_css_set to contain a subsys
4124 * pointer to this state - since the subsystem is 4169 * pointer to this state - since the subsystem is
4125 * newly registered, all tasks and hence the 4170 * newly registered, all tasks and hence the
4126 * init_css_set are in the subsystem's root cgroup. */ 4171 * init_css_set are in the subsystem's root cgroup. */
4127 init_css_set.subsys[ss->id] = css; 4172 init_css_set.subsys[ss->id] = css;
4128 4173
4129 need_forkexit_callback |= ss->fork || ss->exit; 4174 need_forkexit_callback |= ss->fork || ss->exit;
4130 4175
4131 /* At system boot, before all subsystems have been 4176 /* At system boot, before all subsystems have been
4132 * registered, no tasks have been forked, so we don't 4177 * registered, no tasks have been forked, so we don't
4133 * need to invoke fork callbacks here. */ 4178 * need to invoke fork callbacks here. */
4134 BUG_ON(!list_empty(&init_task.tasks)); 4179 BUG_ON(!list_empty(&init_task.tasks));
4135 4180
4136 BUG_ON(online_css(css)); 4181 BUG_ON(online_css(css));
4137 4182
4138 cgrp_dfl_root.subsys_mask |= 1 << ss->id; 4183 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4139 4184
4140 mutex_unlock(&cgroup_mutex); 4185 mutex_unlock(&cgroup_mutex);
4141 mutex_unlock(&cgroup_tree_mutex); 4186 mutex_unlock(&cgroup_tree_mutex);
4142 } 4187 }
4143 4188
4144 /** 4189 /**
4145 * cgroup_init_early - cgroup initialization at system boot 4190 * cgroup_init_early - cgroup initialization at system boot
4146 * 4191 *
4147 * Initialize cgroups at system boot, and initialize any 4192 * Initialize cgroups at system boot, and initialize any
4148 * subsystems that request early init. 4193 * subsystems that request early init.
4149 */ 4194 */
4150 int __init cgroup_init_early(void) 4195 int __init cgroup_init_early(void)
4151 { 4196 {
4152 static struct cgroup_sb_opts __initdata opts = 4197 static struct cgroup_sb_opts __initdata opts =
4153 { .flags = CGRP_ROOT_SANE_BEHAVIOR }; 4198 { .flags = CGRP_ROOT_SANE_BEHAVIOR };
4154 struct cgroup_subsys *ss; 4199 struct cgroup_subsys *ss;
4155 int i; 4200 int i;
4156 4201
4157 init_cgroup_root(&cgrp_dfl_root, &opts); 4202 init_cgroup_root(&cgrp_dfl_root, &opts);
4158 RCU_INIT_POINTER(init_task.cgroups, &init_css_set); 4203 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4159 4204
4160 for_each_subsys(ss, i) { 4205 for_each_subsys(ss, i) {
4161 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, 4206 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
4162 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n", 4207 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
4163 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, 4208 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
4164 ss->id, ss->name); 4209 ss->id, ss->name);
4165 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, 4210 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
4166 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); 4211 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
4167 4212
4168 ss->id = i; 4213 ss->id = i;
4169 ss->name = cgroup_subsys_name[i]; 4214 ss->name = cgroup_subsys_name[i];
4170 4215
4171 if (ss->early_init) 4216 if (ss->early_init)
4172 cgroup_init_subsys(ss); 4217 cgroup_init_subsys(ss);
4173 } 4218 }
4174 return 0; 4219 return 0;
4175 } 4220 }
4176 4221
4177 /** 4222 /**
4178 * cgroup_init - cgroup initialization 4223 * cgroup_init - cgroup initialization
4179 * 4224 *
4180 * Register cgroup filesystem and /proc file, and initialize 4225 * Register cgroup filesystem and /proc file, and initialize
4181 * any subsystems that didn't request early init. 4226 * any subsystems that didn't request early init.
4182 */ 4227 */
4183 int __init cgroup_init(void) 4228 int __init cgroup_init(void)
4184 { 4229 {
4185 struct cgroup_subsys *ss; 4230 struct cgroup_subsys *ss;
4186 unsigned long key; 4231 unsigned long key;
4187 int ssid, err; 4232 int ssid, err;
4188 4233
4189 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); 4234 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
4190 4235
4191 mutex_lock(&cgroup_tree_mutex); 4236 mutex_lock(&cgroup_tree_mutex);
4192 mutex_lock(&cgroup_mutex); 4237 mutex_lock(&cgroup_mutex);
4193 4238
4194 /* Add init_css_set to the hash table */ 4239 /* Add init_css_set to the hash table */
4195 key = css_set_hash(init_css_set.subsys); 4240 key = css_set_hash(init_css_set.subsys);
4196 hash_add(css_set_table, &init_css_set.hlist, key); 4241 hash_add(css_set_table, &init_css_set.hlist, key);
4197 4242
4198 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); 4243 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
4199 4244
4200 mutex_unlock(&cgroup_mutex); 4245 mutex_unlock(&cgroup_mutex);
4201 mutex_unlock(&cgroup_tree_mutex); 4246 mutex_unlock(&cgroup_tree_mutex);
4202 4247
4203 for_each_subsys(ss, ssid) { 4248 for_each_subsys(ss, ssid) {
4204 if (!ss->early_init) 4249 if (!ss->early_init)
4205 cgroup_init_subsys(ss); 4250 cgroup_init_subsys(ss);
4206 4251
4207 /* 4252 /*
4208 * cftype registration needs kmalloc and can't be done 4253 * cftype registration needs kmalloc and can't be done
4209 * during early_init. Register base cftypes separately. 4254 * during early_init. Register base cftypes separately.
4210 */ 4255 */
4211 if (ss->base_cftypes) 4256 if (ss->base_cftypes)
4212 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); 4257 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
4213 } 4258 }
4214 4259
4215 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4260 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4216 if (!cgroup_kobj) 4261 if (!cgroup_kobj)
4217 return -ENOMEM; 4262 return -ENOMEM;
4218 4263
4219 err = register_filesystem(&cgroup_fs_type); 4264 err = register_filesystem(&cgroup_fs_type);
4220 if (err < 0) { 4265 if (err < 0) {
4221 kobject_put(cgroup_kobj); 4266 kobject_put(cgroup_kobj);
4222 return err; 4267 return err;
4223 } 4268 }
4224 4269
4225 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); 4270 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
4226 return 0; 4271 return 0;
4227 } 4272 }
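Once register_filesystem() above has succeeded, hierarchies can be mounted from userspace. A hedged sketch of attaching a single controller; the mount point is an assumption:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* mount a cgroup hierarchy with only the memory controller attached */
	if (mount("cgroup", "/sys/fs/cgroup/memory", "cgroup", 0, "memory") == -1)
		perror("mount");
	return 0;
}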
4228 4273
4229 static int __init cgroup_wq_init(void) 4274 static int __init cgroup_wq_init(void)
4230 { 4275 {
4231 /* 4276 /*
4232 * There isn't much point in executing destruction path in 4277 * There isn't much point in executing destruction path in
4233 * parallel. Good chunk is serialized with cgroup_mutex anyway. 4278 * parallel. Good chunk is serialized with cgroup_mutex anyway.
4234 * Use 1 for @max_active. 4279 * Use 1 for @max_active.
4235 * 4280 *
4236 * We would prefer to do this in cgroup_init() above, but that 4281 * We would prefer to do this in cgroup_init() above, but that
4237 * is called before init_workqueues(): so leave this until after. 4282 * is called before init_workqueues(): so leave this until after.
4238 */ 4283 */
4239 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); 4284 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
4240 BUG_ON(!cgroup_destroy_wq); 4285 BUG_ON(!cgroup_destroy_wq);
4241 4286
4242 /* 4287 /*
4243 * Used to destroy pidlists and separate to serve as flush domain. 4288 * Used to destroy pidlists and separate to serve as flush domain.
4244 * Cap @max_active to 1 too. 4289 * Cap @max_active to 1 too.
4245 */ 4290 */
4246 cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", 4291 cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
4247 0, 1); 4292 0, 1);
4248 BUG_ON(!cgroup_pidlist_destroy_wq); 4293 BUG_ON(!cgroup_pidlist_destroy_wq);
4249 4294
4250 return 0; 4295 return 0;
4251 } 4296 }
4252 core_initcall(cgroup_wq_init); 4297 core_initcall(cgroup_wq_init);
4253 4298
4254 /* 4299 /*
4255 * proc_cgroup_show() 4300 * proc_cgroup_show()
4256 * - Print task's cgroup paths into seq_file, one line for each hierarchy 4301 * - Print task's cgroup paths into seq_file, one line for each hierarchy
4257 * - Used for /proc/<pid>/cgroup. 4302 * - Used for /proc/<pid>/cgroup.
4258 */ 4303 */
4259 4304
4260 /* TODO: Use a proper seq_file iterator */ 4305 /* TODO: Use a proper seq_file iterator */
4261 int proc_cgroup_show(struct seq_file *m, void *v) 4306 int proc_cgroup_show(struct seq_file *m, void *v)
4262 { 4307 {
4263 struct pid *pid; 4308 struct pid *pid;
4264 struct task_struct *tsk; 4309 struct task_struct *tsk;
4265 char *buf, *path; 4310 char *buf, *path;
4266 int retval; 4311 int retval;
4267 struct cgroup_root *root; 4312 struct cgroup_root *root;
4268 4313
4269 retval = -ENOMEM; 4314 retval = -ENOMEM;
4270 buf = kmalloc(PATH_MAX, GFP_KERNEL); 4315 buf = kmalloc(PATH_MAX, GFP_KERNEL);
4271 if (!buf) 4316 if (!buf)
4272 goto out; 4317 goto out;
4273 4318
4274 retval = -ESRCH; 4319 retval = -ESRCH;
4275 pid = m->private; 4320 pid = m->private;
4276 tsk = get_pid_task(pid, PIDTYPE_PID); 4321 tsk = get_pid_task(pid, PIDTYPE_PID);
4277 if (!tsk) 4322 if (!tsk)
4278 goto out_free; 4323 goto out_free;
4279 4324
4280 retval = 0; 4325 retval = 0;
4281 4326
4282 mutex_lock(&cgroup_mutex); 4327 mutex_lock(&cgroup_mutex);
4283 down_read(&css_set_rwsem); 4328 down_read(&css_set_rwsem);
4284 4329
4285 for_each_root(root) { 4330 for_each_root(root) {
4286 struct cgroup_subsys *ss; 4331 struct cgroup_subsys *ss;
4287 struct cgroup *cgrp; 4332 struct cgroup *cgrp;
4288 int ssid, count = 0; 4333 int ssid, count = 0;
4289 4334
4290 if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible) 4335 if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
4291 continue; 4336 continue;
4292 4337
4293 seq_printf(m, "%d:", root->hierarchy_id); 4338 seq_printf(m, "%d:", root->hierarchy_id);
4294 for_each_subsys(ss, ssid) 4339 for_each_subsys(ss, ssid)
4295 if (root->subsys_mask & (1 << ssid)) 4340 if (root->subsys_mask & (1 << ssid))
4296 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4341 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4297 if (strlen(root->name)) 4342 if (strlen(root->name))
4298 seq_printf(m, "%sname=%s", count ? "," : "", 4343 seq_printf(m, "%sname=%s", count ? "," : "",
4299 root->name); 4344 root->name);
4300 seq_putc(m, ':'); 4345 seq_putc(m, ':');
4301 cgrp = task_cgroup_from_root(tsk, root); 4346 cgrp = task_cgroup_from_root(tsk, root);
4302 path = cgroup_path(cgrp, buf, PATH_MAX); 4347 path = cgroup_path(cgrp, buf, PATH_MAX);
4303 if (!path) { 4348 if (!path) {
4304 retval = -ENAMETOOLONG; 4349 retval = -ENAMETOOLONG;
4305 goto out_unlock; 4350 goto out_unlock;
4306 } 4351 }
4307 seq_puts(m, path); 4352 seq_puts(m, path);
4308 seq_putc(m, '\n'); 4353 seq_putc(m, '\n');
4309 } 4354 }
4310 4355
4311 out_unlock: 4356 out_unlock:
4312 up_read(&css_set_rwsem); 4357 up_read(&css_set_rwsem);
4313 mutex_unlock(&cgroup_mutex); 4358 mutex_unlock(&cgroup_mutex);
4314 put_task_struct(tsk); 4359 put_task_struct(tsk);
4315 out_free: 4360 out_free:
4316 kfree(buf); 4361 kfree(buf);
4317 out: 4362 out:
4318 return retval; 4363 return retval;
4319 } 4364 }
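For reference, proc_cgroup_show() emits one line per hierarchy into /proc/<pid>/cgroup in the form "hierarchy-id:controller-list[,name=<name>]:path"; the IDs, controllers and paths below are illustrative only:

	5:memory:/example
	4:cpu,cpuacct:/
	1:name=systemd:/user.slice/user-1000.slice/session-2.scope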
4320 4365
4321 /* Display information about each subsystem and each hierarchy */ 4366 /* Display information about each subsystem and each hierarchy */
4322 static int proc_cgroupstats_show(struct seq_file *m, void *v) 4367 static int proc_cgroupstats_show(struct seq_file *m, void *v)
4323 { 4368 {
4324 struct cgroup_subsys *ss; 4369 struct cgroup_subsys *ss;
4325 int i; 4370 int i;
4326 4371
4327 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 4372 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
4328 /* 4373 /*
4329 * ideally we don't want subsystems moving around while we do this. 4374 * ideally we don't want subsystems moving around while we do this.
4330 * cgroup_mutex is also necessary to guarantee an atomic snapshot of 4375 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
4331 * subsys/hierarchy state. 4376 * subsys/hierarchy state.
4332 */ 4377 */
4333 mutex_lock(&cgroup_mutex); 4378 mutex_lock(&cgroup_mutex);
4334 4379
4335 for_each_subsys(ss, i) 4380 for_each_subsys(ss, i)
4336 seq_printf(m, "%s\t%d\t%d\t%d\n", 4381 seq_printf(m, "%s\t%d\t%d\t%d\n",
4337 ss->name, ss->root->hierarchy_id, 4382 ss->name, ss->root->hierarchy_id,
4338 atomic_read(&ss->root->nr_cgrps), !ss->disabled); 4383 atomic_read(&ss->root->nr_cgrps), !ss->disabled);
4339 4384
4340 mutex_unlock(&cgroup_mutex); 4385 mutex_unlock(&cgroup_mutex);
4341 return 0; 4386 return 0;
4342 } 4387 }
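Likewise, proc_cgroupstats_show() produces /proc/cgroups in the tab-separated layout printed above; the numbers below are illustrative only:

	#subsys_name	hierarchy	num_cgroups	enabled
	cpuset	6	1	1
	cpu	4	12	1
	memory	5	37	1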
4343 4388
4344 static int cgroupstats_open(struct inode *inode, struct file *file) 4389 static int cgroupstats_open(struct inode *inode, struct file *file)
4345 { 4390 {
4346 return single_open(file, proc_cgroupstats_show, NULL); 4391 return single_open(file, proc_cgroupstats_show, NULL);
4347 } 4392 }
4348 4393
4349 static const struct file_operations proc_cgroupstats_operations = { 4394 static const struct file_operations proc_cgroupstats_operations = {
4350 .open = cgroupstats_open, 4395 .open = cgroupstats_open,
4351 .read = seq_read, 4396 .read = seq_read,
4352 .llseek = seq_lseek, 4397 .llseek = seq_lseek,
4353 .release = single_release, 4398 .release = single_release,
4354 }; 4399 };
4355 4400
4356 /** 4401 /**
4357 * cgroup_fork - initialize cgroup related fields during copy_process() 4402 * cgroup_fork - initialize cgroup related fields during copy_process()
4358 * @child: pointer to task_struct of the child process being forked. 4403 * @child: pointer to task_struct of the child process being forked.
4359 * 4404 *
4360 * A task is associated with the init_css_set until cgroup_post_fork() 4405 * A task is associated with the init_css_set until cgroup_post_fork()
4361 * attaches it to the parent's css_set. Empty cg_list indicates that 4406 * attaches it to the parent's css_set. Empty cg_list indicates that
4362 * @child isn't holding reference to its css_set. 4407 * @child isn't holding reference to its css_set.
4363 */ 4408 */
4364 void cgroup_fork(struct task_struct *child) 4409 void cgroup_fork(struct task_struct *child)
4365 { 4410 {
4366 RCU_INIT_POINTER(child->cgroups, &init_css_set); 4411 RCU_INIT_POINTER(child->cgroups, &init_css_set);
4367 INIT_LIST_HEAD(&child->cg_list); 4412 INIT_LIST_HEAD(&child->cg_list);
4368 } 4413 }
4369 4414
4370 /** 4415 /**
4371 * cgroup_post_fork - called on a new task after adding it to the task list 4416 * cgroup_post_fork - called on a new task after adding it to the task list
4372 * @child: the task in question 4417 * @child: the task in question
4373 * 4418 *
4374 * Adds the task to the list running through its css_set if necessary and 4419 * Adds the task to the list running through its css_set if necessary and
4375 * calls the subsystem fork() callbacks. Has to be after the task is 4420 * calls the subsystem fork() callbacks. Has to be after the task is
4376 * visible on the task list in case we race with the first call to 4421 * visible on the task list in case we race with the first call to
4377 * cgroup_task_iter_start() - to guarantee that the new task ends up on its 4422 * cgroup_task_iter_start() - to guarantee that the new task ends up on its
4378 * list. 4423 * list.
4379 */ 4424 */
4380 void cgroup_post_fork(struct task_struct *child) 4425 void cgroup_post_fork(struct task_struct *child)
4381 { 4426 {
4382 struct cgroup_subsys *ss; 4427 struct cgroup_subsys *ss;
4383 int i; 4428 int i;
4384 4429
4385 /* 4430 /*
4386 * This may race against cgroup_enable_task_cg_lists(). As that 4431 * This may race against cgroup_enable_task_cg_lists(). As that
4387 * function sets use_task_css_set_links before grabbing 4432 * function sets use_task_css_set_links before grabbing
4388 * tasklist_lock and we just went through tasklist_lock to add 4433 * tasklist_lock and we just went through tasklist_lock to add
4389 * @child, it's guaranteed that either we see the set 4434 * @child, it's guaranteed that either we see the set
4390 * use_task_css_set_links or cgroup_enable_task_cg_lists() sees 4435 * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
4391 * @child during its iteration. 4436 * @child during its iteration.
4392 * 4437 *
4393 * If we won the race, @child is associated with %current's 4438 * If we won the race, @child is associated with %current's
4394 * css_set. Grabbing css_set_rwsem guarantees both that the 4439 * css_set. Grabbing css_set_rwsem guarantees both that the
4395 * association is stable, and, on completion of the parent's 4440 * association is stable, and, on completion of the parent's
4396 * migration, @child is visible in the source of migration or 4441 * migration, @child is visible in the source of migration or
4397 * already in the destination cgroup. This guarantee is necessary 4442 * already in the destination cgroup. This guarantee is necessary
4398 * when implementing operations which need to migrate all tasks of 4443 * when implementing operations which need to migrate all tasks of
4399 * a cgroup to another. 4444 * a cgroup to another.
4400 * 4445 *
4401 * Note that if we lose to cgroup_enable_task_cg_lists(), @child 4446 * Note that if we lose to cgroup_enable_task_cg_lists(), @child
4402 * will remain in init_css_set. This is safe because all tasks are 4447 * will remain in init_css_set. This is safe because all tasks are
4403 * in the init_css_set before cg_links is enabled and there's no 4448 * in the init_css_set before cg_links is enabled and there's no
4404 * operation which transfers all tasks out of init_css_set. 4449 * operation which transfers all tasks out of init_css_set.
4405 */ 4450 */
4406 if (use_task_css_set_links) { 4451 if (use_task_css_set_links) {
4407 struct css_set *cset; 4452 struct css_set *cset;
4408 4453
4409 down_write(&css_set_rwsem); 4454 down_write(&css_set_rwsem);
4410 cset = task_css_set(current); 4455 cset = task_css_set(current);
4411 if (list_empty(&child->cg_list)) { 4456 if (list_empty(&child->cg_list)) {
4412 rcu_assign_pointer(child->cgroups, cset); 4457 rcu_assign_pointer(child->cgroups, cset);
4413 list_add(&child->cg_list, &cset->tasks); 4458 list_add(&child->cg_list, &cset->tasks);
4414 get_css_set(cset); 4459 get_css_set(cset);
4415 } 4460 }
4416 up_write(&css_set_rwsem); 4461 up_write(&css_set_rwsem);
4417 } 4462 }
4418 4463
4419 /* 4464 /*
4420 * Call ss->fork(). This must happen after @child is linked on 4465 * Call ss->fork(). This must happen after @child is linked on
4421 * css_set; otherwise, @child might change state between ->fork() 4466 * css_set; otherwise, @child might change state between ->fork()
4422 * and addition to css_set. 4467 * and addition to css_set.
4423 */ 4468 */
4424 if (need_forkexit_callback) { 4469 if (need_forkexit_callback) {
4425 for_each_subsys(ss, i) 4470 for_each_subsys(ss, i)
4426 if (ss->fork) 4471 if (ss->fork)
4427 ss->fork(child); 4472 ss->fork(child);
4428 } 4473 }
4429 } 4474 }
4430 4475
4431 /** 4476 /**
4432 * cgroup_exit - detach cgroup from exiting task 4477 * cgroup_exit - detach cgroup from exiting task
4433 * @tsk: pointer to task_struct of exiting process 4478 * @tsk: pointer to task_struct of exiting process
4434 * 4479 *
4435 * Description: Detach cgroup from @tsk and release it. 4480 * Description: Detach cgroup from @tsk and release it.
4436 * 4481 *
4437 * Note that cgroups marked notify_on_release force every task in 4482 * Note that cgroups marked notify_on_release force every task in
4438 * them to take the global cgroup_mutex mutex when exiting. 4483 * them to take the global cgroup_mutex mutex when exiting.
4439 * This could impact scaling on very large systems. Be reluctant to 4484 * This could impact scaling on very large systems. Be reluctant to
4440 * use notify_on_release cgroups where very high task exit scaling 4485 * use notify_on_release cgroups where very high task exit scaling
4441 * is required on large systems. 4486 * is required on large systems.
4442 * 4487 *
4443 * We set the exiting task's cgroup to the root cgroup (top_cgroup). We 4488 * We set the exiting task's cgroup to the root cgroup (top_cgroup). We
4444 * call cgroup_exit() while the task is still competent to handle 4489 * call cgroup_exit() while the task is still competent to handle
4445 * notify_on_release(), then leave the task attached to the root cgroup in 4490 * notify_on_release(), then leave the task attached to the root cgroup in
4446 * each hierarchy for the remainder of its exit. No need to bother with 4491 * each hierarchy for the remainder of its exit. No need to bother with
4447 * init_css_set refcnting. init_css_set never goes away and we can't race 4492 * init_css_set refcnting. init_css_set never goes away and we can't race
4448 * with migration path - PF_EXITING is visible to migration path. 4493 * with migration path - PF_EXITING is visible to migration path.
4449 */ 4494 */
4450 void cgroup_exit(struct task_struct *tsk) 4495 void cgroup_exit(struct task_struct *tsk)
4451 { 4496 {
4452 struct cgroup_subsys *ss; 4497 struct cgroup_subsys *ss;
4453 struct css_set *cset; 4498 struct css_set *cset;
4454 bool put_cset = false; 4499 bool put_cset = false;
4455 int i; 4500 int i;
4456 4501
4457 /* 4502 /*
4458 * Unlink @tsk from its css_set. As migration path can't race 4503 * Unlink @tsk from its css_set. As migration path can't race
4459 * with us, we can check cg_list without grabbing css_set_rwsem. 4504 * with us, we can check cg_list without grabbing css_set_rwsem.
4460 */ 4505 */
4461 if (!list_empty(&tsk->cg_list)) { 4506 if (!list_empty(&tsk->cg_list)) {
4462 down_write(&css_set_rwsem); 4507 down_write(&css_set_rwsem);
4463 list_del_init(&tsk->cg_list); 4508 list_del_init(&tsk->cg_list);
4464 up_write(&css_set_rwsem); 4509 up_write(&css_set_rwsem);
4465 put_cset = true; 4510 put_cset = true;
4466 } 4511 }
4467 4512
4468 /* Reassign the task to the init_css_set. */ 4513 /* Reassign the task to the init_css_set. */
4469 cset = task_css_set(tsk); 4514 cset = task_css_set(tsk);
4470 RCU_INIT_POINTER(tsk->cgroups, &init_css_set); 4515 RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
4471 4516
4472 if (need_forkexit_callback) { 4517 if (need_forkexit_callback) {
4473 /* see cgroup_post_fork() for details */ 4518 /* see cgroup_post_fork() for details */
4474 for_each_subsys(ss, i) { 4519 for_each_subsys(ss, i) {
4475 if (ss->exit) { 4520 if (ss->exit) {
4476 struct cgroup_subsys_state *old_css = cset->subsys[i]; 4521 struct cgroup_subsys_state *old_css = cset->subsys[i];
4477 struct cgroup_subsys_state *css = task_css(tsk, i); 4522 struct cgroup_subsys_state *css = task_css(tsk, i);
4478 4523
4479 ss->exit(css, old_css, tsk); 4524 ss->exit(css, old_css, tsk);
4480 } 4525 }
4481 } 4526 }
4482 } 4527 }
4483 4528
4484 if (put_cset) 4529 if (put_cset)
4485 put_css_set(cset, true); 4530 put_css_set(cset, true);
4486 } 4531 }
4487 4532
4488 static void check_for_release(struct cgroup *cgrp) 4533 static void check_for_release(struct cgroup *cgrp)
4489 { 4534 {
4490 if (cgroup_is_releasable(cgrp) && 4535 if (cgroup_is_releasable(cgrp) &&
4491 list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { 4536 list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
4492 /* 4537 /*
4493 * Control Group is currently removable. If it's not 4538 * Control Group is currently removable. If it's not
4494 * already queued for a userspace notification, queue 4539 * already queued for a userspace notification, queue
4495 * it now 4540 * it now
4496 */ 4541 */
4497 int need_schedule_work = 0; 4542 int need_schedule_work = 0;
4498 4543
4499 raw_spin_lock(&release_list_lock); 4544 raw_spin_lock(&release_list_lock);
4500 if (!cgroup_is_dead(cgrp) && 4545 if (!cgroup_is_dead(cgrp) &&
4501 list_empty(&cgrp->release_list)) { 4546 list_empty(&cgrp->release_list)) {
4502 list_add(&cgrp->release_list, &release_list); 4547 list_add(&cgrp->release_list, &release_list);
4503 need_schedule_work = 1; 4548 need_schedule_work = 1;
4504 } 4549 }
4505 raw_spin_unlock(&release_list_lock); 4550 raw_spin_unlock(&release_list_lock);
4506 if (need_schedule_work) 4551 if (need_schedule_work)
4507 schedule_work(&release_agent_work); 4552 schedule_work(&release_agent_work);
4508 } 4553 }
4509 } 4554 }
4510 4555
4511 /* 4556 /*
4512 * Notify userspace when a cgroup is released, by running the 4557 * Notify userspace when a cgroup is released, by running the
4513 * configured release agent with the name of the cgroup (path 4558 * configured release agent with the name of the cgroup (path
4514 * relative to the root of cgroup file system) as the argument. 4559 * relative to the root of cgroup file system) as the argument.
4515 * 4560 *
4516 * Most likely, this user command will try to rmdir this cgroup. 4561 * Most likely, this user command will try to rmdir this cgroup.
4517 * 4562 *
4518 * This races with the possibility that some other task will be 4563 * This races with the possibility that some other task will be
4519 * attached to this cgroup before it is removed, or that some other 4564 * attached to this cgroup before it is removed, or that some other
4520 * user task will 'mkdir' a child cgroup of this cgroup. That's ok. 4565 * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
4521 * The presumed 'rmdir' will fail quietly if this cgroup is back 4566 * The presumed 'rmdir' will fail quietly if this cgroup is back
4522 * in use, and this cgroup will be reprieved from its death sentence, 4567 * in use, and this cgroup will be reprieved from its death sentence,
4523 * to continue to serve a useful existence. Next time it's released, 4568 * to continue to serve a useful existence. Next time it's released,
4524 * we will get notified again, if it still has 'notify_on_release' set. 4569 * we will get notified again, if it still has 'notify_on_release' set.
4525 * 4570 *
4526 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which 4571 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
4527 * means only wait until the task is successfully execve()'d. The 4572 * means only wait until the task is successfully execve()'d. The
4528 * separate release agent task is forked by call_usermodehelper(), 4573 * separate release agent task is forked by call_usermodehelper(),
4529 * then control in this thread returns here, without waiting for the 4574 * then control in this thread returns here, without waiting for the
4530 * release agent task. We don't bother to wait because the caller of 4575 * release agent task. We don't bother to wait because the caller of
4531 * this routine has no use for the exit status of the release agent 4576 * this routine has no use for the exit status of the release agent
4532 * task, so no sense holding our caller up for that. 4577 * task, so no sense holding our caller up for that.
4533 */ 4578 */
4534 static void cgroup_release_agent(struct work_struct *work) 4579 static void cgroup_release_agent(struct work_struct *work)
4535 { 4580 {
4536 BUG_ON(work != &release_agent_work); 4581 BUG_ON(work != &release_agent_work);
4537 mutex_lock(&cgroup_mutex); 4582 mutex_lock(&cgroup_mutex);
4538 raw_spin_lock(&release_list_lock); 4583 raw_spin_lock(&release_list_lock);
4539 while (!list_empty(&release_list)) { 4584 while (!list_empty(&release_list)) {
4540 char *argv[3], *envp[3]; 4585 char *argv[3], *envp[3];
4541 int i; 4586 int i;
4542 char *pathbuf = NULL, *agentbuf = NULL, *path; 4587 char *pathbuf = NULL, *agentbuf = NULL, *path;
4543 struct cgroup *cgrp = list_entry(release_list.next, 4588 struct cgroup *cgrp = list_entry(release_list.next,
4544 struct cgroup, 4589 struct cgroup,
4545 release_list); 4590 release_list);
4546 list_del_init(&cgrp->release_list); 4591 list_del_init(&cgrp->release_list);
4547 raw_spin_unlock(&release_list_lock); 4592 raw_spin_unlock(&release_list_lock);
4548 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); 4593 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
4549 if (!pathbuf) 4594 if (!pathbuf)
4550 goto continue_free; 4595 goto continue_free;
4551 path = cgroup_path(cgrp, pathbuf, PATH_MAX); 4596 path = cgroup_path(cgrp, pathbuf, PATH_MAX);
4552 if (!path) 4597 if (!path)
4553 goto continue_free; 4598 goto continue_free;
4554 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); 4599 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
4555 if (!agentbuf) 4600 if (!agentbuf)
4556 goto continue_free; 4601 goto continue_free;
4557 4602
4558 i = 0; 4603 i = 0;
4559 argv[i++] = agentbuf; 4604 argv[i++] = agentbuf;
4560 argv[i++] = path; 4605 argv[i++] = path;
4561 argv[i] = NULL; 4606 argv[i] = NULL;
4562 4607
4563 i = 0; 4608 i = 0;
4564 /* minimal command environment */ 4609 /* minimal command environment */
4565 envp[i++] = "HOME=/"; 4610 envp[i++] = "HOME=/";
4566 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; 4611 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
4567 envp[i] = NULL; 4612 envp[i] = NULL;
4568 4613
4569 /* Drop the lock while we invoke the usermode helper, 4614 /* Drop the lock while we invoke the usermode helper,
4570 * since the exec could involve hitting disk and hence 4615 * since the exec could involve hitting disk and hence
4571 * be a slow process */ 4616 * be a slow process */
4572 mutex_unlock(&cgroup_mutex); 4617 mutex_unlock(&cgroup_mutex);
4573 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); 4618 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
4574 mutex_lock(&cgroup_mutex); 4619 mutex_lock(&cgroup_mutex);
4575 continue_free: 4620 continue_free:
4576 kfree(pathbuf); 4621 kfree(pathbuf);
4577 kfree(agentbuf); 4622 kfree(agentbuf);
4578 raw_spin_lock(&release_list_lock); 4623 raw_spin_lock(&release_list_lock);
4579 } 4624 }
4580 raw_spin_unlock(&release_list_lock); 4625 raw_spin_unlock(&release_list_lock);
4581 mutex_unlock(&cgroup_mutex); 4626 mutex_unlock(&cgroup_mutex);
4582 } 4627 }
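
The helper is invoked with the released cgroup's path, relative to the hierarchy root, as argv[1]. A minimal sketch of a userspace release agent, assuming the hierarchy is mounted at /sys/fs/cgroup/foo and that notify_on_release and release_agent have been configured (the mount point is an assumption made for illustration):

	/* Illustrative userspace release agent; not part of this file. */
	#include <stdio.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		char path[4096];

		if (argc < 2)
			return 1;

		/* argv[1] is the released cgroup's path relative to the root */
		snprintf(path, sizeof(path), "/sys/fs/cgroup/foo%s", argv[1]);

		/* typically the agent just removes the now-empty cgroup */
		return rmdir(path) ? 1 : 0;
	}
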
4583 4628
4584 static int __init cgroup_disable(char *str) 4629 static int __init cgroup_disable(char *str)
4585 { 4630 {
4586 struct cgroup_subsys *ss; 4631 struct cgroup_subsys *ss;
4587 char *token; 4632 char *token;
4588 int i; 4633 int i;
4589 4634
4590 while ((token = strsep(&str, ",")) != NULL) { 4635 while ((token = strsep(&str, ",")) != NULL) {
4591 if (!*token) 4636 if (!*token)
4592 continue; 4637 continue;
4593 4638
4594 for_each_subsys(ss, i) { 4639 for_each_subsys(ss, i) {
4595 if (!strcmp(token, ss->name)) { 4640 if (!strcmp(token, ss->name)) {
4596 ss->disabled = 1; 4641 ss->disabled = 1;
4597 printk(KERN_INFO "Disabling %s control group" 4642 printk(KERN_INFO "Disabling %s control group"
4598 " subsystem\n", ss->name); 4643 " subsystem\n", ss->name);
4599 break; 4644 break;
4600 } 4645 }
4601 } 4646 }
4602 } 4647 }
4603 return 1; 4648 return 1;
4604 } 4649 }
4605 __setup("cgroup_disable=", cgroup_disable); 4650 __setup("cgroup_disable=", cgroup_disable);
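
For example, booting with cgroup_disable=memory (the token must match ss->name) marks the memory controller disabled before any hierarchy can be mounted; multiple names may be given separated by commas, e.g. cgroup_disable=memory,blkio.
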
4606 4651
4607 /** 4652 /**
4608 * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir 4653 * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir
4609 * @dentry: directory dentry of interest 4654 * @dentry: directory dentry of interest
4610 * @ss: subsystem of interest 4655 * @ss: subsystem of interest
4611 * 4656 *
4612 * If @dentry is a directory for a cgroup which has @ss enabled on it, try 4657 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
4613 * to get the corresponding css and return it. If such a css doesn't exist 4658 * to get the corresponding css and return it. If such a css doesn't exist
4614 * or can't be pinned, an ERR_PTR value is returned. 4659 * or can't be pinned, an ERR_PTR value is returned.
4615 */ 4660 */
4616 struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, 4661 struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
4617 struct cgroup_subsys *ss) 4662 struct cgroup_subsys *ss)
4618 { 4663 {
4619 struct kernfs_node *kn = kernfs_node_from_dentry(dentry); 4664 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
4620 struct cgroup_subsys_state *css = NULL; 4665 struct cgroup_subsys_state *css = NULL;
4621 struct cgroup *cgrp; 4666 struct cgroup *cgrp;
4622 4667
4623 /* is @dentry a cgroup dir? */ 4668 /* is @dentry a cgroup dir? */
4624 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || 4669 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
4625 kernfs_type(kn) != KERNFS_DIR) 4670 kernfs_type(kn) != KERNFS_DIR)
4626 return ERR_PTR(-EBADF); 4671 return ERR_PTR(-EBADF);
4627 4672
4628 rcu_read_lock(); 4673 rcu_read_lock();
4629 4674
4630 /* 4675 /*
4631 * This path doesn't originate from kernfs and @kn could already 4676 * This path doesn't originate from kernfs and @kn could already
4632 * have been or be removed at any point. @kn->priv is RCU 4677 * have been or be removed at any point. @kn->priv is RCU
4633 * protected for this access. See destroy_locked() for details. 4678 * protected for this access. See destroy_locked() for details.
4634 */ 4679 */
4635 cgrp = rcu_dereference(kn->priv); 4680 cgrp = rcu_dereference(kn->priv);
4636 if (cgrp) 4681 if (cgrp)
4637 css = cgroup_css(cgrp, ss); 4682 css = cgroup_css(cgrp, ss);
4638 4683
4639 if (!css || !css_tryget(css)) 4684 if (!css || !css_tryget(css))
4640 css = ERR_PTR(-ENOENT); 4685 css = ERR_PTR(-ENOENT);
4641 4686
4642 rcu_read_unlock(); 4687 rcu_read_unlock();
4643 return css; 4688 return css;
4644 } 4689 }
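
A sketch of how a caller outside kernfs might use this, given a file descriptor for an open cgroup directory; the helper name and fd handling here are illustrative (the perf_event controller uses a similar pattern), and fdget()/fdput() come from <linux/file.h>:

	/* Illustrative sketch only; not part of this file. */
	static struct cgroup_subsys_state *
	css_get_from_fd(int fd, struct cgroup_subsys *ss)
	{
		struct fd f = fdget(fd);
		struct cgroup_subsys_state *css;

		if (!f.file)
			return ERR_PTR(-EBADF);

		/* returns a pinned css or an ERR_PTR(); caller drops it with css_put() */
		css = css_tryget_from_dir(f.file->f_path.dentry, ss);
		fdput(f);
		return css;
	}
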
4645 4690
4646 /** 4691 /**
4647 * css_from_id - lookup css by id 4692 * css_from_id - lookup css by id
4648 * @id: the cgroup id 4693 * @id: the cgroup id
4649 * @ss: cgroup subsys to be looked into 4694 * @ss: cgroup subsys to be looked into
4650 * 4695 *
4651 * Returns the css if there's a valid one with @id, otherwise returns NULL. 4696 * Returns the css if there's a valid one with @id, otherwise returns NULL.
4652 * Should be called under rcu_read_lock(). 4697 * Should be called under rcu_read_lock().
4653 */ 4698 */
4654 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) 4699 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
4655 { 4700 {
4656 struct cgroup *cgrp; 4701 struct cgroup *cgrp;
4657 4702
4658 cgroup_assert_mutexes_or_rcu_locked(); 4703 cgroup_assert_mutexes_or_rcu_locked();
4659 4704
4660 cgrp = idr_find(&ss->root->cgroup_idr, id); 4705 cgrp = idr_find(&ss->root->cgroup_idr, id);
4661 if (cgrp) 4706 if (cgrp)
4662 return cgroup_css(cgrp, ss); 4707 return cgroup_css(cgrp, ss);
4663 return NULL; 4708 return NULL;
4664 } 4709 }
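
Because the returned css is not pinned, a caller that needs it beyond the RCU read-side critical section must tryget it first; a minimal sketch (id and ss stand for values obtained elsewhere):

	/* Illustrative sketch only; not part of this file. */
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = css_from_id(id, ss);
	if (css && !css_tryget(css))
		css = NULL;
	rcu_read_unlock();

	if (css) {
		/* ... use the pinned css ... */
		css_put(css);
	}
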
4665 4710
4666 #ifdef CONFIG_CGROUP_DEBUG 4711 #ifdef CONFIG_CGROUP_DEBUG
4667 static struct cgroup_subsys_state * 4712 static struct cgroup_subsys_state *
4668 debug_css_alloc(struct cgroup_subsys_state *parent_css) 4713 debug_css_alloc(struct cgroup_subsys_state *parent_css)
4669 { 4714 {
4670 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 4715 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
4671 4716
4672 if (!css) 4717 if (!css)
4673 return ERR_PTR(-ENOMEM); 4718 return ERR_PTR(-ENOMEM);
4674 4719
4675 return css; 4720 return css;
4676 } 4721 }
4677 4722
4678 static void debug_css_free(struct cgroup_subsys_state *css) 4723 static void debug_css_free(struct cgroup_subsys_state *css)
4679 { 4724 {
4680 kfree(css); 4725 kfree(css);
4681 } 4726 }
4682 4727
4683 static u64 debug_taskcount_read(struct cgroup_subsys_state *css, 4728 static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
4684 struct cftype *cft) 4729 struct cftype *cft)
4685 { 4730 {
4686 return cgroup_task_count(css->cgroup); 4731 return cgroup_task_count(css->cgroup);
4687 } 4732 }
4688 4733
4689 static u64 current_css_set_read(struct cgroup_subsys_state *css, 4734 static u64 current_css_set_read(struct cgroup_subsys_state *css,
4690 struct cftype *cft) 4735 struct cftype *cft)
4691 { 4736 {
4692 return (u64)(unsigned long)current->cgroups; 4737 return (u64)(unsigned long)current->cgroups;
4693 } 4738 }
4694 4739
4695 static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, 4740 static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
4696 struct cftype *cft) 4741 struct cftype *cft)
4697 { 4742 {
4698 u64 count; 4743 u64 count;
4699 4744
4700 rcu_read_lock(); 4745 rcu_read_lock();
4701 count = atomic_read(&task_css_set(current)->refcount); 4746 count = atomic_read(&task_css_set(current)->refcount);
4702 rcu_read_unlock(); 4747 rcu_read_unlock();
4703 return count; 4748 return count;
4704 } 4749 }
4705 4750
4706 static int current_css_set_cg_links_read(struct seq_file *seq, void *v) 4751 static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
4707 { 4752 {
4708 struct cgrp_cset_link *link; 4753 struct cgrp_cset_link *link;
4709 struct css_set *cset; 4754 struct css_set *cset;
4710 char *name_buf; 4755 char *name_buf;
4711 4756
4712 name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); 4757 name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
4713 if (!name_buf) 4758 if (!name_buf)
4714 return -ENOMEM; 4759 return -ENOMEM;
4715 4760
4716 down_read(&css_set_rwsem); 4761 down_read(&css_set_rwsem);
4717 rcu_read_lock(); 4762 rcu_read_lock();
4718 cset = rcu_dereference(current->cgroups); 4763 cset = rcu_dereference(current->cgroups);
4719 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { 4764 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
4720 struct cgroup *c = link->cgrp; 4765 struct cgroup *c = link->cgrp;
4721 4766
4722 cgroup_name(c, name_buf, NAME_MAX + 1); 4767 cgroup_name(c, name_buf, NAME_MAX + 1);
4723 seq_printf(seq, "Root %d group %s\n", 4768 seq_printf(seq, "Root %d group %s\n",
4724 c->root->hierarchy_id, name_buf); 4769 c->root->hierarchy_id, name_buf);
4725 } 4770 }
4726 rcu_read_unlock(); 4771 rcu_read_unlock();
4727 up_read(&css_set_rwsem); 4772 up_read(&css_set_rwsem);
4728 kfree(name_buf); 4773 kfree(name_buf);
4729 return 0; 4774 return 0;
4730 } 4775 }
4731 4776
4732 #define MAX_TASKS_SHOWN_PER_CSS 25 4777 #define MAX_TASKS_SHOWN_PER_CSS 25
4733 static int cgroup_css_links_read(struct seq_file *seq, void *v) 4778 static int cgroup_css_links_read(struct seq_file *seq, void *v)
4734 { 4779 {
4735 struct cgroup_subsys_state *css = seq_css(seq); 4780 struct cgroup_subsys_state *css = seq_css(seq);
4736 struct cgrp_cset_link *link; 4781 struct cgrp_cset_link *link;
4737 4782
4738 down_read(&css_set_rwsem); 4783 down_read(&css_set_rwsem);
4739 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { 4784 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
4740 struct css_set *cset = link->cset; 4785 struct css_set *cset = link->cset;
4741 struct task_struct *task; 4786 struct task_struct *task;
4742 int count = 0; 4787 int count = 0;
4743 4788
4744 seq_printf(seq, "css_set %p\n", cset); 4789 seq_printf(seq, "css_set %p\n", cset);
4745 4790
4746 list_for_each_entry(task, &cset->tasks, cg_list) { 4791 list_for_each_entry(task, &cset->tasks, cg_list) {
4747 if (count++ > MAX_TASKS_SHOWN_PER_CSS) 4792 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
4748 goto overflow; 4793 goto overflow;
4749 seq_printf(seq, " task %d\n", task_pid_vnr(task)); 4794 seq_printf(seq, " task %d\n", task_pid_vnr(task));
4750 } 4795 }
4751 4796
4752 list_for_each_entry(task, &cset->mg_tasks, cg_list) { 4797 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
4753 if (count++ > MAX_TASKS_SHOWN_PER_CSS) 4798 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
4754 goto overflow; 4799 goto overflow;
4755 seq_printf(seq, " task %d\n", task_pid_vnr(task)); 4800 seq_printf(seq, " task %d\n", task_pid_vnr(task));
4756 } 4801 }
4757 continue; 4802 continue;
4758 overflow: 4803 overflow:
4759 seq_puts(seq, " ...\n"); 4804 seq_puts(seq, " ...\n");
4760 } 4805 }
4761 up_read(&css_set_rwsem); 4806 up_read(&css_set_rwsem);
4762 return 0; 4807 return 0;
4763 } 4808 }
4764 4809
4765 static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) 4810 static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
4766 { 4811 {
4767 return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); 4812 return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
4768 } 4813 }
4769 4814
4770 static struct cftype debug_files[] = { 4815 static struct cftype debug_files[] = {
4771 { 4816 {
4772 .name = "taskcount", 4817 .name = "taskcount",
4773 .read_u64 = debug_taskcount_read, 4818 .read_u64 = debug_taskcount_read,
4774 }, 4819 },
4775 4820
4776 { 4821 {
4777 .name = "current_css_set", 4822 .name = "current_css_set",
4778 .read_u64 = current_css_set_read, 4823 .read_u64 = current_css_set_read,
4779 }, 4824 },
4780 4825
4781 { 4826 {
4782 .name = "current_css_set_refcount", 4827 .name = "current_css_set_refcount",
4783 .read_u64 = current_css_set_refcount_read, 4828 .read_u64 = current_css_set_refcount_read,
4784 }, 4829 },
4785 4830
4786 { 4831 {
4787 .name = "current_css_set_cg_links", 4832 .name = "current_css_set_cg_links",
4788 .seq_show = current_css_set_cg_links_read, 4833 .seq_show = current_css_set_cg_links_read,
4789 }, 4834 },
4790 4835
4791 { 4836 {
4792 .name = "cgroup_css_links", 4837 .name = "cgroup_css_links",
4793 .seq_show = cgroup_css_links_read, 4838 .seq_show = cgroup_css_links_read,
4794 }, 4839 },
4795 4840
4796 { 4841 {
4797 .name = "releasable", 4842 .name = "releasable",
4798 .read_u64 = releasable_read, 4843 .read_u64 = releasable_read,
4799 }, 4844 },
4800 4845
4801 { } /* terminate */ 4846 { } /* terminate */
4802 }; 4847 };
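
When the debug controller is mounted (e.g. mount -t cgroup -o debug none /sys/fs/cgroup/debug, with CONFIG_CGROUP_DEBUG=y; the mount point is illustrative), these entries appear in every cgroup directory as debug.taskcount, debug.current_css_set and so on, since cgroup files are prefixed with the owning subsystem's name.
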
4803 4848
4804 struct cgroup_subsys debug_cgrp_subsys = { 4849 struct cgroup_subsys debug_cgrp_subsys = {
4805 .css_alloc = debug_css_alloc, 4850 .css_alloc = debug_css_alloc,
4806 .css_free = debug_css_free, 4851 .css_free = debug_css_free,