Eric Lee / smarc-ti-linux-kernel

1

/*

1

/*

2

* Generic process-grouping system.

2

* Generic process-grouping system.

3

*

3

*

4

* Based originally on the cpuset system, extracted by Paul Menage

4

* Based originally on the cpuset system, extracted by Paul Menage

5

6

*

6

*

7

* Notifications support

7

* Notifications support

8

9

* Author: Kirill A. Shutemov

9

* Author: Kirill A. Shutemov

10

*

10

*

11

* Copyright notices from the original cpuset code:

11

* Copyright notices from the original cpuset code:

12

* --------------------------------------------------

12

* --------------------------------------------------

13

14

15

*

15

*

16

* Portions derived from Patrick Mochel's sysfs code.

16

* Portions derived from Patrick Mochel's sysfs code.

17

18

*

18

*

19

* 2003-10-10 Written by Simon Derr.

19

* 2003-10-10 Written by Simon Derr.

20

* 2003-10-22 Updates by Stephen Hemminger.

20

* 2003-10-22 Updates by Stephen Hemminger.

21

* 2004 May-July Rework by Paul Jackson.

21

* 2004 May-July Rework by Paul Jackson.

22

* ---------------------------------------------------

22

* ---------------------------------------------------

23

*

23

*

24

* This file is subject to the terms and conditions of the GNU General Public

24

* This file is subject to the terms and conditions of the GNU General Public

25

* License. See the file COPYING in the main directory of the Linux

25

* License. See the file COPYING in the main directory of the Linux

26

* distribution for more details.

26

* distribution for more details.

27

*/

27

*/

28

29

#include <linux/cgroup.h>

29

#include <linux/cgroup.h>

30

#include <linux/cred.h>

30

#include <linux/cred.h>

31

#include <linux/ctype.h>

31

#include <linux/ctype.h>

32

#include <linux/errno.h>

32

#include <linux/errno.h>

33

#include <linux/init_task.h>

33

#include <linux/init_task.h>

34

#include <linux/kernel.h>

34

#include <linux/kernel.h>

35

#include <linux/list.h>

35

#include <linux/list.h>

36

#include <linux/magic.h>

36

#include <linux/mm.h>

37

#include <linux/mm.h>

37

#include <linux/mutex.h>

38

#include <linux/mutex.h>

38

#include <linux/mount.h>

39

#include <linux/mount.h>

39

#include <linux/pagemap.h>

40

#include <linux/pagemap.h>

40

#include <linux/proc_fs.h>

41

#include <linux/proc_fs.h>

41

#include <linux/rcupdate.h>

42

#include <linux/rcupdate.h>

42

#include <linux/sched.h>

43

#include <linux/sched.h>

43

#include <linux/slab.h>

44

#include <linux/slab.h>

44

#include <linux/spinlock.h>

45

#include <linux/spinlock.h>

45

#include <linux/rwsem.h>

46

#include <linux/rwsem.h>

46

#include <linux/string.h>

47

#include <linux/string.h>

47

#include <linux/sort.h>

48

#include <linux/sort.h>

48

#include <linux/kmod.h>

49

#include <linux/kmod.h>

49

#include <linux/delayacct.h>

50

#include <linux/delayacct.h>

50

#include <linux/cgroupstats.h>

51

#include <linux/cgroupstats.h>

51

#include <linux/hashtable.h>

52

#include <linux/hashtable.h>

52

#include <linux/pid_namespace.h>

53

#include <linux/pid_namespace.h>

53

#include <linux/idr.h>

54

#include <linux/idr.h>

54

#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */

55

#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */

55

#include <linux/kthread.h>

56

#include <linux/kthread.h>

56

#include <linux/delay.h>

57

#include <linux/delay.h>

57

58

#include <linux/atomic.h>

59

#include <linux/atomic.h>

59

60

/*
 * How long a pidlist lingers before it is destroyed.  The delay exists to
 * avoid tearing a pidlist down in the middle of consecutive read calls;
 * expiring mid-sequence is a performance problem, not a correctness one.
 * One second should be enough.
 */
#define CGROUP_PIDLIST_DESTROY_DELAY	HZ

/* Upper bound on a cgroup file name, built from the two name limits below. */
#define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
					 MAX_CFTYPE_NAME + 2)

70

71

/*
 * cgroup_tree_mutex is the outer lock: it nests above cgroup_mutex and
 * guards cftypes, file creation/removal, and every hierarchy-changing
 * operation (cgroup creation and removal, css association, controller
 * rebinding).  It exists mainly to break the circular dependency between
 * the kernfs active reference and cgroup_mutex; cgroup_tree_mutex nests
 * above both of them.
 */
static DEFINE_MUTEX(cgroup_tree_mutex);

79

80

/*
 * cgroup_mutex is the master lock.  Any modification to a cgroup or to its
 * hierarchy must be made while holding it.
 *
 * css_set_rwsem protects the task->cgroups pointer, the list of css_set
 * objects, and the chain of tasks hanging off each css_set.
 *
 * Both locks get external linkage and are exported when CONFIG_PROVE_RCU
 * is set, so that the accessors in cgroup.h can reference them in lockdep
 * annotations; otherwise they stay private to this file.
 */
#ifdef CONFIG_PROVE_RCU
DEFINE_MUTEX(cgroup_mutex);
DECLARE_RWSEM(css_set_rwsem);
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_rwsem);
#else
static DEFINE_MUTEX(cgroup_mutex);
static DECLARE_RWSEM(css_set_rwsem);
#endif

99

100

/*
 * Guards cgroup_subsys->release_agent_path.  Writers must hold this
 * spinlock together with cgroup_mutex; readers need either cgroup_mutex
 * or this spinlock.
 */
static DEFINE_SPINLOCK(release_agent_path_lock);

105

106

/*
 * Lockdep assertion: the caller must be inside an RCU read-side critical
 * section or hold cgroup_tree_mutex or cgroup_mutex.
 *
 * NOTE(review): the expansion already ends in ';', so a call site written
 * as "cgroup_assert_mutexes_or_rcu_locked();" yields an extra empty
 * statement.  Left untouched because callers are not visible from here.
 */
#define cgroup_assert_mutexes_or_rcu_locked()				\
	rcu_lockdep_assert(rcu_read_lock_held() ||			\
			   lockdep_is_held(&cgroup_tree_mutex) ||	\
			   lockdep_is_held(&cgroup_mutex),		\
			   "cgroup_[tree_]mutex or RCU read lock required");

111

112

/*
 * cgroup destruction leans heavily on work items, and many destructions
 * can be in flight at once.  They get their own workqueue so they cannot
 * fill up max_active of system_wq, which may lead to deadlock.
 */
static struct workqueue_struct *cgroup_destroy_wq;

/*
 * Pidlist destructions have to be flushed when a cgroup is destroyed, so
 * they run on a dedicated workqueue that serves as the flush domain.
 */
static struct workqueue_struct *cgroup_pidlist_destroy_wq;

125

126

/*
 * Table of cgroup subsystem pointers, indexed by <name>_cgrp_id.  Each
 * SUBSYS(x) entry in <linux/cgroup_subsys.h> expands to one designated
 * initializer.
 */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
static struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

/*
 * Table of cgroup subsystem names, indexed the same way; SUBSYS(x) is
 * stringified into the entry.
 */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS

139

140

/*

141

/*

141

* The default hierarchy, reserved for the subsystems that are otherwise

142

* The default hierarchy, reserved for the subsystems that are otherwise

142

* unattached - it never has more than a single cgroup, and all tasks are

143

* unattached - it never has more than a single cgroup, and all tasks are

143

* part of that cgroup.

144

* part of that cgroup.

144

*/

145

*/

145

struct cgroup_root cgrp_dfl_root;

146

struct cgroup_root cgrp_dfl_root;

146

147

/*

148

/*

148

* The default hierarchy always exists but is hidden until mounted for the

149

* The default hierarchy always exists but is hidden until mounted for the

149

* first time. This is for backward compatibility.

150

* first time. This is for backward compatibility.

150

*/

151

*/

151

static bool cgrp_dfl_root_visible;

152

static bool cgrp_dfl_root_visible;

152

153

/* The list of hierarchy roots */

154

/* The list of hierarchy roots */

154

155

static LIST_HEAD(cgroup_roots);

156

static LIST_HEAD(cgroup_roots);

156

static int cgroup_root_count;

157

static int cgroup_root_count;

157

158

/* hierarchy ID allocation and mapping, protected by cgroup_mutex */

159

/* hierarchy ID allocation and mapping, protected by cgroup_mutex */

159

static DEFINE_IDR(cgroup_hierarchy_idr);

160

static DEFINE_IDR(cgroup_hierarchy_idr);

160

161

/*

162

/*

162

* Assign a monotonically increasing serial number to cgroups. It

163

* Assign a monotonically increasing serial number to cgroups. It

163

* guarantees cgroups with bigger numbers are newer than those with smaller

164

* guarantees cgroups with bigger numbers are newer than those with smaller

164

* numbers. Also, as cgroups are always appended to the parent's

165

* numbers. Also, as cgroups are always appended to the parent's

165

* ->children list, it guarantees that sibling cgroups are always sorted in

166

* ->children list, it guarantees that sibling cgroups are always sorted in

166

* the ascending serial number order on the list. Protected by

167

* the ascending serial number order on the list. Protected by

167

* cgroup_mutex.

168

* cgroup_mutex.

168

*/

169

*/

169

static u64 cgroup_serial_nr_next = 1;

170

static u64 cgroup_serial_nr_next = 1;

170

171

/* This flag indicates whether tasks in the fork and exit paths should

172

/* This flag indicates whether tasks in the fork and exit paths should

172

* check for fork/exit handlers to call. This avoids us having to do

173

* check for fork/exit handlers to call. This avoids us having to do

173

* extra work in the fork/exit path if none of the subsystems need to

174

* extra work in the fork/exit path if none of the subsystems need to

174

* be called.

175

* be called.

175

*/

176

*/

176

static int need_forkexit_callback __read_mostly;

177

static int need_forkexit_callback __read_mostly;

177

178

static struct cftype cgroup_base_files[];

179

static struct cftype cgroup_base_files[];

179

180

static void cgroup_put(struct cgroup *cgrp);

181

static void cgroup_put(struct cgroup *cgrp);

181

static int rebind_subsystems(struct cgroup_root *dst_root,

182

static int rebind_subsystems(struct cgroup_root *dst_root,

182

unsigned long ss_mask);

183

unsigned long ss_mask);

183

static void cgroup_destroy_css_killed(struct cgroup *cgrp);

184

static void cgroup_destroy_css_killed(struct cgroup *cgrp);

184

static int cgroup_destroy_locked(struct cgroup *cgrp);

185

static int cgroup_destroy_locked(struct cgroup *cgrp);

185

static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],

186

static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],

186

bool is_add);

187

bool is_add);

187

static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);

188

static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);

188

189

/**

190

/**

190

* cgroup_css - obtain a cgroup's css for the specified subsystem

191

* cgroup_css - obtain a cgroup's css for the specified subsystem

191

* @cgrp: the cgroup of interest

192

* @cgrp: the cgroup of interest

192

* @ss: the subsystem of interest (%NULL returns the dummy_css)

193

* @ss: the subsystem of interest (%NULL returns the dummy_css)

193

*

194

*

194

* Return @cgrp's css (cgroup_subsys_state) associated with @ss. This

195

* Return @cgrp's css (cgroup_subsys_state) associated with @ss. This

195

* function must be called either under cgroup_mutex or rcu_read_lock() and

196

* function must be called either under cgroup_mutex or rcu_read_lock() and

196

* the caller is responsible for pinning the returned css if it wants to

197

* the caller is responsible for pinning the returned css if it wants to

197

* keep accessing it outside the said locks. This function may return

198

* keep accessing it outside the said locks. This function may return

198

* %NULL if @cgrp doesn't have @subsys_id enabled.

199

* %NULL if @cgrp doesn't have @subsys_id enabled.

199

*/

200

*/

200

static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,

201

static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,

201

struct cgroup_subsys *ss)

202

struct cgroup_subsys *ss)

202

{

203

{

203

if (ss)

204

if (ss)

204

return rcu_dereference_check(cgrp->subsys[ss->id],

205

return rcu_dereference_check(cgrp->subsys[ss->id],

205

lockdep_is_held(&cgroup_tree_mutex) ||

206

lockdep_is_held(&cgroup_tree_mutex) ||

206

lockdep_is_held(&cgroup_mutex));

207

lockdep_is_held(&cgroup_mutex));

207

else

208

else

208

return &cgrp->dummy_css;

209

return &cgrp->dummy_css;

209

}

210

}

210

211

/* convenient tests for these bits */

212

/* convenient tests for these bits */

212

static inline bool cgroup_is_dead(const struct cgroup *cgrp)

213

static inline bool cgroup_is_dead(const struct cgroup *cgrp)

213

{

214

{

214

return test_bit(CGRP_DEAD, &cgrp->flags);

215

return test_bit(CGRP_DEAD, &cgrp->flags);

215

}

216

}

216

217

struct cgroup_subsys_state *seq_css(struct seq_file *seq)

218

struct cgroup_subsys_state *seq_css(struct seq_file *seq)

218

{

219

{

219

struct kernfs_open_file *of = seq->private;

220

struct kernfs_open_file *of = seq->private;

220

struct cgroup *cgrp = of->kn->parent->priv;

221

struct cgroup *cgrp = of->kn->parent->priv;

221

struct cftype *cft = seq_cft(seq);

222

struct cftype *cft = seq_cft(seq);

222

223

/*

224

/*

224

* This is open and unprotected implementation of cgroup_css().

225

* This is open and unprotected implementation of cgroup_css().

225

* seq_css() is only called from a kernfs file operation which has

226

* seq_css() is only called from a kernfs file operation which has

226

* an active reference on the file. Because all the subsystem

227

* an active reference on the file. Because all the subsystem

227

* files are drained before a css is disassociated with a cgroup,

228

* files are drained before a css is disassociated with a cgroup,

228

* the matching css from the cgroup's subsys table is guaranteed to

229

* the matching css from the cgroup's subsys table is guaranteed to

229

* be and stay valid until the enclosing operation is complete.

230

* be and stay valid until the enclosing operation is complete.

230

*/

231

*/

231

if (cft->ss)

232

if (cft->ss)

232

return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);

233

return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);

233

else

234

else

234

return &cgrp->dummy_css;

235

return &cgrp->dummy_css;

235

}

236

}

236

EXPORT_SYMBOL_GPL(seq_css);

237

EXPORT_SYMBOL_GPL(seq_css);

237

238

/**

239

/**

239

* cgroup_is_descendant - test ancestry

240

* cgroup_is_descendant - test ancestry

240

* @cgrp: the cgroup to be tested

241

* @cgrp: the cgroup to be tested

241

* @ancestor: possible ancestor of @cgrp

242

* @ancestor: possible ancestor of @cgrp

242

*

243

*

243

* Test whether @cgrp is a descendant of @ancestor. It also returns %true

244

* Test whether @cgrp is a descendant of @ancestor. It also returns %true

244

* if @cgrp == @ancestor. This function is safe to call as long as @cgrp

245

* if @cgrp == @ancestor. This function is safe to call as long as @cgrp

245

* and @ancestor are accessible.

246

* and @ancestor are accessible.

246

*/

247

*/

247

bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)

248

bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)

248

{

249

{

249

while (cgrp) {

250

while (cgrp) {

250

if (cgrp == ancestor)

251

if (cgrp == ancestor)

251

return true;

252

return true;

252

cgrp = cgrp->parent;

253

cgrp = cgrp->parent;

253

}

254

}

254

return false;

255

return false;

255

}

256

}

256

257

static int cgroup_is_releasable(const struct cgroup *cgrp)

258

static int cgroup_is_releasable(const struct cgroup *cgrp)

258

{

259

{

259

const int bits =

260

const int bits =

260

(1 << CGRP_RELEASABLE) |

261

(1 << CGRP_RELEASABLE) |

261

(1 << CGRP_NOTIFY_ON_RELEASE);

262

(1 << CGRP_NOTIFY_ON_RELEASE);

262

return (cgrp->flags & bits) == bits;

263

return (cgrp->flags & bits) == bits;

263

}

264

}

264

265

static int notify_on_release(const struct cgroup *cgrp)

266

static int notify_on_release(const struct cgroup *cgrp)

266

{

267

{

267

return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

268

return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

268

}

269

}

269

270

/**
 * for_each_css - iterate all css's of a cgroup
 * @css: the iteration cursor
 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 * @cgrp: the target cgroup to iterate css's of
 *
 * Slots of @cgrp->subsys[] that are %NULL are skipped; the loop body only
 * runs for subsystems that have a css.  Each slot is read with an RCU
 * dereference whose lockdep check accepts cgroup_tree_mutex or
 * cgroup_mutex, so this should be called with one of them held.
 */
#define for_each_css(css, ssid, cgrp)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
		if (!((css) = rcu_dereference_check(			\
				(cgrp)->subsys[(ssid)],			\
				lockdep_is_held(&cgroup_tree_mutex) ||	\
				lockdep_is_held(&cgroup_mutex)))) { }	\
		else

285

286

/**
 * for_each_subsys - iterate all enabled cgroup subsystems
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 *
 * @ss is loaded from cgroup_subsys[@ssid] as part of the loop condition;
 * the "|| true" keeps the loop running whatever value was loaded.
 */
#define for_each_subsys(ss, ssid)					\
	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)

294

295

/* Walk every hierarchy root linked on cgroup_roots via ->root_list. */
#define for_each_root(root)						\
	list_for_each_entry((root), &cgroup_roots, root_list)

298

299

/**

300

/**

300

* cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.

301

* cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.

301

* @cgrp: the cgroup to be checked for liveness

302

* @cgrp: the cgroup to be checked for liveness

302

*

303

*

303

* On success, returns true; the mutex should be later unlocked. On

304

* On success, returns true; the mutex should be later unlocked. On

304

* failure returns false with no lock held.

305

* failure returns false with no lock held.

305

*/

306

*/

306

static bool cgroup_lock_live_group(struct cgroup *cgrp)

307

static bool cgroup_lock_live_group(struct cgroup *cgrp)

307

{

308

{

308

mutex_lock(&cgroup_mutex);

309

mutex_lock(&cgroup_mutex);

309

if (cgroup_is_dead(cgrp)) {

310

if (cgroup_is_dead(cgrp)) {

310

mutex_unlock(&cgroup_mutex);

311

mutex_unlock(&cgroup_mutex);

311

return false;

312

return false;

312

}

313

}

313

return true;

314

return true;

314

}

315

}

315

316

/*
 * The list of cgroups eligible for automatic release.  Protected by
 * release_list_lock.
 */
static LIST_HEAD(release_list);
static DEFINE_RAW_SPINLOCK(release_list_lock);

/* Work item that runs cgroup_release_agent(). */
static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);

static void check_for_release(struct cgroup *cgrp);

323

324

/*

325

/*

325

* A cgroup can be associated with multiple css_sets as different tasks may

326

* A cgroup can be associated with multiple css_sets as different tasks may

326

* belong to different cgroups on different hierarchies. In the other

327

* belong to different cgroups on different hierarchies. In the other

327

* direction, a css_set is naturally associated with multiple cgroups.

328

* direction, a css_set is naturally associated with multiple cgroups.

328

* This M:N relationship is represented by the following link structure

329

* This M:N relationship is represented by the following link structure

329

* which exists for each association and allows traversing the associations

330

* which exists for each association and allows traversing the associations

330

* from both sides.

331

* from both sides.

331

*/

332

*/

332

struct cgrp_cset_link {

333

struct cgrp_cset_link {

333

/* the cgroup and css_set this link associates */

334

/* the cgroup and css_set this link associates */

334

struct cgroup *cgrp;

335

struct cgroup *cgrp;

335

struct css_set *cset;

336

struct css_set *cset;

336

337

/* list of cgrp_cset_links anchored at cgrp->cset_links */

338

/* list of cgrp_cset_links anchored at cgrp->cset_links */

338

struct list_head cset_link;

339

struct list_head cset_link;

339

340

/* list of cgrp_cset_links anchored at css_set->cgrp_links */

341

/* list of cgrp_cset_links anchored at css_set->cgrp_links */

341

struct list_head cgrp_link;

342

struct list_head cgrp_link;

342

};

343

};

343

344

/*

345

/*

345

* The default css_set - used by init and its children prior to any

346

* The default css_set - used by init and its children prior to any

346

* hierarchies being mounted. It contains a pointer to the root state

347

* hierarchies being mounted. It contains a pointer to the root state

347

* for each subsystem. Also used to anchor the list of css_sets. Not

348

* for each subsystem. Also used to anchor the list of css_sets. Not

348

* reference-counted, to improve performance when child cgroups

349

* reference-counted, to improve performance when child cgroups

349

* haven't been created.

350

* haven't been created.

350

*/

351

*/

351

struct css_set init_css_set = {

352

struct css_set init_css_set = {

352

.refcount = ATOMIC_INIT(1),

353

.refcount = ATOMIC_INIT(1),

353

.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),

354

.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),

354

.tasks = LIST_HEAD_INIT(init_css_set.tasks),

355

.tasks = LIST_HEAD_INIT(init_css_set.tasks),

355

.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),

356

.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),

356

.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),

357

.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),

357

.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),

358

.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),

358

};

359

};

359

360

static int css_set_count = 1; /* 1 for init_css_set */

361

static int css_set_count = 1; /* 1 for init_css_set */

361

362

/*

363

/*

363

* hash table for cgroup groups. This improves the performance to find

364

* hash table for cgroup groups. This improves the performance to find

364

* an existing css_set. This hash doesn't (currently) take into

365

* an existing css_set. This hash doesn't (currently) take into

365

* account cgroups in empty hierarchies.

366

* account cgroups in empty hierarchies.

366

*/

367

*/

367

#define CSS_SET_HASH_BITS 7

368

#define CSS_SET_HASH_BITS 7

368

static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);

369

static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);

369

370

static unsigned long css_set_hash(struct cgroup_subsys_state *css[])

371

static unsigned long css_set_hash(struct cgroup_subsys_state *css[])

371

{

372

{

372

unsigned long key = 0UL;

373

unsigned long key = 0UL;

373

struct cgroup_subsys *ss;

374

struct cgroup_subsys *ss;

374

int i;

375

int i;

375

376

for_each_subsys(ss, i)

377

for_each_subsys(ss, i)

377

key += (unsigned long)css[i];

378

key += (unsigned long)css[i];

378

key = (key >> 16) ^ key;

379

key = (key >> 16) ^ key;

379

380

return key;

381

return key;

381

}

382

}

382

383

/*
 * put_css_set_locked - drop a reference on a css_set, css_set_rwsem held
 * @cset: css_set to put
 * @taskexit: when true, a cgroup left with no css_set links that has
 *            notify_on_release set is also marked CGRP_RELEASABLE
 *
 * If this was the last reference, @cset is removed from the hash table,
 * every cgrp_cset_link on it is unlinked from both sides and freed, and
 * @cset itself is freed after an RCU grace period.
 */
static void put_css_set_locked(struct css_set *cset, bool taskexit)
{
	struct cgrp_cset_link *cur, *next;

	lockdep_assert_held(&css_set_rwsem);

	/* Other references remain; nothing to tear down. */
	if (!atomic_dec_and_test(&cset->refcount))
		return;

	/* This css_set is dead: unlink it and release cgroup refcounts. */
	hash_del(&cset->hlist);
	css_set_count--;

	list_for_each_entry_safe(cur, next, &cset->cgrp_links, cgrp_link) {
		struct cgroup *cgrp = cur->cgrp;
		bool now_unlinked;

		list_del(&cur->cset_link);
		list_del(&cur->cgrp_link);

		/* @cgrp can't go away while we're holding css_set_rwsem */
		now_unlinked = list_empty(&cgrp->cset_links);
		if (now_unlinked && notify_on_release(cgrp)) {
			if (taskexit)
				set_bit(CGRP_RELEASABLE, &cgrp->flags);
			check_for_release(cgrp);
		}

		kfree(cur);
	}

	kfree_rcu(cset, rcu_head);
}

414

415

/*
 * put_css_set - drop a reference on a css_set
 * @cset: css_set to put
 * @taskexit: passed through to put_css_set_locked()
 *
 * Takes css_set_rwsem for writing only when the reference being dropped
 * might be the last one.
 */
static void put_css_set(struct css_set *cset, bool taskexit)
{
	/*
	 * Ensure that the refcount doesn't hit zero while any readers can
	 * see it.  Similar to atomic_dec_and_lock(), but for an rwsem: the
	 * count is decremented locklessly unless it is currently 1, and
	 * only that final put goes through the write-locked path.
	 */
	if (!atomic_add_unless(&cset->refcount, -1, 1)) {
		down_write(&css_set_rwsem);
		put_css_set_locked(cset, taskexit);
		up_write(&css_set_rwsem);
	}
}

429

430

/*

431

/*

431

* refcounted get/put for css_set objects

432

* refcounted get/put for css_set objects

432

*/

433

*/

433

static inline void get_css_set(struct css_set *cset)

434

static inline void get_css_set(struct css_set *cset)

434

{

435

{

435

atomic_inc(&cset->refcount);

436

atomic_inc(&cset->refcount);

436

}

437

}

437

438

/**

439

/**

439

* compare_css_sets - helper function for find_existing_css_set().

440

* compare_css_sets - helper function for find_existing_css_set().

440

* @cset: candidate css_set being tested

441

* @cset: candidate css_set being tested

441

* @old_cset: existing css_set for a task

442

* @old_cset: existing css_set for a task

442

* @new_cgrp: cgroup that's being entered by the task

443

* @new_cgrp: cgroup that's being entered by the task

443

* @template: desired set of css pointers in css_set (pre-calculated)

444

* @template: desired set of css pointers in css_set (pre-calculated)

444

*

445

*

445

* Returns true if "cset" matches "old_cset" except for the hierarchy

446

* Returns true if "cset" matches "old_cset" except for the hierarchy

446

* which "new_cgrp" belongs to, for which it should match "new_cgrp".

447

* which "new_cgrp" belongs to, for which it should match "new_cgrp".

447

*/

448

*/

448

static bool compare_css_sets(struct css_set *cset,

449

static bool compare_css_sets(struct css_set *cset,

449

struct css_set *old_cset,

450

struct css_set *old_cset,

450

struct cgroup *new_cgrp,

451

struct cgroup *new_cgrp,

451

struct cgroup_subsys_state *template[])

452

struct cgroup_subsys_state *template[])

452

{

453

{

453

struct list_head *l1, *l2;

454

struct list_head *l1, *l2;

454

455

if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {

456

if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {

456

/* Not all subsystems matched */

457

/* Not all subsystems matched */

457

return false;

458

return false;

458

}

459

}

459

460

/*

461

/*

461

* Compare cgroup pointers in order to distinguish between

462

* Compare cgroup pointers in order to distinguish between

462

* different cgroups in heirarchies with no subsystems. We

463

* different cgroups in heirarchies with no subsystems. We

463

* could get by with just this check alone (and skip the

464

* could get by with just this check alone (and skip the

464

* memcmp above) but on most setups the memcmp check will

465

* memcmp above) but on most setups the memcmp check will

465

* avoid the need for this more expensive check on almost all

466

* avoid the need for this more expensive check on almost all

466

* candidates.

467

* candidates.

467

*/

468

*/

468

469

l1 = &cset->cgrp_links;

470

l1 = &cset->cgrp_links;

470

l2 = &old_cset->cgrp_links;

471

l2 = &old_cset->cgrp_links;

471

while (1) {

472

while (1) {

472

struct cgrp_cset_link *link1, *link2;

473

struct cgrp_cset_link *link1, *link2;

473

struct cgroup *cgrp1, *cgrp2;

474

struct cgroup *cgrp1, *cgrp2;

474

475

l1 = l1->next;

476

l1 = l1->next;

476

l2 = l2->next;

477

l2 = l2->next;

477

/* See if we reached the end - both lists are equal length. */

478

/* See if we reached the end - both lists are equal length. */

478

if (l1 == &cset->cgrp_links) {

479

if (l1 == &cset->cgrp_links) {

479

BUG_ON(l2 != &old_cset->cgrp_links);

480

BUG_ON(l2 != &old_cset->cgrp_links);

480

break;

481

break;

481

} else {

482

} else {

482

BUG_ON(l2 == &old_cset->cgrp_links);

483

BUG_ON(l2 == &old_cset->cgrp_links);

483

}

484

}

484

/* Locate the cgroups associated with these links. */

485

/* Locate the cgroups associated with these links. */

485

link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);

486

link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);

486

link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);

487

link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);

487

cgrp1 = link1->cgrp;

488

cgrp1 = link1->cgrp;

488

cgrp2 = link2->cgrp;

489

cgrp2 = link2->cgrp;

489

/* Hierarchies should be linked in the same order. */

490

/* Hierarchies should be linked in the same order. */

490

BUG_ON(cgrp1->root != cgrp2->root);

491

BUG_ON(cgrp1->root != cgrp2->root);

491

492

/*

493

/*

493

* If this hierarchy is the hierarchy of the cgroup

494

* If this hierarchy is the hierarchy of the cgroup

494

* that's changing, then we need to check that this

495

* that's changing, then we need to check that this

495

* css_set points to the new cgroup; if it's any other

496

* css_set points to the new cgroup; if it's any other

496

* hierarchy, then this css_set should point to the

497

* hierarchy, then this css_set should point to the

497

* same cgroup as the old css_set.

498

* same cgroup as the old css_set.

498

*/

499

*/

499

if (cgrp1->root == new_cgrp->root) {

500

if (cgrp1->root == new_cgrp->root) {

500

if (cgrp1 != new_cgrp)

501

if (cgrp1 != new_cgrp)

501

return false;

502

return false;

502

} else {

503

} else {

503

if (cgrp1 != cgrp2)

504

if (cgrp1 != cgrp2)

504

return false;

505

return false;

505

}

506

}

506

}

507

}

507

return true;

508

return true;

508

}

509

}

509

510

/**

511

/**

511

* find_existing_css_set - init css array and find the matching css_set

512

* find_existing_css_set - init css array and find the matching css_set

512

* @old_cset: the css_set that we're using before the cgroup transition

513

* @old_cset: the css_set that we're using before the cgroup transition

513

* @cgrp: the cgroup that we're moving into

514

* @cgrp: the cgroup that we're moving into

514

* @template: out param for the new set of csses, should be clear on entry

515

* @template: out param for the new set of csses, should be clear on entry

515

*/

516

*/

516

static struct css_set *find_existing_css_set(struct css_set *old_cset,

517

static struct css_set *find_existing_css_set(struct css_set *old_cset,

517

struct cgroup *cgrp,

518

struct cgroup *cgrp,

518

struct cgroup_subsys_state *template[])

519

struct cgroup_subsys_state *template[])

519

{

520

{

520

struct cgroup_root *root = cgrp->root;

521

struct cgroup_root *root = cgrp->root;

521

struct cgroup_subsys *ss;

522

struct cgroup_subsys *ss;

522

struct css_set *cset;

523

struct css_set *cset;

523

unsigned long key;

524

unsigned long key;

524

int i;

525

int i;

525

526

/*

527

/*

527

* Build the set of subsystem state objects that we want to see in the

528

* Build the set of subsystem state objects that we want to see in the

528

* new css_set. while subsystems can change globally, the entries here

529

* new css_set. while subsystems can change globally, the entries here

529

* won't change, so no need for locking.

530

* won't change, so no need for locking.

530

*/

531

*/

531

for_each_subsys(ss, i) {

532

for_each_subsys(ss, i) {

532

if (root->cgrp.subsys_mask & (1UL << i)) {

533

if (root->cgrp.subsys_mask & (1UL << i)) {

533

/* Subsystem is in this hierarchy. So we want

534

/* Subsystem is in this hierarchy. So we want

534

* the subsystem state from the new

535

* the subsystem state from the new

535

* cgroup */

536

* cgroup */

536

template[i] = cgroup_css(cgrp, ss);

537

template[i] = cgroup_css(cgrp, ss);

537

} else {

538

} else {

538

/* Subsystem is not in this hierarchy, so we

539

/* Subsystem is not in this hierarchy, so we

539

* don't want to change the subsystem state */

540

* don't want to change the subsystem state */

540

template[i] = old_cset->subsys[i];

541

template[i] = old_cset->subsys[i];

541

}

542

}

542

}

543

}

543

544

key = css_set_hash(template);

545

key = css_set_hash(template);

545

hash_for_each_possible(css_set_table, cset, hlist, key) {

546

hash_for_each_possible(css_set_table, cset, hlist, key) {

546

if (!compare_css_sets(cset, old_cset, cgrp, template))

547

if (!compare_css_sets(cset, old_cset, cgrp, template))

547

continue;

548

continue;

548

549

/* This css_set matches what we need */

550

/* This css_set matches what we need */

550

return cset;

551

return cset;

551

}

552

}

552

553

/* No existing cgroup group matched */

554

/* No existing cgroup group matched */

554

return NULL;

555

return NULL;

555

}

556

}

556

557

static void free_cgrp_cset_links(struct list_head *links_to_free)

558

static void free_cgrp_cset_links(struct list_head *links_to_free)

558

{

559

{

559

struct cgrp_cset_link *link, *tmp_link;

560

struct cgrp_cset_link *link, *tmp_link;

560

561

list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {

562

list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {

562

list_del(&link->cset_link);

563

list_del(&link->cset_link);

563

kfree(link);

564

kfree(link);

564

}

565

}

565

}

566

}

566

567

/**

568

/**

568

* allocate_cgrp_cset_links - allocate cgrp_cset_links

569

* allocate_cgrp_cset_links - allocate cgrp_cset_links

569

* @count: the number of links to allocate

570

* @count: the number of links to allocate

570

* @tmp_links: list_head the allocated links are put on

571

* @tmp_links: list_head the allocated links are put on

571

*

572

*

572

* Allocate @count cgrp_cset_link structures and chain them on @tmp_links

573

* Allocate @count cgrp_cset_link structures and chain them on @tmp_links

573

* through ->cset_link. Returns 0 on success or -errno.

574

* through ->cset_link. Returns 0 on success or -errno.

574

*/

575

*/

575

static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)

576

static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)

576

{

577

{

577

struct cgrp_cset_link *link;

578

struct cgrp_cset_link *link;

578

int i;

579

int i;

579

580

INIT_LIST_HEAD(tmp_links);

581

INIT_LIST_HEAD(tmp_links);

581

582

for (i = 0; i < count; i++) {

583

for (i = 0; i < count; i++) {

583

link = kzalloc(sizeof(*link), GFP_KERNEL);

584

link = kzalloc(sizeof(*link), GFP_KERNEL);

584

if (!link) {

585

if (!link) {

585

free_cgrp_cset_links(tmp_links);

586

free_cgrp_cset_links(tmp_links);

586

return -ENOMEM;

587

return -ENOMEM;

587

}

588

}

588

list_add(&link->cset_link, tmp_links);

589

list_add(&link->cset_link, tmp_links);

589

}

590

}

590

return 0;

591

return 0;

591

}

592

}

592

593

/**

594

/**

594

* link_css_set - a helper function to link a css_set to a cgroup

595

* link_css_set - a helper function to link a css_set to a cgroup

595

* @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()

596

* @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()

596

* @cset: the css_set to be linked

597

* @cset: the css_set to be linked

597

* @cgrp: the destination cgroup

598

* @cgrp: the destination cgroup

598

*/

599

*/

599

static void link_css_set(struct list_head *tmp_links, struct css_set *cset,

600

static void link_css_set(struct list_head *tmp_links, struct css_set *cset,

600

struct cgroup *cgrp)

601

struct cgroup *cgrp)

601

{

602

{

602

struct cgrp_cset_link *link;

603

struct cgrp_cset_link *link;

603

604

BUG_ON(list_empty(tmp_links));

605

BUG_ON(list_empty(tmp_links));

605

link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);

606

link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);

606

link->cset = cset;

607

link->cset = cset;

607

link->cgrp = cgrp;

608

link->cgrp = cgrp;

608

list_move(&link->cset_link, &cgrp->cset_links);

609

list_move(&link->cset_link, &cgrp->cset_links);

609

/*

610

/*

610

* Always add links to the tail of the list so that the list

611

* Always add links to the tail of the list so that the list

611

* is sorted by order of hierarchy creation

612

* is sorted by order of hierarchy creation

612

*/

613

*/

613

list_add_tail(&link->cgrp_link, &cset->cgrp_links);

614

list_add_tail(&link->cgrp_link, &cset->cgrp_links);

614

}

615

}

615

616

/**

617

/**

617

* find_css_set - return a new css_set with one cgroup updated

618

* find_css_set - return a new css_set with one cgroup updated

618

* @old_cset: the baseline css_set

619

* @old_cset: the baseline css_set

619

* @cgrp: the cgroup to be updated

620

* @cgrp: the cgroup to be updated

620

*

621

*

621

* Return a new css_set that's equivalent to @old_cset, but with @cgrp

622

* Return a new css_set that's equivalent to @old_cset, but with @cgrp

622

* substituted into the appropriate hierarchy.

623

* substituted into the appropriate hierarchy.

623

*/

624

*/

624

static struct css_set *find_css_set(struct css_set *old_cset,

625

static struct css_set *find_css_set(struct css_set *old_cset,

625

struct cgroup *cgrp)

626

struct cgroup *cgrp)

626

{

627

{

627

struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };

628

struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };

628

struct css_set *cset;

629

struct css_set *cset;

629

struct list_head tmp_links;

630

struct list_head tmp_links;

630

struct cgrp_cset_link *link;

631

struct cgrp_cset_link *link;

631

unsigned long key;

632

unsigned long key;

632

633

lockdep_assert_held(&cgroup_mutex);

634

lockdep_assert_held(&cgroup_mutex);

634

635

/* First see if we already have a cgroup group that matches

636

/* First see if we already have a cgroup group that matches

636

* the desired set */

637

* the desired set */

637

down_read(&css_set_rwsem);

638

down_read(&css_set_rwsem);

638

cset = find_existing_css_set(old_cset, cgrp, template);

639

cset = find_existing_css_set(old_cset, cgrp, template);

639

if (cset)

640

if (cset)

640

get_css_set(cset);

641

get_css_set(cset);

641

up_read(&css_set_rwsem);

642

up_read(&css_set_rwsem);

642

643

if (cset)

644

if (cset)

644

return cset;

645

return cset;

645

646

cset = kzalloc(sizeof(*cset), GFP_KERNEL);

647

cset = kzalloc(sizeof(*cset), GFP_KERNEL);

647

if (!cset)

648

if (!cset)

648

return NULL;

649

return NULL;

649

650

/* Allocate all the cgrp_cset_link objects that we'll need */

651

/* Allocate all the cgrp_cset_link objects that we'll need */

651

if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {

652

if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {

652

kfree(cset);

653

kfree(cset);

653

return NULL;

654

return NULL;

654

}

655

}

655

656

atomic_set(&cset->refcount, 1);

657

atomic_set(&cset->refcount, 1);

657

INIT_LIST_HEAD(&cset->cgrp_links);

658

INIT_LIST_HEAD(&cset->cgrp_links);

658

INIT_LIST_HEAD(&cset->tasks);

659

INIT_LIST_HEAD(&cset->tasks);

659

INIT_LIST_HEAD(&cset->mg_tasks);

660

INIT_LIST_HEAD(&cset->mg_tasks);

660

INIT_LIST_HEAD(&cset->mg_preload_node);

661

INIT_LIST_HEAD(&cset->mg_preload_node);

661

INIT_LIST_HEAD(&cset->mg_node);

662

INIT_LIST_HEAD(&cset->mg_node);

662

INIT_HLIST_NODE(&cset->hlist);

663

INIT_HLIST_NODE(&cset->hlist);

663

664

/* Copy the set of subsystem state objects generated in

665

/* Copy the set of subsystem state objects generated in

665

* find_existing_css_set() */

666

* find_existing_css_set() */

666

memcpy(cset->subsys, template, sizeof(cset->subsys));

667

memcpy(cset->subsys, template, sizeof(cset->subsys));

667

668

down_write(&css_set_rwsem);

669

down_write(&css_set_rwsem);

669

/* Add reference counts and links from the new css_set. */

670

/* Add reference counts and links from the new css_set. */

670

list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {

671

list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {

671

struct cgroup *c = link->cgrp;

672

struct cgroup *c = link->cgrp;

672

673

if (c->root == cgrp->root)

674

if (c->root == cgrp->root)

674

c = cgrp;

675

c = cgrp;

675

link_css_set(&tmp_links, cset, c);

676

link_css_set(&tmp_links, cset, c);

676

}

677

}

677

678

BUG_ON(!list_empty(&tmp_links));

679

BUG_ON(!list_empty(&tmp_links));

679

680

css_set_count++;

681

css_set_count++;

681

682

/* Add this cgroup group to the hash table */

683

/* Add this cgroup group to the hash table */

683

key = css_set_hash(cset->subsys);

684

key = css_set_hash(cset->subsys);

684

hash_add(css_set_table, &cset->hlist, key);

685

hash_add(css_set_table, &cset->hlist, key);

685

686

up_write(&css_set_rwsem);

687

up_write(&css_set_rwsem);

687

688

return cset;

689

return cset;

689

}

690

}

690

691

static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)

692

static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)

692

{

693

{

693

struct cgroup *root_cgrp = kf_root->kn->priv;

694

struct cgroup *root_cgrp = kf_root->kn->priv;

694

695

return root_cgrp->root;

696

return root_cgrp->root;

696

}

697

}

697

698

static int cgroup_init_root_id(struct cgroup_root *root)

699

static int cgroup_init_root_id(struct cgroup_root *root)

699

{

700

{

700

int id;

701

int id;

701

702

lockdep_assert_held(&cgroup_mutex);

703

lockdep_assert_held(&cgroup_mutex);

703

704

id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);

705

id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);

705

if (id < 0)

706

if (id < 0)

706

return id;

707

return id;

707

708

root->hierarchy_id = id;

709

root->hierarchy_id = id;

709

return 0;

710

return 0;

710

}

711

}

711

712

static void cgroup_exit_root_id(struct cgroup_root *root)

713

static void cgroup_exit_root_id(struct cgroup_root *root)

713

{

714

{

714

lockdep_assert_held(&cgroup_mutex);

715

lockdep_assert_held(&cgroup_mutex);

715

716

if (root->hierarchy_id) {

717

if (root->hierarchy_id) {

717

idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);

718

idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);

718

root->hierarchy_id = 0;

719

root->hierarchy_id = 0;

719

}

720

}

720

}

721

}

721

722

static void cgroup_free_root(struct cgroup_root *root)

723

static void cgroup_free_root(struct cgroup_root *root)

723

{

724

{

724

if (root) {

725

if (root) {

725

/* hierarhcy ID shoulid already have been released */

726

/* hierarhcy ID shoulid already have been released */

726

WARN_ON_ONCE(root->hierarchy_id);

727

WARN_ON_ONCE(root->hierarchy_id);

727

728

idr_destroy(&root->cgroup_idr);

729

idr_destroy(&root->cgroup_idr);

729

kfree(root);

730

kfree(root);

730

}

731

}

731

}

732

}

732

733

static void cgroup_destroy_root(struct cgroup_root *root)

734

static void cgroup_destroy_root(struct cgroup_root *root)

734

{

735

{

735

struct cgroup *cgrp = &root->cgrp;

736

struct cgroup *cgrp = &root->cgrp;

736

struct cgrp_cset_link *link, *tmp_link;

737

struct cgrp_cset_link *link, *tmp_link;

737

738

mutex_lock(&cgroup_tree_mutex);

739

mutex_lock(&cgroup_tree_mutex);

739

mutex_lock(&cgroup_mutex);

740

mutex_lock(&cgroup_mutex);

740

741

BUG_ON(atomic_read(&root->nr_cgrps));

742

BUG_ON(atomic_read(&root->nr_cgrps));

742

BUG_ON(!list_empty(&cgrp->children));

743

BUG_ON(!list_empty(&cgrp->children));

743

744

/* Rebind all subsystems back to the default hierarchy */

745

/* Rebind all subsystems back to the default hierarchy */

745

rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask);

746

rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask);

746

747

/*

748

/*

748

* Release all the links from cset_links to this hierarchy's

749

* Release all the links from cset_links to this hierarchy's

749

* root cgroup

750

* root cgroup

750

*/

751

*/

751

down_write(&css_set_rwsem);

752

down_write(&css_set_rwsem);

752

753

list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {

754

list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {

754

list_del(&link->cset_link);

755

list_del(&link->cset_link);

755

list_del(&link->cgrp_link);

756

list_del(&link->cgrp_link);

756

kfree(link);

757

kfree(link);

757

}

758

}

758

up_write(&css_set_rwsem);

759

up_write(&css_set_rwsem);

759

760

if (!list_empty(&root->root_list)) {

761

if (!list_empty(&root->root_list)) {

761

list_del(&root->root_list);

762

list_del(&root->root_list);

762

cgroup_root_count--;

763

cgroup_root_count--;

763

}

764

}

764

765

cgroup_exit_root_id(root);

766

cgroup_exit_root_id(root);

766

767

mutex_unlock(&cgroup_mutex);

768

mutex_unlock(&cgroup_mutex);

768

mutex_unlock(&cgroup_tree_mutex);

769

mutex_unlock(&cgroup_tree_mutex);

769

770

kernfs_destroy_root(root->kf_root);

771

kernfs_destroy_root(root->kf_root);

771

cgroup_free_root(root);

772

cgroup_free_root(root);

772

}

773

}

773

774

/* look up cgroup associated with given css_set on the specified hierarchy */

775

/* look up cgroup associated with given css_set on the specified hierarchy */

775

static struct cgroup *cset_cgroup_from_root(struct css_set *cset,

776

static struct cgroup *cset_cgroup_from_root(struct css_set *cset,

776

struct cgroup_root *root)

777

struct cgroup_root *root)

777

{

778

{

778

struct cgroup *res = NULL;

779

struct cgroup *res = NULL;

779

780

lockdep_assert_held(&cgroup_mutex);

781

lockdep_assert_held(&cgroup_mutex);

781

lockdep_assert_held(&css_set_rwsem);

782

lockdep_assert_held(&css_set_rwsem);

782

783

if (cset == &init_css_set) {

784

if (cset == &init_css_set) {

784

res = &root->cgrp;

785

res = &root->cgrp;

785

} else {

786

} else {

786

struct cgrp_cset_link *link;

787

struct cgrp_cset_link *link;

787

788

list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {

789

list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {

789

struct cgroup *c = link->cgrp;

790

struct cgroup *c = link->cgrp;

790

791

if (c->root == root) {

792

if (c->root == root) {

792

res = c;

793

res = c;

793

break;

794

break;

794

}

795

}

795

}

796

}

796

}

797

}

797

798

BUG_ON(!res);

799

BUG_ON(!res);

799

return res;

800

return res;

800

}

801

}

801

802

/*
 * Return the cgroup for "task" from the given hierarchy.  Must be
 * called with cgroup_mutex and css_set_rwsem held.
 */
static struct cgroup *task_cgroup_from_root(struct task_struct *task,
					    struct cgroup_root *root)
{
	struct css_set *cset;

	/*
	 * The task itself is not locked: with cgroup_mutex held it cannot
	 * change groups, so the only thing that can happen is that it
	 * exits and its css is set back to init_css_set.
	 */
	cset = task_css_set(task);

	return cset_cgroup_from_root(cset, root);
}

816

817

/*

818

/*

818

* A task must hold cgroup_mutex to modify cgroups.

819

* A task must hold cgroup_mutex to modify cgroups.

819

*

820

*

820

* Any task can increment and decrement the count field without lock.

821

* Any task can increment and decrement the count field without lock.

821

* So in general, code holding cgroup_mutex can't rely on the count

822

* So in general, code holding cgroup_mutex can't rely on the count

822

* field not changing. However, if the count goes to zero, then only

823

* field not changing. However, if the count goes to zero, then only

823

* cgroup_attach_task() can increment it again. Because a count of zero

824

* cgroup_attach_task() can increment it again. Because a count of zero

824

* means that no tasks are currently attached, therefore there is no

825

* means that no tasks are currently attached, therefore there is no

825

* way a task attached to that cgroup can fork (the other way to

826

* way a task attached to that cgroup can fork (the other way to

826

* increment the count). So code holding cgroup_mutex can safely

827

* increment the count). So code holding cgroup_mutex can safely

827

* assume that if the count is zero, it will stay zero. Similarly, if

828

* assume that if the count is zero, it will stay zero. Similarly, if

828

* a task holds cgroup_mutex on a cgroup with zero count, it

829

* a task holds cgroup_mutex on a cgroup with zero count, it

829

* knows that the cgroup won't be removed, as cgroup_rmdir()

830

* knows that the cgroup won't be removed, as cgroup_rmdir()

830

* needs that mutex.

831

* needs that mutex.

831

*

832

*

832

* The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't

833

* The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't

833

* (usually) take cgroup_mutex. These are the two most performance

834

* (usually) take cgroup_mutex. These are the two most performance

834

* critical pieces of code here. The exception occurs on cgroup_exit(),

835

* critical pieces of code here. The exception occurs on cgroup_exit(),

835

* when a task in a notify_on_release cgroup exits. Then cgroup_mutex

836

* when a task in a notify_on_release cgroup exits. Then cgroup_mutex

836

* is taken, and if the cgroup count is zero, a usermode call made

837

* is taken, and if the cgroup count is zero, a usermode call made

837

* to the release agent with the name of the cgroup (path relative to

838

* to the release agent with the name of the cgroup (path relative to

838

* the root of cgroup file system) as the argument.

839

* the root of cgroup file system) as the argument.

839

*

840

*

840

* A cgroup can only be deleted if both its 'count' of using tasks

841

* A cgroup can only be deleted if both its 'count' of using tasks

841

* is zero, and its list of 'children' cgroups is empty. Since all

842

* is zero, and its list of 'children' cgroups is empty. Since all

842

* tasks in the system use _some_ cgroup, and since there is always at

843

* tasks in the system use _some_ cgroup, and since there is always at

843

* least one task in the system (init, pid == 1), therefore, root cgroup

844

* least one task in the system (init, pid == 1), therefore, root cgroup

844

* always has either children cgroups and/or using tasks. So we don't

845

* always has either children cgroups and/or using tasks. So we don't

845

* need a special hack to ensure that root cgroup cannot be deleted.

846

* need a special hack to ensure that root cgroup cannot be deleted.

846

*

847

*

847

* P.S. One more locking exception. RCU is used to guard the

848

* P.S. One more locking exception. RCU is used to guard the

848

* update of a tasks cgroup pointer by cgroup_attach_task()

849

* update of a tasks cgroup pointer by cgroup_attach_task()

849

*/

850

*/

850

851

static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);

852

static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);

852

static struct kernfs_syscall_ops cgroup_kf_syscall_ops;

853

static struct kernfs_syscall_ops cgroup_kf_syscall_ops;

853

static const struct file_operations proc_cgroupstats_operations;

854

static const struct file_operations proc_cgroupstats_operations;

854

855

static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,

856

static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,

856

char *buf)

857

char *buf)

857

{

858

{

858

if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&

859

if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&

859

!(cgrp->root->flags & CGRP_ROOT_NOPREFIX))

860

!(cgrp->root->flags & CGRP_ROOT_NOPREFIX))

860

snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",

861

snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",

861

cft->ss->name, cft->name);

862

cft->ss->name, cft->name);

862

else

863

else

863

strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);

864

strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);

864

return buf;

865

return buf;

865

}

866

}

866

867

/**

868

/**

868

* cgroup_file_mode - deduce file mode of a control file

869

* cgroup_file_mode - deduce file mode of a control file

869

* @cft: the control file in question

870

* @cft: the control file in question

870

*

871

*

871

* returns cft->mode if ->mode is not 0

872

* returns cft->mode if ->mode is not 0

872

* returns S_IRUGO|S_IWUSR if it has both a read and a write handler

873

* returns S_IRUGO|S_IWUSR if it has both a read and a write handler

873

* returns S_IRUGO if it has only a read handler

874

* returns S_IRUGO if it has only a read handler

874

* returns S_IWUSR if it has only a write hander

875

* returns S_IWUSR if it has only a write hander

875

*/

876

*/

876

static umode_t cgroup_file_mode(const struct cftype *cft)

877

static umode_t cgroup_file_mode(const struct cftype *cft)

877

{

878

{

878

umode_t mode = 0;

879

umode_t mode = 0;

879

880

if (cft->mode)

881

if (cft->mode)

881

return cft->mode;

882

return cft->mode;

882

883

if (cft->read_u64 || cft->read_s64 || cft->seq_show)

884

if (cft->read_u64 || cft->read_s64 || cft->seq_show)

884

mode |= S_IRUGO;

885

mode |= S_IRUGO;

885

886

if (cft->write_u64 || cft->write_s64 || cft->write_string ||

887

if (cft->write_u64 || cft->write_s64 || cft->write_string ||

887

cft->trigger)

888

cft->trigger)

888

mode |= S_IWUSR;

889

mode |= S_IWUSR;

889

890

return mode;

891

return mode;

891

}

892

}

892

893

static void cgroup_free_fn(struct work_struct *work)

894

static void cgroup_free_fn(struct work_struct *work)

894

{

895

{

895

struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);

896

struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);

896

897

atomic_dec(&cgrp->root->nr_cgrps);

898

atomic_dec(&cgrp->root->nr_cgrps);

898

cgroup_pidlist_destroy_all(cgrp);

899

cgroup_pidlist_destroy_all(cgrp);

899

900

if (cgrp->parent) {

901

if (cgrp->parent) {

901

/*

902

/*

902

* We get a ref to the parent, and put the ref when this

903

* We get a ref to the parent, and put the ref when this

903

* cgroup is being freed, so it's guaranteed that the

904

* cgroup is being freed, so it's guaranteed that the

904

* parent won't be destroyed before its children.

905

* parent won't be destroyed before its children.

905

*/

906

*/

906

cgroup_put(cgrp->parent);

907

cgroup_put(cgrp->parent);

907

kernfs_put(cgrp->kn);

908

kernfs_put(cgrp->kn);

908

kfree(cgrp);

909

kfree(cgrp);

909

} else {

910

} else {

910

/*

911

/*

911

* This is root cgroup's refcnt reaching zero, which

912

* This is root cgroup's refcnt reaching zero, which

912

* indicates that the root should be released.

913

* indicates that the root should be released.

913

*/

914

*/

914

cgroup_destroy_root(cgrp->root);

915

cgroup_destroy_root(cgrp->root);

915

}

916

}

916

}

917

}

917

918

static void cgroup_free_rcu(struct rcu_head *head)

919

static void cgroup_free_rcu(struct rcu_head *head)

919

{

920

{

920

struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);

921

struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);

921

922

INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);

923

INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);

923

queue_work(cgroup_destroy_wq, &cgrp->destroy_work);

924

queue_work(cgroup_destroy_wq, &cgrp->destroy_work);

924

}

925

}

925

926

static void cgroup_get(struct cgroup *cgrp)

927

static void cgroup_get(struct cgroup *cgrp)

927

{

928

{

928

WARN_ON_ONCE(cgroup_is_dead(cgrp));

929

WARN_ON_ONCE(cgroup_is_dead(cgrp));

929

WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);

930

WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);

930

atomic_inc(&cgrp->refcnt);

931

atomic_inc(&cgrp->refcnt);

931

}

932

}

932

933

static void cgroup_put(struct cgroup *cgrp)

934

static void cgroup_put(struct cgroup *cgrp)

934

{

935

{

935

if (!atomic_dec_and_test(&cgrp->refcnt))

936

if (!atomic_dec_and_test(&cgrp->refcnt))

936

return;

937

return;

937

if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))

938

if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))

938

return;

939

return;

939

940

/*

941

/*

941

* XXX: cgrp->id is only used to look up css's. As cgroup and

942

* XXX: cgrp->id is only used to look up css's. As cgroup and

942

* css's lifetimes will be decoupled, it should be made

943

* css's lifetimes will be decoupled, it should be made

943

* per-subsystem and moved to css->id so that lookups are

944

* per-subsystem and moved to css->id so that lookups are

944

* successful until the target css is released.

945

* successful until the target css is released.

945

*/

946

*/

946

mutex_lock(&cgroup_mutex);

947

mutex_lock(&cgroup_mutex);

947

idr_remove(&cgrp->root->cgroup_idr, cgrp->id);

948

idr_remove(&cgrp->root->cgroup_idr, cgrp->id);

948

mutex_unlock(&cgroup_mutex);

949

mutex_unlock(&cgroup_mutex);

949

cgrp->id = -1;

950

cgrp->id = -1;

950

951

call_rcu(&cgrp->rcu_head, cgroup_free_rcu);

952

call_rcu(&cgrp->rcu_head, cgroup_free_rcu);

952

}

953

}

953

954

static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)

955

static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)

955

{

956

{

956

char name[CGROUP_FILE_NAME_MAX];

957

char name[CGROUP_FILE_NAME_MAX];

957

958

lockdep_assert_held(&cgroup_tree_mutex);

959

lockdep_assert_held(&cgroup_tree_mutex);

959

kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));

960

kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));

960

}

961

}

961

962

/**

963

/**

963

* cgroup_clear_dir - remove subsys files in a cgroup directory

964

* cgroup_clear_dir - remove subsys files in a cgroup directory

964

* @cgrp: target cgroup

965

* @cgrp: target cgroup

965

* @subsys_mask: mask of the subsystem ids whose files should be removed

966

* @subsys_mask: mask of the subsystem ids whose files should be removed

966

*/

967

*/

967

static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)

968

static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)

968

{

969

{

969

struct cgroup_subsys *ss;

970

struct cgroup_subsys *ss;

970

int i;

971

int i;

971

972

for_each_subsys(ss, i) {

973

for_each_subsys(ss, i) {

973

struct cftype *cfts;

974

struct cftype *cfts;

974

975

if (!test_bit(i, &subsys_mask))

976

if (!test_bit(i, &subsys_mask))

976

continue;

977

continue;

977

list_for_each_entry(cfts, &ss->cfts, node)

978

list_for_each_entry(cfts, &ss->cfts, node)

978

cgroup_addrm_files(cgrp, cfts, false);

979

cgroup_addrm_files(cgrp, cfts, false);

979

}

980

}

980

}

981

}

981

982

static int rebind_subsystems(struct cgroup_root *dst_root,

983

static int rebind_subsystems(struct cgroup_root *dst_root,

983

unsigned long ss_mask)

984

unsigned long ss_mask)

984

{

985

{

985

struct cgroup_subsys *ss;

986

struct cgroup_subsys *ss;

986

int ssid, ret;

987

int ssid, ret;

987

988

lockdep_assert_held(&cgroup_tree_mutex);

989

lockdep_assert_held(&cgroup_tree_mutex);

989

lockdep_assert_held(&cgroup_mutex);

990

lockdep_assert_held(&cgroup_mutex);

990

991

for_each_subsys(ss, ssid) {

992

for_each_subsys(ss, ssid) {

992

if (!(ss_mask & (1 << ssid)))

993

if (!(ss_mask & (1 << ssid)))

993

continue;

994

continue;

994

995

/* if @ss is on the dummy_root, we can always move it */

996

/* if @ss is on the dummy_root, we can always move it */

996

if (ss->root == &cgrp_dfl_root)

997

if (ss->root == &cgrp_dfl_root)

997

continue;

998

continue;

998

999

/* if @ss has non-root cgroups attached to it, can't move */

1000

/* if @ss has non-root cgroups attached to it, can't move */

1000

if (!list_empty(&ss->root->cgrp.children))

1001

if (!list_empty(&ss->root->cgrp.children))

1001

return -EBUSY;

1002

return -EBUSY;

1002

1003

/* can't move between two non-dummy roots either */

1004

/* can't move between two non-dummy roots either */

1004

if (dst_root != &cgrp_dfl_root)

1005

if (dst_root != &cgrp_dfl_root)

1005

return -EBUSY;

1006

return -EBUSY;

1006

}

1007

}

1007

1008

ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask);

1009

ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask);

1009

if (ret) {

1010

if (ret) {

1010

if (dst_root != &cgrp_dfl_root)

1011

if (dst_root != &cgrp_dfl_root)

1011

return ret;

1012

return ret;

1012

1013

/*

1014

/*

1014

* Rebinding back to the default root is not allowed to

1015

* Rebinding back to the default root is not allowed to

1015

* fail. Using both default and non-default roots should

1016

* fail. Using both default and non-default roots should

1016

* be rare. Moving subsystems back and forth even more so.

1017

* be rare. Moving subsystems back and forth even more so.

1017

* Just warn about it and continue.

1018

* Just warn about it and continue.

1018

*/

1019

*/

1019

if (cgrp_dfl_root_visible) {

1020

if (cgrp_dfl_root_visible) {

1020

pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n",

1021

pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n",

1021

ret, ss_mask);

1022

ret, ss_mask);

1022

pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n");

1023

pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n");

1023

}

1024

}

1024

}

1025

}

1025

1026

/*

1027

/*

1027

* Nothing can fail from this point on. Remove files for the

1028

* Nothing can fail from this point on. Remove files for the

1028

* removed subsystems and rebind each subsystem.

1029

* removed subsystems and rebind each subsystem.

1029

*/

1030

*/

1030

mutex_unlock(&cgroup_mutex);

1031

mutex_unlock(&cgroup_mutex);

1031

for_each_subsys(ss, ssid)

1032

for_each_subsys(ss, ssid)

1032

if (ss_mask & (1 << ssid))

1033

if (ss_mask & (1 << ssid))

1033

cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);

1034

cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);

1034

mutex_lock(&cgroup_mutex);

1035

mutex_lock(&cgroup_mutex);

1035

1036

for_each_subsys(ss, ssid) {

1037

for_each_subsys(ss, ssid) {

1037

struct cgroup_root *src_root;

1038

struct cgroup_root *src_root;

1038

struct cgroup_subsys_state *css;

1039

struct cgroup_subsys_state *css;

1039

1040

if (!(ss_mask & (1 << ssid)))

1041

if (!(ss_mask & (1 << ssid)))

1041

continue;

1042

continue;

1042

1043

src_root = ss->root;

1044

src_root = ss->root;

1044

css = cgroup_css(&src_root->cgrp, ss);

1045

css = cgroup_css(&src_root->cgrp, ss);

1045

1046

WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));

1047

WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));

1047

1048

RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);

1049

RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);

1049

rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);

1050

rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);

1050

ss->root = dst_root;

1051

ss->root = dst_root;

1051

css->cgroup = &dst_root->cgrp;

1052

css->cgroup = &dst_root->cgrp;

1052

1053

src_root->cgrp.subsys_mask &= ~(1 << ssid);

1054

src_root->cgrp.subsys_mask &= ~(1 << ssid);

1054

dst_root->cgrp.subsys_mask |= 1 << ssid;

1055

dst_root->cgrp.subsys_mask |= 1 << ssid;

1055

1056

if (ss->bind)

1057

if (ss->bind)

1057

ss->bind(css);

1058

ss->bind(css);

1058

}

1059

}

1059

1060

kernfs_activate(dst_root->cgrp.kn);

1061

kernfs_activate(dst_root->cgrp.kn);

1061

return 0;

1062

return 0;

1062

}

1063

}

1063

1064

static int cgroup_show_options(struct seq_file *seq,

1065

static int cgroup_show_options(struct seq_file *seq,

1065

struct kernfs_root *kf_root)

1066

struct kernfs_root *kf_root)

1066

{

1067

{

1067

struct cgroup_root *root = cgroup_root_from_kf(kf_root);

1068

struct cgroup_root *root = cgroup_root_from_kf(kf_root);

1068

struct cgroup_subsys *ss;

1069

struct cgroup_subsys *ss;

1069

int ssid;

1070

int ssid;

1070

1071

for_each_subsys(ss, ssid)

1072

for_each_subsys(ss, ssid)

1072

if (root->cgrp.subsys_mask & (1 << ssid))

1073

if (root->cgrp.subsys_mask & (1 << ssid))

1073

seq_printf(seq, ",%s", ss->name);

1074

seq_printf(seq, ",%s", ss->name);

1074

if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)

1075

if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)

1075

seq_puts(seq, ",sane_behavior");

1076

seq_puts(seq, ",sane_behavior");

1076

if (root->flags & CGRP_ROOT_NOPREFIX)

1077

if (root->flags & CGRP_ROOT_NOPREFIX)

1077

seq_puts(seq, ",noprefix");

1078

seq_puts(seq, ",noprefix");

1078

if (root->flags & CGRP_ROOT_XATTR)

1079

if (root->flags & CGRP_ROOT_XATTR)

1079

seq_puts(seq, ",xattr");

1080

seq_puts(seq, ",xattr");

1080

1081

spin_lock(&release_agent_path_lock);

1082

spin_lock(&release_agent_path_lock);

1082

if (strlen(root->release_agent_path))

1083

if (strlen(root->release_agent_path))

1083

seq_printf(seq, ",release_agent=%s", root->release_agent_path);

1084

seq_printf(seq, ",release_agent=%s", root->release_agent_path);

1084

spin_unlock(&release_agent_path_lock);

1085

spin_unlock(&release_agent_path_lock);

1085

1086

if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))

1087

if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))

1087

seq_puts(seq, ",clone_children");

1088

seq_puts(seq, ",clone_children");

1088

if (strlen(root->name))

1089

if (strlen(root->name))

1089

seq_printf(seq, ",name=%s", root->name);

1090

seq_printf(seq, ",name=%s", root->name);

1090

return 0;

1091

return 0;

1091

}

1092

}

1092

1093

/* Parsed form of a cgroup mount option string. */
struct cgroup_sb_opts {
	/* One bit per requested subsystem id */
	unsigned long subsys_mask;
	/* CGRP_ROOT_* option flags */
	unsigned long flags;
	/* Heap copy of the "release_agent=" value, or NULL if not given */
	char *release_agent;
	/* Set when "clone_children" was given */
	bool cpuset_clone_children;
	/* Heap copy of the "name=" value, or NULL if not given */
	char *name;
	/* User explicitly requested empty subsystem */
	bool none;
};

1102

1103

/*

1104

/*

1104

* Convert a hierarchy specifier into a bitmask of subsystems and

1105

* Convert a hierarchy specifier into a bitmask of subsystems and

1105

* flags. Call with cgroup_mutex held to protect the cgroup_subsys[]

1106

* flags. Call with cgroup_mutex held to protect the cgroup_subsys[]

1106

* array. This function takes refcounts on subsystems to be used, unless it

1107

* array. This function takes refcounts on subsystems to be used, unless it

1107

* returns error, in which case no refcounts are taken.

1108

* returns error, in which case no refcounts are taken.

1108

*/

1109

*/

1109

static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)

1110

static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)

1110

{

1111

{

1111

char *token, *o = data;

1112

char *token, *o = data;

1112

bool all_ss = false, one_ss = false;

1113

bool all_ss = false, one_ss = false;

1113

unsigned long mask = (unsigned long)-1;

1114

unsigned long mask = (unsigned long)-1;

1114

struct cgroup_subsys *ss;

1115

struct cgroup_subsys *ss;

1115

int i;

1116

int i;

1116

1117

BUG_ON(!mutex_is_locked(&cgroup_mutex));

1118

BUG_ON(!mutex_is_locked(&cgroup_mutex));

1118

1119

#ifdef CONFIG_CPUSETS

1120

#ifdef CONFIG_CPUSETS

1120

mask = ~(1UL << cpuset_cgrp_id);

1121

mask = ~(1UL << cpuset_cgrp_id);

1121

#endif

1122

#endif

1122

1123

memset(opts, 0, sizeof(*opts));

1124

memset(opts, 0, sizeof(*opts));

1124

1125

while ((token = strsep(&o, ",")) != NULL) {

1126

while ((token = strsep(&o, ",")) != NULL) {

1126

if (!*token)

1127

if (!*token)

1127

return -EINVAL;

1128

return -EINVAL;

1128

if (!strcmp(token, "none")) {

1129

if (!strcmp(token, "none")) {

1129

/* Explicitly have no subsystems */

1130

/* Explicitly have no subsystems */

1130

opts->none = true;

1131

opts->none = true;

1131

continue;

1132

continue;

1132

}

1133

}

1133

if (!strcmp(token, "all")) {

1134

if (!strcmp(token, "all")) {

1134

/* Mutually exclusive option 'all' + subsystem name */

1135

/* Mutually exclusive option 'all' + subsystem name */

1135

if (one_ss)

1136

if (one_ss)

1136

return -EINVAL;

1137

return -EINVAL;

1137

all_ss = true;

1138

all_ss = true;

1138

continue;

1139

continue;

1139

}

1140

}

1140

if (!strcmp(token, "__DEVEL__sane_behavior")) {

1141

if (!strcmp(token, "__DEVEL__sane_behavior")) {

1141

opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;

1142

opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;

1142

continue;

1143

continue;

1143

}

1144

}

1144

if (!strcmp(token, "noprefix")) {

1145

if (!strcmp(token, "noprefix")) {

1145

opts->flags |= CGRP_ROOT_NOPREFIX;

1146

opts->flags |= CGRP_ROOT_NOPREFIX;

1146

continue;

1147

continue;

1147

}

1148

}

1148

if (!strcmp(token, "clone_children")) {

1149

if (!strcmp(token, "clone_children")) {

1149

opts->cpuset_clone_children = true;

1150

opts->cpuset_clone_children = true;

1150

continue;

1151

continue;

1151

}

1152

}

1152

if (!strcmp(token, "xattr")) {

1153

if (!strcmp(token, "xattr")) {

1153

opts->flags |= CGRP_ROOT_XATTR;

1154

opts->flags |= CGRP_ROOT_XATTR;

1154

continue;

1155

continue;

1155

}

1156

}

1156

if (!strncmp(token, "release_agent=", 14)) {

1157

if (!strncmp(token, "release_agent=", 14)) {

1157

/* Specifying two release agents is forbidden */

1158

/* Specifying two release agents is forbidden */

1158

if (opts->release_agent)

1159

if (opts->release_agent)

1159

return -EINVAL;

1160

return -EINVAL;

1160

opts->release_agent =

1161

opts->release_agent =

1161

kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);

1162

kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);

1162

if (!opts->release_agent)

1163

if (!opts->release_agent)

1163

return -ENOMEM;

1164

return -ENOMEM;

1164

continue;

1165

continue;

1165

}

1166

}

1166

if (!strncmp(token, "name=", 5)) {

1167

if (!strncmp(token, "name=", 5)) {

1167

const char *name = token + 5;

1168

const char *name = token + 5;

1168

/* Can't specify an empty name */

1169

/* Can't specify an empty name */

1169

if (!strlen(name))

1170

if (!strlen(name))

1170

return -EINVAL;

1171

return -EINVAL;

1171

/* Must match [\w.-]+ */

1172

/* Must match [\w.-]+ */

1172

for (i = 0; i < strlen(name); i++) {

1173

for (i = 0; i < strlen(name); i++) {

1173

char c = name[i];

1174

char c = name[i];

1174

if (isalnum(c))

1175

if (isalnum(c))

1175

continue;

1176

continue;

1176

if ((c == '.') || (c == '-') || (c == '_'))

1177

if ((c == '.') || (c == '-') || (c == '_'))

1177

continue;

1178

continue;

1178

return -EINVAL;

1179

return -EINVAL;

1179

}

1180

}

1180

/* Specifying two names is forbidden */

1181

/* Specifying two names is forbidden */

1181

if (opts->name)

1182

if (opts->name)

1182

return -EINVAL;

1183

return -EINVAL;

1183

opts->name = kstrndup(name,

1184

opts->name = kstrndup(name,

1184

MAX_CGROUP_ROOT_NAMELEN - 1,

1185

MAX_CGROUP_ROOT_NAMELEN - 1,

1185

GFP_KERNEL);

1186

GFP_KERNEL);

1186

if (!opts->name)

1187

if (!opts->name)

1187

return -ENOMEM;

1188

return -ENOMEM;

1188

1189

continue;

1190

continue;

1190

}

1191

}

1191

1192

for_each_subsys(ss, i) {

1193

for_each_subsys(ss, i) {

1193

if (strcmp(token, ss->name))

1194

if (strcmp(token, ss->name))

1194

continue;

1195

continue;

1195

if (ss->disabled)

1196

if (ss->disabled)

1196

continue;

1197

continue;

1197

1198

/* Mutually exclusive option 'all' + subsystem name */

1199

/* Mutually exclusive option 'all' + subsystem name */

1199

if (all_ss)

1200

if (all_ss)

1200

return -EINVAL;

1201

return -EINVAL;

1201

set_bit(i, &opts->subsys_mask);

1202

set_bit(i, &opts->subsys_mask);

1202

one_ss = true;

1203

one_ss = true;

1203

1204

break;

1205

break;

1205

}

1206

}

1206

if (i == CGROUP_SUBSYS_COUNT)

1207

if (i == CGROUP_SUBSYS_COUNT)

1207

return -ENOENT;

1208

return -ENOENT;

1208

}

1209

}

1209

1210

/* Consistency checks */

1211

/* Consistency checks */

1211

1212

if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {

1213

if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {

1213

pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");

1214

pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");

1214

1215

if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||

1216

if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||

1216

opts->cpuset_clone_children || opts->release_agent ||

1217

opts->cpuset_clone_children || opts->release_agent ||

1217

opts->name) {

1218

opts->name) {

1218

pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");

1219

pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");

1219

return -EINVAL;

1220

return -EINVAL;

1220

}

1221

}

1221

} else {

1222

} else {

1222

/*

1223

/*

1223

* If the 'all' option was specified select all the

1224

* If the 'all' option was specified select all the

1224

* subsystems, otherwise if 'none', 'name=' and a subsystem

1225

* subsystems, otherwise if 'none', 'name=' and a subsystem

1225

* name options were not specified, let's default to 'all'

1226

* name options were not specified, let's default to 'all'

1226

*/

1227

*/

1227

if (all_ss || (!one_ss && !opts->none && !opts->name))

1228

if (all_ss || (!one_ss && !opts->none && !opts->name))

1228

for_each_subsys(ss, i)

1229

for_each_subsys(ss, i)

1229

if (!ss->disabled)

1230

if (!ss->disabled)

1230

set_bit(i, &opts->subsys_mask);

1231

set_bit(i, &opts->subsys_mask);

1231

1232

/*

1233

/*

1233

* We either have to specify by name or by subsystems. (So

1234

* We either have to specify by name or by subsystems. (So

1234

* all empty hierarchies must have a name).

1235

* all empty hierarchies must have a name).

1235

*/

1236

*/

1236

if (!opts->subsys_mask && !opts->name)

1237

if (!opts->subsys_mask && !opts->name)

1237

return -EINVAL;

1238

return -EINVAL;

1238

}

1239

}

1239

1240

/*

1241

/*

1241

* Option noprefix was introduced just for backward compatibility

1242

* Option noprefix was introduced just for backward compatibility

1242

* with the old cpuset, so we allow noprefix only if mounting just

1243

* with the old cpuset, so we allow noprefix only if mounting just

1243

* the cpuset subsystem.

1244

* the cpuset subsystem.

1244

*/

1245

*/

1245

if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))

1246

if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))

1246

return -EINVAL;

1247

return -EINVAL;

1247

1248

1249

/* Can't specify "none" and some subsystems */

1250

/* Can't specify "none" and some subsystems */

1250

if (opts->subsys_mask && opts->none)

1251

if (opts->subsys_mask && opts->none)

1251

return -EINVAL;

1252

return -EINVAL;

1252

1253

return 0;

1254

return 0;

1254

}

1255

}

1255

1256

static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)

1257

static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)

1257

{

1258

{

1258

int ret = 0;

1259

int ret = 0;

1259

struct cgroup_root *root = cgroup_root_from_kf(kf_root);

1260

struct cgroup_root *root = cgroup_root_from_kf(kf_root);

1260

struct cgroup_sb_opts opts;

1261

struct cgroup_sb_opts opts;

1261

unsigned long added_mask, removed_mask;

1262

unsigned long added_mask, removed_mask;

1262

1263

if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {

1264

if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {

1264

pr_err("cgroup: sane_behavior: remount is not allowed\n");

1265

pr_err("cgroup: sane_behavior: remount is not allowed\n");

1265

return -EINVAL;

1266

return -EINVAL;

1266

}

1267

}

1267

1268

mutex_lock(&cgroup_tree_mutex);

1269

mutex_lock(&cgroup_tree_mutex);

1269

mutex_lock(&cgroup_mutex);

1270

mutex_lock(&cgroup_mutex);

1270

1271

/* See what subsystems are wanted */

1272

/* See what subsystems are wanted */

1272

ret = parse_cgroupfs_options(data, &opts);

1273

ret = parse_cgroupfs_options(data, &opts);

1273

if (ret)

1274

if (ret)

1274

goto out_unlock;

1275

goto out_unlock;

1275

1276

if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent)

1277

if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent)

1277

pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",

1278

pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",

1278

task_tgid_nr(current), current->comm);

1279

task_tgid_nr(current), current->comm);

1279

1280

added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask;

1281

added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask;

1281

removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask;

1282

removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask;

1282

1283

/* Don't allow flags or name to change at remount */

1284

/* Don't allow flags or name to change at remount */

1284

if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||

1285

if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||

1285

(opts.name && strcmp(opts.name, root->name))) {

1286

(opts.name && strcmp(opts.name, root->name))) {

1286

pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",

1287

pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",

1287

opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",

1288

opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",

1288

root->flags & CGRP_ROOT_OPTION_MASK, root->name);

1289

root->flags & CGRP_ROOT_OPTION_MASK, root->name);

1289

ret = -EINVAL;

1290

ret = -EINVAL;

1290

goto out_unlock;

1291

goto out_unlock;

1291

}

1292

}

1292

1293

/* remounting is not allowed for populated hierarchies */

1294

/* remounting is not allowed for populated hierarchies */

1294

if (!list_empty(&root->cgrp.children)) {

1295

if (!list_empty(&root->cgrp.children)) {

1295

ret = -EBUSY;

1296

ret = -EBUSY;

1296

goto out_unlock;

1297

goto out_unlock;

1297

}

1298

}

1298

1299

ret = rebind_subsystems(root, added_mask);

1300

ret = rebind_subsystems(root, added_mask);

1300

if (ret)

1301

if (ret)

1301

goto out_unlock;

1302

goto out_unlock;

1302

1303

rebind_subsystems(&cgrp_dfl_root, removed_mask);

1304

rebind_subsystems(&cgrp_dfl_root, removed_mask);

1304

1305

if (opts.release_agent) {

1306

if (opts.release_agent) {

1306

spin_lock(&release_agent_path_lock);

1307

spin_lock(&release_agent_path_lock);

1307

strcpy(root->release_agent_path, opts.release_agent);

1308

strcpy(root->release_agent_path, opts.release_agent);

1308

spin_unlock(&release_agent_path_lock);

1309

spin_unlock(&release_agent_path_lock);

1309

}

1310

}

1310

out_unlock:

1311

out_unlock:

1311

kfree(opts.release_agent);

1312

kfree(opts.release_agent);

1312

kfree(opts.name);

1313

kfree(opts.name);

1313

mutex_unlock(&cgroup_mutex);

1314

mutex_unlock(&cgroup_mutex);

1314

mutex_unlock(&cgroup_tree_mutex);

1315

mutex_unlock(&cgroup_tree_mutex);

1315

return ret;

1316

return ret;

1316

}

1317

}

1317

1318

/*

1319

/*

1319

* To reduce the fork() overhead for systems that are not actually using

1320

* To reduce the fork() overhead for systems that are not actually using

1320

* their cgroups capability, we don't maintain the lists running through

1321

* their cgroups capability, we don't maintain the lists running through

1321

* each css_set to its tasks until we see the list actually used - in other

1322

* each css_set to its tasks until we see the list actually used - in other

1322

* words after the first mount.

1323

* words after the first mount.

1323

*/

1324

*/

1324

static bool use_task_css_set_links __read_mostly;

1325

static bool use_task_css_set_links __read_mostly;

1325

1326

static void cgroup_enable_task_cg_lists(void)

1327

static void cgroup_enable_task_cg_lists(void)

1327

{

1328

{

1328

struct task_struct *p, *g;

1329

struct task_struct *p, *g;

1329

1330

down_write(&css_set_rwsem);

1331

down_write(&css_set_rwsem);

1331

1332

if (use_task_css_set_links)

1333

if (use_task_css_set_links)

1333

goto out_unlock;

1334

goto out_unlock;

1334

1335

use_task_css_set_links = true;

1336

use_task_css_set_links = true;

1336

1337

/*

1338

/*

1338

* We need tasklist_lock because RCU is not safe against

1339

* We need tasklist_lock because RCU is not safe against

1339

* while_each_thread(). Besides, a forking task that has passed

1340

* while_each_thread(). Besides, a forking task that has passed

1340

* cgroup_post_fork() without seeing use_task_css_set_links = 1

1341

* cgroup_post_fork() without seeing use_task_css_set_links = 1

1341

* is not guaranteed to have its child immediately visible in the

1342

* is not guaranteed to have its child immediately visible in the

1342

* tasklist if we walk through it with RCU.

1343

* tasklist if we walk through it with RCU.

1343

*/

1344

*/

1344

read_lock(&tasklist_lock);

1345

read_lock(&tasklist_lock);

1345

do_each_thread(g, p) {

1346

do_each_thread(g, p) {

1346

WARN_ON_ONCE(!list_empty(&p->cg_list) ||

1347

WARN_ON_ONCE(!list_empty(&p->cg_list) ||

1347

task_css_set(p) != &init_css_set);

1348

task_css_set(p) != &init_css_set);

1348

1349

/*

1350

/*

1350

* We should check if the process is exiting, otherwise

1351

* We should check if the process is exiting, otherwise

1351

* it will race with cgroup_exit() in that the list

1352

* it will race with cgroup_exit() in that the list

1352

* entry won't be deleted though the process has exited.

1353

* entry won't be deleted though the process has exited.

1353

* Do it while holding siglock so that we don't end up

1354

* Do it while holding siglock so that we don't end up

1354

* racing against cgroup_exit().

1355

* racing against cgroup_exit().

1355

*/

1356

*/

1356

spin_lock_irq(&p->sighand->siglock);

1357

spin_lock_irq(&p->sighand->siglock);

1357

if (!(p->flags & PF_EXITING)) {

1358

if (!(p->flags & PF_EXITING)) {

1358

struct css_set *cset = task_css_set(p);

1359

struct css_set *cset = task_css_set(p);

1359

1360

list_add(&p->cg_list, &cset->tasks);

1361

list_add(&p->cg_list, &cset->tasks);

1361

get_css_set(cset);

1362

get_css_set(cset);

1362

}

1363

}

1363

spin_unlock_irq(&p->sighand->siglock);

1364

spin_unlock_irq(&p->sighand->siglock);

1364

} while_each_thread(g, p);

1365

} while_each_thread(g, p);

1365

read_unlock(&tasklist_lock);

1366

read_unlock(&tasklist_lock);

1366

out_unlock:

1367

out_unlock:

1367

up_write(&css_set_rwsem);

1368

up_write(&css_set_rwsem);

1368

}

1369

}

1369

1370

static void init_cgroup_housekeeping(struct cgroup *cgrp)

1371

static void init_cgroup_housekeeping(struct cgroup *cgrp)

1371

{

1372

{

1372

atomic_set(&cgrp->refcnt, 1);

1373

atomic_set(&cgrp->refcnt, 1);

1373

INIT_LIST_HEAD(&cgrp->sibling);

1374

INIT_LIST_HEAD(&cgrp->sibling);

1374

INIT_LIST_HEAD(&cgrp->children);

1375

INIT_LIST_HEAD(&cgrp->children);

1375

INIT_LIST_HEAD(&cgrp->cset_links);

1376

INIT_LIST_HEAD(&cgrp->cset_links);

1376

INIT_LIST_HEAD(&cgrp->release_list);

1377

INIT_LIST_HEAD(&cgrp->release_list);

1377

INIT_LIST_HEAD(&cgrp->pidlists);

1378

INIT_LIST_HEAD(&cgrp->pidlists);

1378

mutex_init(&cgrp->pidlist_mutex);

1379

mutex_init(&cgrp->pidlist_mutex);

1379

cgrp->dummy_css.cgroup = cgrp;

1380

cgrp->dummy_css.cgroup = cgrp;

1380

}

1381

}

1381

1382

static void init_cgroup_root(struct cgroup_root *root,

1383

static void init_cgroup_root(struct cgroup_root *root,

1383

struct cgroup_sb_opts *opts)

1384

struct cgroup_sb_opts *opts)

1384

{

1385

{

1385

struct cgroup *cgrp = &root->cgrp;

1386

struct cgroup *cgrp = &root->cgrp;

1386

1387

INIT_LIST_HEAD(&root->root_list);

1388

INIT_LIST_HEAD(&root->root_list);

1388

atomic_set(&root->nr_cgrps, 1);

1389

atomic_set(&root->nr_cgrps, 1);

1389

cgrp->root = root;

1390

cgrp->root = root;

1390

init_cgroup_housekeeping(cgrp);

1391

init_cgroup_housekeeping(cgrp);

1391

idr_init(&root->cgroup_idr);

1392

idr_init(&root->cgroup_idr);

1392

1393

root->flags = opts->flags;

1394

root->flags = opts->flags;

1394

if (opts->release_agent)

1395

if (opts->release_agent)

1395

strcpy(root->release_agent_path, opts->release_agent);

1396

strcpy(root->release_agent_path, opts->release_agent);

1396

if (opts->name)

1397

if (opts->name)

1397

strcpy(root->name, opts->name);

1398

strcpy(root->name, opts->name);

1398

if (opts->cpuset_clone_children)

1399

if (opts->cpuset_clone_children)

1399

set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);

1400

set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);

1400

}

1401

}

1401

1402

static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)

1403

static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)

1403

{

1404

{

1404

LIST_HEAD(tmp_links);

1405

LIST_HEAD(tmp_links);

1405

struct cgroup *root_cgrp = &root->cgrp;

1406

struct cgroup *root_cgrp = &root->cgrp;

1406

struct css_set *cset;

1407

struct css_set *cset;

1407

int i, ret;

1408

int i, ret;

1408

1409

lockdep_assert_held(&cgroup_tree_mutex);

1410

lockdep_assert_held(&cgroup_tree_mutex);

1410

lockdep_assert_held(&cgroup_mutex);

1411

lockdep_assert_held(&cgroup_mutex);

1411

1412

ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);

1413

ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);

1413

if (ret < 0)

1414

if (ret < 0)

1414

goto out;

1415

goto out;

1415

root_cgrp->id = ret;

1416

root_cgrp->id = ret;

1416

1417

/*

1418

/*

1418

* We're accessing css_set_count without locking css_set_rwsem here,

1419

* We're accessing css_set_count without locking css_set_rwsem here,

1419

* but that's OK - it can only be increased by someone holding

1420

* but that's OK - it can only be increased by someone holding

1420

* cgroup_lock, and that's us. The worst that can happen is that we

1421

* cgroup_lock, and that's us. The worst that can happen is that we

1421

* have some link structures left over

1422

* have some link structures left over

1422

*/

1423

*/

1423

ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);

1424

ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);

1424

if (ret)

1425

if (ret)

1425

goto out;

1426

goto out;

1426

1427

ret = cgroup_init_root_id(root);

1428

ret = cgroup_init_root_id(root);

1428

if (ret)

1429

if (ret)

1429

goto out;

1430

goto out;

1430

1431

root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,

1432

root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,

1432

KERNFS_ROOT_CREATE_DEACTIVATED,

1433

KERNFS_ROOT_CREATE_DEACTIVATED,

1433

root_cgrp);

1434

root_cgrp);

1434

if (IS_ERR(root->kf_root)) {

1435

if (IS_ERR(root->kf_root)) {

1435

ret = PTR_ERR(root->kf_root);

1436

ret = PTR_ERR(root->kf_root);

1436

goto exit_root_id;

1437

goto exit_root_id;

1437

}

1438

}

1438

root_cgrp->kn = root->kf_root->kn;

1439

root_cgrp->kn = root->kf_root->kn;

1439

1440

ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);

1441

ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);

1441

if (ret)

1442

if (ret)

1442

goto destroy_root;

1443

goto destroy_root;

1443

1444

ret = rebind_subsystems(root, ss_mask);

1445

ret = rebind_subsystems(root, ss_mask);

1445

if (ret)

1446

if (ret)

1446

goto destroy_root;

1447

goto destroy_root;

1447

1448

/*

1449

/*

1449

* There must be no failure case after here, since rebinding takes

1450

* There must be no failure case after here, since rebinding takes

1450

* care of subsystems' refcounts, which are explicitly dropped in

1451

* care of subsystems' refcounts, which are explicitly dropped in

1451

* the failure exit path.

1452

* the failure exit path.

1452

*/

1453

*/

1453

list_add(&root->root_list, &cgroup_roots);

1454

list_add(&root->root_list, &cgroup_roots);

1454

cgroup_root_count++;

1455

cgroup_root_count++;

1455

1456

/*

1457

/*

1457

* Link the root cgroup in this hierarchy into all the css_set

1458

* Link the root cgroup in this hierarchy into all the css_set

1458

* objects.

1459

* objects.

1459

*/

1460

*/

1460

down_write(&css_set_rwsem);

1461

down_write(&css_set_rwsem);

1461

hash_for_each(css_set_table, i, cset, hlist)

1462

hash_for_each(css_set_table, i, cset, hlist)

1462

link_css_set(&tmp_links, cset, root_cgrp);

1463

link_css_set(&tmp_links, cset, root_cgrp);

1463

up_write(&css_set_rwsem);

1464

up_write(&css_set_rwsem);

1464

1465

BUG_ON(!list_empty(&root_cgrp->children));

1466

BUG_ON(!list_empty(&root_cgrp->children));

1466

BUG_ON(atomic_read(&root->nr_cgrps) != 1);

1467

BUG_ON(atomic_read(&root->nr_cgrps) != 1);

1467

1468

kernfs_activate(root_cgrp->kn);

1469

kernfs_activate(root_cgrp->kn);

1469

ret = 0;

1470

ret = 0;

1470

goto out;

1471

goto out;

1471

1472

destroy_root:

1473

destroy_root:

1473

kernfs_destroy_root(root->kf_root);

1474

kernfs_destroy_root(root->kf_root);

1474

root->kf_root = NULL;

1475

root->kf_root = NULL;

1475

exit_root_id:

1476

exit_root_id:

1476

cgroup_exit_root_id(root);

1477

cgroup_exit_root_id(root);

1477

out:

1478

out:

1478

free_cgrp_cset_links(&tmp_links);

1479

free_cgrp_cset_links(&tmp_links);

1479

return ret;

1480

return ret;

1480

}

1481

}

1481

1482

static struct dentry *cgroup_mount(struct file_system_type *fs_type,

1483

static struct dentry *cgroup_mount(struct file_system_type *fs_type,

1483

int flags, const char *unused_dev_name,

1484

int flags, const char *unused_dev_name,

1484

void *data)

1485

void *data)

1485

{

1486

{

1486

struct cgroup_root *root;

1487

struct cgroup_root *root;

1487

struct cgroup_sb_opts opts;

1488

struct cgroup_sb_opts opts;

1488

struct dentry *dentry;

1489

struct dentry *dentry;

1489

int ret;

1490

int ret;

1490

bool new_sb;

1491

bool new_sb;

1491

1492

/*

1493

/*

1493

* The first time anyone tries to mount a cgroup, enable the list

1494

* The first time anyone tries to mount a cgroup, enable the list

1494

* linking each css_set to its tasks and fix up all existing tasks.

1495

* linking each css_set to its tasks and fix up all existing tasks.

1495

*/

1496

*/

1496

if (!use_task_css_set_links)

1497

if (!use_task_css_set_links)

1497

cgroup_enable_task_cg_lists();

1498

cgroup_enable_task_cg_lists();

1498

1499

mutex_lock(&cgroup_tree_mutex);

1500

mutex_lock(&cgroup_tree_mutex);

1500

mutex_lock(&cgroup_mutex);

1501

mutex_lock(&cgroup_mutex);

1501

1502

/* First find the desired set of subsystems */

1503

/* First find the desired set of subsystems */

1503

ret = parse_cgroupfs_options(data, &opts);

1504

ret = parse_cgroupfs_options(data, &opts);

1504

if (ret)

1505

if (ret)

1505

goto out_unlock;

1506

goto out_unlock;

1506

retry:

1507

retry:

1507

/* look for a matching existing root */

1508

/* look for a matching existing root */

1508

if (!opts.subsys_mask && !opts.none && !opts.name) {

1509

if (!opts.subsys_mask && !opts.none && !opts.name) {

1509

cgrp_dfl_root_visible = true;

1510

cgrp_dfl_root_visible = true;

1510

root = &cgrp_dfl_root;

1511

root = &cgrp_dfl_root;

1511

cgroup_get(&root->cgrp);

1512

cgroup_get(&root->cgrp);

1512

ret = 0;

1513

ret = 0;

1513

goto out_unlock;

1514

goto out_unlock;

1514

}

1515

}

1515

1516

for_each_root(root) {

1517

for_each_root(root) {

1517

bool name_match = false;

1518

bool name_match = false;

1518

1519

if (root == &cgrp_dfl_root)

1520

if (root == &cgrp_dfl_root)

1520

continue;

1521

continue;

1521

1522

/*

1523

/*

1523

* If we asked for a name then it must match. Also, if

1524

* If we asked for a name then it must match. Also, if

1524

* name matches but sybsys_mask doesn't, we should fail.

1525

* name matches but sybsys_mask doesn't, we should fail.

1525

* Remember whether name matched.

1526

* Remember whether name matched.

1526

*/

1527

*/

1527

if (opts.name) {

1528

if (opts.name) {

1528

if (strcmp(opts.name, root->name))

1529

if (strcmp(opts.name, root->name))

1529

continue;

1530

continue;

1530

name_match = true;

1531

name_match = true;

1531

}

1532

}

1532

1533

/*

1534

/*

1534

* If we asked for subsystems (or explicitly for no

1535

* If we asked for subsystems (or explicitly for no

1535

* subsystems) then they must match.

1536

* subsystems) then they must match.

1536

*/

1537

*/

1537

if ((opts.subsys_mask || opts.none) &&

1538

if ((opts.subsys_mask || opts.none) &&

1538

(opts.subsys_mask != root->cgrp.subsys_mask)) {

1539

(opts.subsys_mask != root->cgrp.subsys_mask)) {

1539

if (!name_match)

1540

if (!name_match)

1540

continue;

1541

continue;

1541

ret = -EBUSY;

1542

ret = -EBUSY;

1542

goto out_unlock;

1543

goto out_unlock;

1543

}

1544

}

1544

1545

if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {

1546

if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {

1546

if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {

1547

if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {

1547

pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");

1548

pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");

1548

ret = -EINVAL;

1549

ret = -EINVAL;

1549

goto out_unlock;

1550

goto out_unlock;

1550

} else {

1551

} else {

1551

pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");

1552

pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");

1552

}

1553

}

1553

}

1554

}

1554

1555

/*

1556

/*

1556

* A root's lifetime is governed by its root cgroup. Zero

1557

* A root's lifetime is governed by its root cgroup. Zero

1557

* ref indicate that the root is being destroyed. Wait for

1558

* ref indicate that the root is being destroyed. Wait for

1558

* destruction to complete so that the subsystems are free.

1559

* destruction to complete so that the subsystems are free.

1559

* We can use wait_queue for the wait but this path is

1560

* We can use wait_queue for the wait but this path is

1560

* super cold. Let's just sleep for a bit and retry.

1561

* super cold. Let's just sleep for a bit and retry.

1561

*/

1562

*/

1562

if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {

1563

if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {

1563

mutex_unlock(&cgroup_mutex);

1564

mutex_unlock(&cgroup_mutex);

1564

mutex_unlock(&cgroup_tree_mutex);

1565

mutex_unlock(&cgroup_tree_mutex);

1565

msleep(10);

1566

msleep(10);

1566

mutex_lock(&cgroup_tree_mutex);

1567

mutex_lock(&cgroup_tree_mutex);

1567

mutex_lock(&cgroup_mutex);

1568

mutex_lock(&cgroup_mutex);

1568

goto retry;

1569

goto retry;

1569

}

1570

}

1570

1571

ret = 0;

1572

ret = 0;

1572

goto out_unlock;

1573

goto out_unlock;

1573

}

1574

}

1574

1575

/*

1576

/*

1576

* No such thing, create a new one. name= matching without subsys

1577

* No such thing, create a new one. name= matching without subsys

1577

* specification is allowed for already existing hierarchies but we

1578

* specification is allowed for already existing hierarchies but we

1578

* can't create new one without subsys specification.

1579

* can't create new one without subsys specification.

1579

*/

1580

*/

1580

if (!opts.subsys_mask && !opts.none) {

1581

if (!opts.subsys_mask && !opts.none) {

1581

ret = -EINVAL;

1582

ret = -EINVAL;

1582

goto out_unlock;

1583

goto out_unlock;

1583

}

1584

}

1584

1585

root = kzalloc(sizeof(*root), GFP_KERNEL);

1586

root = kzalloc(sizeof(*root), GFP_KERNEL);

1586

if (!root) {

1587

if (!root) {

1587

ret = -ENOMEM;

1588

ret = -ENOMEM;

1588

goto out_unlock;

1589

goto out_unlock;

1589

}

1590

}

1590

1591

init_cgroup_root(root, &opts);

1592

init_cgroup_root(root, &opts);

1592

1593

ret = cgroup_setup_root(root, opts.subsys_mask);

1594

ret = cgroup_setup_root(root, opts.subsys_mask);

1594

if (ret)

1595

if (ret)

1595

cgroup_free_root(root);

1596

cgroup_free_root(root);

1596

1597

out_unlock:

1598

out_unlock:

1598

mutex_unlock(&cgroup_mutex);

1599

mutex_unlock(&cgroup_mutex);

1599

mutex_unlock(&cgroup_tree_mutex);

1600

mutex_unlock(&cgroup_tree_mutex);

1600

1601

kfree(opts.release_agent);

1602

kfree(opts.release_agent);

1602

kfree(opts.name);

1603

kfree(opts.name);

1603

1604

if (ret)

1605

if (ret)

1605

return ERR_PTR(ret);

1606

return ERR_PTR(ret);

1606

1607

dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb);

1608

dentry = kernfs_mount(fs_type, flags, root->kf_root,

1609

CGROUP_SUPER_MAGIC, &new_sb);

1608

if (IS_ERR(dentry) || !new_sb)

1610

if (IS_ERR(dentry) || !new_sb)

1609

cgroup_put(&root->cgrp);

1611

cgroup_put(&root->cgrp);

1610

return dentry;

1612

return dentry;

1611

}

1613

}

1612

1614

1613

static void cgroup_kill_sb(struct super_block *sb)

1615

static void cgroup_kill_sb(struct super_block *sb)

1614

{

1616

{

1615

struct kernfs_root *kf_root = kernfs_root_from_sb(sb);

1617

struct kernfs_root *kf_root = kernfs_root_from_sb(sb);

1616

struct cgroup_root *root = cgroup_root_from_kf(kf_root);

1618

struct cgroup_root *root = cgroup_root_from_kf(kf_root);

1617

1619

1618

cgroup_put(&root->cgrp);

1620

cgroup_put(&root->cgrp);

1619

kernfs_kill_sb(sb);

1621

kernfs_kill_sb(sb);

1620

}

1622

}

1621

1623

1622

static struct file_system_type cgroup_fs_type = {

1624

static struct file_system_type cgroup_fs_type = {

1623

.name = "cgroup",

1625

.name = "cgroup",

1624

.mount = cgroup_mount,

1626

.mount = cgroup_mount,

1625

.kill_sb = cgroup_kill_sb,

1627

.kill_sb = cgroup_kill_sb,

1626

};

1628

};

1627

1629

1628

static struct kobject *cgroup_kobj;

1630

static struct kobject *cgroup_kobj;

1629

1631

1630

/**

1632

/**

1631

* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy

1633

* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy

1632

* @task: target task

1634

* @task: target task

1633

* @buf: the buffer to write the path into

1635

* @buf: the buffer to write the path into

1634

* @buflen: the length of the buffer

1636

* @buflen: the length of the buffer

1635

*

1637

*

1636

* Determine @task's cgroup on the first (the one with the lowest non-zero

1638

* Determine @task's cgroup on the first (the one with the lowest non-zero

1637

* hierarchy_id) cgroup hierarchy and copy its path into @buf. This

1639

* hierarchy_id) cgroup hierarchy and copy its path into @buf. This

1638

* function grabs cgroup_mutex and shouldn't be used inside locks used by

1640

* function grabs cgroup_mutex and shouldn't be used inside locks used by

1639

* cgroup controller callbacks.

1641

* cgroup controller callbacks.

1640

*

1642

*

1641

* Return value is the same as kernfs_path().

1643

* Return value is the same as kernfs_path().

1642

*/

1644

*/

1643

char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)

1645

char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)

1644

{

1646

{

1645

struct cgroup_root *root;

1647

struct cgroup_root *root;

1646

struct cgroup *cgrp;

1648

struct cgroup *cgrp;

1647

int hierarchy_id = 1;

1649

int hierarchy_id = 1;

1648

char *path = NULL;

1650

char *path = NULL;

1649

1651

1650

mutex_lock(&cgroup_mutex);

1652

mutex_lock(&cgroup_mutex);

1651

down_read(&css_set_rwsem);

1653

down_read(&css_set_rwsem);

1652

1654

1653

root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);

1655

root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);

1654

1656

1655

if (root) {

1657

if (root) {

1656

cgrp = task_cgroup_from_root(task, root);

1658

cgrp = task_cgroup_from_root(task, root);

1657

path = cgroup_path(cgrp, buf, buflen);

1659

path = cgroup_path(cgrp, buf, buflen);

1658

} else {

1660

} else {

1659

/* if no hierarchy exists, everyone is in "/" */

1661

/* if no hierarchy exists, everyone is in "/" */

1660

if (strlcpy(buf, "/", buflen) < buflen)

1662

if (strlcpy(buf, "/", buflen) < buflen)

1661

path = buf;

1663

path = buf;

1662

}

1664

}

1663

1665

1664

up_read(&css_set_rwsem);

1666

up_read(&css_set_rwsem);

1665

mutex_unlock(&cgroup_mutex);

1667

mutex_unlock(&cgroup_mutex);

1666

return path;

1668

return path;

1667

}

1669

}

1668

EXPORT_SYMBOL_GPL(task_cgroup_path);

1670

EXPORT_SYMBOL_GPL(task_cgroup_path);

1669

1671

1670

/* used to track tasks and other necessary states during migration */

1672

/* used to track tasks and other necessary states during migration */

1671

struct cgroup_taskset {

1673

struct cgroup_taskset {

1672

/* the src and dst cset list running through cset->mg_node */

1674

/* the src and dst cset list running through cset->mg_node */

1673

struct list_head src_csets;

1675

struct list_head src_csets;

1674

struct list_head dst_csets;

1676

struct list_head dst_csets;

1675

1677

1676

/*

1678

/*

1677

* Fields for cgroup_taskset_*() iteration.

1679

* Fields for cgroup_taskset_*() iteration.

1678

*

1680

*

1679

* Before migration is committed, the target migration tasks are on

1681

* Before migration is committed, the target migration tasks are on

1680

* ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of

1682

* ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of

1681

* the csets on ->dst_csets. ->csets point to either ->src_csets

1683

* the csets on ->dst_csets. ->csets point to either ->src_csets

1682

* or ->dst_csets depending on whether migration is committed.

1684

* or ->dst_csets depending on whether migration is committed.

1683

*

1685

*

1684

* ->cur_csets and ->cur_task point to the current task position

1686

* ->cur_csets and ->cur_task point to the current task position

1685

* during iteration.

1687

* during iteration.

1686

*/

1688

*/

1687

struct list_head *csets;

1689

struct list_head *csets;

1688

struct css_set *cur_cset;

1690

struct css_set *cur_cset;

1689

struct task_struct *cur_task;

1691

struct task_struct *cur_task;

1690

};

1692

};

1691

1693

1692

/**

1694

/**

1693

* cgroup_taskset_first - reset taskset and return the first task

1695

* cgroup_taskset_first - reset taskset and return the first task

1694

* @tset: taskset of interest

1696

* @tset: taskset of interest

1695

*

1697

*

1696

* @tset iteration is initialized and the first task is returned.

1698

* @tset iteration is initialized and the first task is returned.

1697

*/

1699

*/

1698

struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)

1700

struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)

1699

{

1701

{

1700

tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);

1702

tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);

1701

tset->cur_task = NULL;

1703

tset->cur_task = NULL;

1702

1704

1703

return cgroup_taskset_next(tset);

1705

return cgroup_taskset_next(tset);

1704

}

1706

}

1705

1707

1706

/**

1708

/**

1707

* cgroup_taskset_next - iterate to the next task in taskset

1709

* cgroup_taskset_next - iterate to the next task in taskset

1708

* @tset: taskset of interest

1710

* @tset: taskset of interest

1709

*

1711

*

1710

* Return the next task in @tset. Iteration must have been initialized

1712

* Return the next task in @tset. Iteration must have been initialized

1711

* with cgroup_taskset_first().

1713

* with cgroup_taskset_first().

1712

*/

1714

*/

1713

struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)

1715

struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)

1714

{

1716

{

1715

struct css_set *cset = tset->cur_cset;

1717

struct css_set *cset = tset->cur_cset;

1716

struct task_struct *task = tset->cur_task;

1718

struct task_struct *task = tset->cur_task;

1717

1719

1718

while (&cset->mg_node != tset->csets) {

1720

while (&cset->mg_node != tset->csets) {

1719

if (!task)

1721

if (!task)

1720

task = list_first_entry(&cset->mg_tasks,

1722

task = list_first_entry(&cset->mg_tasks,

1721

struct task_struct, cg_list);

1723

struct task_struct, cg_list);

1722

else

1724

else

1723

task = list_next_entry(task, cg_list);

1725

task = list_next_entry(task, cg_list);

1724

1726

1725

if (&task->cg_list != &cset->mg_tasks) {

1727

if (&task->cg_list != &cset->mg_tasks) {

1726

tset->cur_cset = cset;

1728

tset->cur_cset = cset;

1727

tset->cur_task = task;

1729

tset->cur_task = task;

1728

return task;

1730

return task;

1729

}

1731

}

1730

1732

1731

cset = list_next_entry(cset, mg_node);

1733

cset = list_next_entry(cset, mg_node);

1732

task = NULL;

1734

task = NULL;

1733

}

1735

}

1734

1736

1735

return NULL;

1737

return NULL;

1736

}

1738

}

1737

1739

1738

/**

1740

/**

1739

* cgroup_task_migrate - move a task from one cgroup to another.

1741

* cgroup_task_migrate - move a task from one cgroup to another.

1740

* @old_cgrp; the cgroup @tsk is being migrated from

1742

* @old_cgrp; the cgroup @tsk is being migrated from

1741

* @tsk: the task being migrated

1743

* @tsk: the task being migrated

1742

* @new_cset: the new css_set @tsk is being attached to

1744

* @new_cset: the new css_set @tsk is being attached to

1743

*

1745

*

1744

* Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.

1746

* Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.

1745

*/

1747

*/

1746

static void cgroup_task_migrate(struct cgroup *old_cgrp,

1748

static void cgroup_task_migrate(struct cgroup *old_cgrp,

1747

struct task_struct *tsk,

1749

struct task_struct *tsk,

1748

struct css_set *new_cset)

1750

struct css_set *new_cset)

1749

{

1751

{

1750

struct css_set *old_cset;

1752

struct css_set *old_cset;

1751

1753

1752

lockdep_assert_held(&cgroup_mutex);

1754

lockdep_assert_held(&cgroup_mutex);

1753

lockdep_assert_held(&css_set_rwsem);

1755

lockdep_assert_held(&css_set_rwsem);

1754

1756

1755

/*

1757

/*

1756

* We are synchronized through threadgroup_lock() against PF_EXITING

1758

* We are synchronized through threadgroup_lock() against PF_EXITING

1757

* setting such that we can't race against cgroup_exit() changing the

1759

* setting such that we can't race against cgroup_exit() changing the

1758

* css_set to init_css_set and dropping the old one.

1760

* css_set to init_css_set and dropping the old one.

1759

*/

1761

*/

1760

WARN_ON_ONCE(tsk->flags & PF_EXITING);

1762

WARN_ON_ONCE(tsk->flags & PF_EXITING);

1761

old_cset = task_css_set(tsk);

1763

old_cset = task_css_set(tsk);

1762

1764

1763

get_css_set(new_cset);

1765

get_css_set(new_cset);

1764

rcu_assign_pointer(tsk->cgroups, new_cset);

1766

rcu_assign_pointer(tsk->cgroups, new_cset);

1765

1767

1766

/*

1768

/*

1767

* Use move_tail so that cgroup_taskset_first() still returns the

1769

* Use move_tail so that cgroup_taskset_first() still returns the

1768

* leader after migration. This works because cgroup_migrate()

1770

* leader after migration. This works because cgroup_migrate()

1769

* ensures that the dst_cset of the leader is the first on the

1771

* ensures that the dst_cset of the leader is the first on the

1770

* tset's dst_csets list.

1772

* tset's dst_csets list.

1771

*/

1773

*/

1772

list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);

1774

list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);

1773

1775

1774

/*

1776

/*

1775

* We just gained a reference on old_cset by taking it from the

1777

* We just gained a reference on old_cset by taking it from the

1776

* task. As trading it for new_cset is protected by cgroup_mutex,

1778

* task. As trading it for new_cset is protected by cgroup_mutex,

1777

* we're safe to drop it here; it will be freed under RCU.

1779

* we're safe to drop it here; it will be freed under RCU.

1778

*/

1780

*/

1779

set_bit(CGRP_RELEASABLE, &old_cgrp->flags);

1781

set_bit(CGRP_RELEASABLE, &old_cgrp->flags);

1780

put_css_set_locked(old_cset, false);

1782

put_css_set_locked(old_cset, false);

1781

}

1783

}

1782

1784

1783

/**

1785

/**

1784

* cgroup_migrate_finish - cleanup after attach

1786

* cgroup_migrate_finish - cleanup after attach

1785

* @preloaded_csets: list of preloaded css_sets

1787

* @preloaded_csets: list of preloaded css_sets

1786

*

1788

*

1787

* Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See

1789

* Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See

1788

* those functions for details.

1790

* those functions for details.

1789

*/

1791

*/

1790

static void cgroup_migrate_finish(struct list_head *preloaded_csets)

1792

static void cgroup_migrate_finish(struct list_head *preloaded_csets)

1791

{

1793

{

1792

struct css_set *cset, *tmp_cset;

1794

struct css_set *cset, *tmp_cset;

1793

1795

1794

lockdep_assert_held(&cgroup_mutex);

1796

lockdep_assert_held(&cgroup_mutex);

1795

1797

1796

down_write(&css_set_rwsem);

1798

down_write(&css_set_rwsem);

1797

list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {

1799

list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {

1798

cset->mg_src_cgrp = NULL;

1800

cset->mg_src_cgrp = NULL;

1799

cset->mg_dst_cset = NULL;

1801

cset->mg_dst_cset = NULL;

1800

list_del_init(&cset->mg_preload_node);

1802

list_del_init(&cset->mg_preload_node);

1801

put_css_set_locked(cset, false);

1803

put_css_set_locked(cset, false);

1802

}

1804

}

1803

up_write(&css_set_rwsem);

1805

up_write(&css_set_rwsem);

1804

}

1806

}

1805

1807

1806

/**

1808

/**

1807

* cgroup_migrate_add_src - add a migration source css_set

1809

* cgroup_migrate_add_src - add a migration source css_set

1808

* @src_cset: the source css_set to add

1810

* @src_cset: the source css_set to add

1809

* @dst_cgrp: the destination cgroup

1811

* @dst_cgrp: the destination cgroup

1810

* @preloaded_csets: list of preloaded css_sets

1812

* @preloaded_csets: list of preloaded css_sets

1811

*

1813

*

1812

* Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin

1814

* Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin

1813

* @src_cset and add it to @preloaded_csets, which should later be cleaned

1815

* @src_cset and add it to @preloaded_csets, which should later be cleaned

1814

* up by cgroup_migrate_finish().

1816

* up by cgroup_migrate_finish().

1815

*

1817

*

1816

* This function may be called without holding threadgroup_lock even if the

1818

* This function may be called without holding threadgroup_lock even if the

1817

* target is a process. Threads may be created and destroyed but as long

1819

* target is a process. Threads may be created and destroyed but as long

1818

* as cgroup_mutex is not dropped, no new css_set can be put into play and

1820

* as cgroup_mutex is not dropped, no new css_set can be put into play and

1819

* the preloaded css_sets are guaranteed to cover all migrations.

1821

* the preloaded css_sets are guaranteed to cover all migrations.

1820

*/

1822

*/

1821

static void cgroup_migrate_add_src(struct css_set *src_cset,

1823

static void cgroup_migrate_add_src(struct css_set *src_cset,

1822

struct cgroup *dst_cgrp,

1824

struct cgroup *dst_cgrp,

1823

struct list_head *preloaded_csets)

1825

struct list_head *preloaded_csets)

1824

{

1826

{

1825

struct cgroup *src_cgrp;

1827

struct cgroup *src_cgrp;

1826

1828

1827

lockdep_assert_held(&cgroup_mutex);

1829

lockdep_assert_held(&cgroup_mutex);

1828

lockdep_assert_held(&css_set_rwsem);

1830

lockdep_assert_held(&css_set_rwsem);

1829

1831

1830

src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);

1832

src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);

1831

1833

1832

/* nothing to do if this cset already belongs to the cgroup */

1834

/* nothing to do if this cset already belongs to the cgroup */

1833

if (src_cgrp == dst_cgrp)

1835

if (src_cgrp == dst_cgrp)

1834

return;

1836

return;

1835

1837

1836

if (!list_empty(&src_cset->mg_preload_node))

1838

if (!list_empty(&src_cset->mg_preload_node))

1837

return;

1839

return;

1838

1840

1839

WARN_ON(src_cset->mg_src_cgrp);

1841

WARN_ON(src_cset->mg_src_cgrp);

1840

WARN_ON(!list_empty(&src_cset->mg_tasks));

1842

WARN_ON(!list_empty(&src_cset->mg_tasks));

1841

WARN_ON(!list_empty(&src_cset->mg_node));

1843

WARN_ON(!list_empty(&src_cset->mg_node));

1842

1844

1843

src_cset->mg_src_cgrp = src_cgrp;

1845

src_cset->mg_src_cgrp = src_cgrp;

1844

get_css_set(src_cset);

1846

get_css_set(src_cset);

1845

list_add(&src_cset->mg_preload_node, preloaded_csets);

1847

list_add(&src_cset->mg_preload_node, preloaded_csets);

1846

}

1848

}

1847

1849

1848

/**

1850

/**

1849

* cgroup_migrate_prepare_dst - prepare destination css_sets for migration

1851

* cgroup_migrate_prepare_dst - prepare destination css_sets for migration

1850

* @dst_cgrp: the destination cgroup

1852

* @dst_cgrp: the destination cgroup

1851

* @preloaded_csets: list of preloaded source css_sets

1853

* @preloaded_csets: list of preloaded source css_sets

1852

*

1854

*

1853

* Tasks are about to be moved to @dst_cgrp and all the source css_sets

1855

* Tasks are about to be moved to @dst_cgrp and all the source css_sets

1854

* have been preloaded to @preloaded_csets. This function looks up and

1856

* have been preloaded to @preloaded_csets. This function looks up and

1855

* pins all destination css_sets, links each to its source, and put them on

1857

* pins all destination css_sets, links each to its source, and put them on

1856

* @preloaded_csets.

1858

* @preloaded_csets.

1857

*

1859

*

1858

* This function must be called after cgroup_migrate_add_src() has been

1860

* This function must be called after cgroup_migrate_add_src() has been

1859

* called on each migration source css_set. After migration is performed

1861

* called on each migration source css_set. After migration is performed

1860

* using cgroup_migrate(), cgroup_migrate_finish() must be called on

1862

* using cgroup_migrate(), cgroup_migrate_finish() must be called on

1861

* @preloaded_csets.

1863

* @preloaded_csets.

1862

*/

1864

*/

1863

static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,

1865

static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,

1864

struct list_head *preloaded_csets)

1866

struct list_head *preloaded_csets)

1865

{

1867

{

1866

LIST_HEAD(csets);

1868

LIST_HEAD(csets);

1867

struct css_set *src_cset;

1869

struct css_set *src_cset;

1868

1870

1869

lockdep_assert_held(&cgroup_mutex);

1871

lockdep_assert_held(&cgroup_mutex);

1870

1872

1871

/* look up the dst cset for each src cset and link it to src */

1873

/* look up the dst cset for each src cset and link it to src */

1872

list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) {

1874

list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) {

1873

struct css_set *dst_cset;

1875

struct css_set *dst_cset;

1874

1876

1875

dst_cset = find_css_set(src_cset, dst_cgrp);

1877

dst_cset = find_css_set(src_cset, dst_cgrp);

1876

if (!dst_cset)

1878

if (!dst_cset)

1877

goto err;

1879

goto err;

1878

1880

1879

WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);

1881

WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);

1880

src_cset->mg_dst_cset = dst_cset;

1882

src_cset->mg_dst_cset = dst_cset;

1881

1883

1882

if (list_empty(&dst_cset->mg_preload_node))

1884

if (list_empty(&dst_cset->mg_preload_node))

1883

list_add(&dst_cset->mg_preload_node, &csets);

1885

list_add(&dst_cset->mg_preload_node, &csets);

1884

else

1886

else

1885

put_css_set(dst_cset, false);

1887

put_css_set(dst_cset, false);

1886

}

1888

}

1887

1889

1888

list_splice(&csets, preloaded_csets);

1890

list_splice(&csets, preloaded_csets);

1889

return 0;

1891

return 0;

1890

err:

1892

err:

1891

cgroup_migrate_finish(&csets);

1893

cgroup_migrate_finish(&csets);

1892

return -ENOMEM;

1894

return -ENOMEM;

1893

}

1895

}

1894

1896

1895

/**

1897

/**

1896

* cgroup_migrate - migrate a process or task to a cgroup

1898

* cgroup_migrate - migrate a process or task to a cgroup

1897

* @cgrp: the destination cgroup

1899

* @cgrp: the destination cgroup

1898

* @leader: the leader of the process or the task to migrate

1900

* @leader: the leader of the process or the task to migrate

1899

* @threadgroup: whether @leader points to the whole process or a single task

1901

* @threadgroup: whether @leader points to the whole process or a single task

1900

*

1902

*

1901

* Migrate a process or task denoted by @leader to @cgrp. If migrating a

1903

* Migrate a process or task denoted by @leader to @cgrp. If migrating a

1902

* process, the caller must be holding threadgroup_lock of @leader. The

1904

* process, the caller must be holding threadgroup_lock of @leader. The

1903

* caller is also responsible for invoking cgroup_migrate_add_src() and

1905

* caller is also responsible for invoking cgroup_migrate_add_src() and

1904

* cgroup_migrate_prepare_dst() on the targets before invoking this

1906

* cgroup_migrate_prepare_dst() on the targets before invoking this

1905

* function and following up with cgroup_migrate_finish().

1907

* function and following up with cgroup_migrate_finish().

1906

*

1908

*

1907

* As long as a controller's ->can_attach() doesn't fail, this function is

1909

* As long as a controller's ->can_attach() doesn't fail, this function is

1908

* guaranteed to succeed. This means that, excluding ->can_attach()

1910

* guaranteed to succeed. This means that, excluding ->can_attach()

1909

* failure, when migrating multiple targets, the success or failure can be

1911

* failure, when migrating multiple targets, the success or failure can be

1910

* decided for all targets by invoking group_migrate_prepare_dst() before

1912

* decided for all targets by invoking group_migrate_prepare_dst() before

1911

* actually starting migrating.

1913

* actually starting migrating.

1912

*/

1914

*/

1913

static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,

1915

static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,

1914

bool threadgroup)

1916

bool threadgroup)

1915

{

1917

{

1916

struct cgroup_taskset tset = {

1918

struct cgroup_taskset tset = {

1917

.src_csets = LIST_HEAD_INIT(tset.src_csets),

1919

.src_csets = LIST_HEAD_INIT(tset.src_csets),

1918

.dst_csets = LIST_HEAD_INIT(tset.dst_csets),

1920

.dst_csets = LIST_HEAD_INIT(tset.dst_csets),

1919

.csets = &tset.src_csets,

1921

.csets = &tset.src_csets,

1920

};

1922

};

1921

struct cgroup_subsys_state *css, *failed_css = NULL;

1923

struct cgroup_subsys_state *css, *failed_css = NULL;

1922

struct css_set *cset, *tmp_cset;

1924

struct css_set *cset, *tmp_cset;

1923

struct task_struct *task, *tmp_task;

1925

struct task_struct *task, *tmp_task;

1924

int i, ret;

1926

int i, ret;

1925

1927

1926

/*

1928

/*

1927

* Prevent freeing of tasks while we take a snapshot. Tasks that are

1929

* Prevent freeing of tasks while we take a snapshot. Tasks that are

1928

* already PF_EXITING could be freed from underneath us unless we

1930

* already PF_EXITING could be freed from underneath us unless we

1929

* take an rcu_read_lock.

1931

* take an rcu_read_lock.

1930

*/

1932

*/

1931

down_write(&css_set_rwsem);

1933

down_write(&css_set_rwsem);

1932

rcu_read_lock();

1934

rcu_read_lock();

1933

task = leader;

1935

task = leader;

1934

do {

1936

do {

1935

/* @task either already exited or can't exit until the end */

1937

/* @task either already exited or can't exit until the end */

1936

if (task->flags & PF_EXITING)

1938

if (task->flags & PF_EXITING)

1937

goto next;

1939

goto next;

1938

1940

1939

/* leave @task alone if post_fork() hasn't linked it yet */

1941

/* leave @task alone if post_fork() hasn't linked it yet */

1940

if (list_empty(&task->cg_list))

1942

if (list_empty(&task->cg_list))

1941

goto next;

1943

goto next;

1942

1944

1943

cset = task_css_set(task);

1945

cset = task_css_set(task);

1944

if (!cset->mg_src_cgrp)

1946

if (!cset->mg_src_cgrp)

1945

goto next;

1947

goto next;

1946

1948

1947

/*

1949

/*

1948

* cgroup_taskset_first() must always return the leader.

1950

* cgroup_taskset_first() must always return the leader.

1949

* Take care to avoid disturbing the ordering.

1951

* Take care to avoid disturbing the ordering.

1950

*/

1952

*/

1951

list_move_tail(&task->cg_list, &cset->mg_tasks);

1953

list_move_tail(&task->cg_list, &cset->mg_tasks);

1952

if (list_empty(&cset->mg_node))

1954

if (list_empty(&cset->mg_node))

1953

list_add_tail(&cset->mg_node, &tset.src_csets);

1955

list_add_tail(&cset->mg_node, &tset.src_csets);

1954

if (list_empty(&cset->mg_dst_cset->mg_node))

1956

if (list_empty(&cset->mg_dst_cset->mg_node))

1955

list_move_tail(&cset->mg_dst_cset->mg_node,

1957

list_move_tail(&cset->mg_dst_cset->mg_node,

1956

&tset.dst_csets);

1958

&tset.dst_csets);

1957

if (!threadgroup)

1960

if (!threadgroup)

1959

break;

1961

break;

1960

} while_each_thread(leader, task);

1962

} while_each_thread(leader, task);

1961

rcu_read_unlock();

1963

rcu_read_unlock();

1962

up_write(&css_set_rwsem);

1964

up_write(&css_set_rwsem);

1963

1965

1964

/* methods shouldn't be called if no task is actually migrating */

1966

/* methods shouldn't be called if no task is actually migrating */

1965

if (list_empty(&tset.src_csets))

1967

if (list_empty(&tset.src_csets))

1966

return 0;

1968

return 0;

1967

1969

1968

/* check that we can legitimately attach to the cgroup */

1970

/* check that we can legitimately attach to the cgroup */

1969

for_each_css(css, i, cgrp) {

1971

for_each_css(css, i, cgrp) {

1970

if (css->ss->can_attach) {

1972

if (css->ss->can_attach) {

1971

ret = css->ss->can_attach(css, &tset);

1973

ret = css->ss->can_attach(css, &tset);

1972

if (ret) {

1974

if (ret) {

1973

failed_css = css;

1975

failed_css = css;

1974

goto out_cancel_attach;

1976

goto out_cancel_attach;

1975

}

1977

}

1976

}

1978

}

1977

}

1979

}

1978

1980

1979

/*

1981

/*

1980

* Now that we're guaranteed success, proceed to move all tasks to

1982

* Now that we're guaranteed success, proceed to move all tasks to

1981

* the new cgroup. There are no failure cases after here, so this

1983

* the new cgroup. There are no failure cases after here, so this

1982

* is the commit point.

1984

* is the commit point.

1983

*/

1985

*/

1984

down_write(&css_set_rwsem);

1986

down_write(&css_set_rwsem);

1985

list_for_each_entry(cset, &tset.src_csets, mg_node) {

1987

list_for_each_entry(cset, &tset.src_csets, mg_node) {

1986

list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)

1988

list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)

1987

cgroup_task_migrate(cset->mg_src_cgrp, task,

1989

cgroup_task_migrate(cset->mg_src_cgrp, task,

1988

cset->mg_dst_cset);

1990

cset->mg_dst_cset);

1989

}

1991

}

1990

up_write(&css_set_rwsem);

1992

up_write(&css_set_rwsem);

1991

1993

1992

/*

1994

/*

1993

* Migration is committed, all target tasks are now on dst_csets.

1995

* Migration is committed, all target tasks are now on dst_csets.

1994

* Nothing is sensitive to fork() after this point. Notify

1996

* Nothing is sensitive to fork() after this point. Notify

1995

* controllers that migration is complete.

1997

* controllers that migration is complete.

1996

*/

1998

*/

1997

tset.csets = &tset.dst_csets;

1999

tset.csets = &tset.dst_csets;

1998

2000

1999

for_each_css(css, i, cgrp)

2001

for_each_css(css, i, cgrp)

2000

if (css->ss->attach)

2002

if (css->ss->attach)

2001

css->ss->attach(css, &tset);

2003

css->ss->attach(css, &tset);

2002

2004

2003

ret = 0;

2005

ret = 0;

2004

goto out_release_tset;

2006

goto out_release_tset;

2005

2007

2006

out_cancel_attach:

2008

out_cancel_attach:

2007

for_each_css(css, i, cgrp) {

2009

for_each_css(css, i, cgrp) {

2008

if (css == failed_css)

2010

if (css == failed_css)

2009

break;

2011

break;

2010

if (css->ss->cancel_attach)

2012

if (css->ss->cancel_attach)

2011

css->ss->cancel_attach(css, &tset);

2013

css->ss->cancel_attach(css, &tset);

2012

}

2014

}

2013

out_release_tset:

2015

out_release_tset:

2014

down_write(&css_set_rwsem);

2016

down_write(&css_set_rwsem);

2015

list_splice_init(&tset.dst_csets, &tset.src_csets);

2017

list_splice_init(&tset.dst_csets, &tset.src_csets);

2016

list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {

2018

list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {

2017

list_splice_tail_init(&cset->mg_tasks, &cset->tasks);

2019

list_splice_tail_init(&cset->mg_tasks, &cset->tasks);

2018

list_del_init(&cset->mg_node);

2020

list_del_init(&cset->mg_node);

2019

}

2021

}

2020

up_write(&css_set_rwsem);

2022

up_write(&css_set_rwsem);

2021

return ret;

2023

return ret;

2022

}

2024

}

2023

2025

2024

/**

2026

/**

2025

* cgroup_attach_task - attach a task or a whole threadgroup to a cgroup

2027

* cgroup_attach_task - attach a task or a whole threadgroup to a cgroup

2026

* @dst_cgrp: the cgroup to attach to

2028

* @dst_cgrp: the cgroup to attach to

2027

* @leader: the task or the leader of the threadgroup to be attached

2029

* @leader: the task or the leader of the threadgroup to be attached

2028

* @threadgroup: attach the whole threadgroup?

2030

* @threadgroup: attach the whole threadgroup?

2029

*

2031

*

2030

* Call holding cgroup_mutex and threadgroup_lock of @leader.

2032

* Call holding cgroup_mutex and threadgroup_lock of @leader.

2031

*/

2033

*/

2032

static int cgroup_attach_task(struct cgroup *dst_cgrp,

2034

static int cgroup_attach_task(struct cgroup *dst_cgrp,

2033

struct task_struct *leader, bool threadgroup)

2035

struct task_struct *leader, bool threadgroup)

2034

{

2036

{

2035

LIST_HEAD(preloaded_csets);

2037

LIST_HEAD(preloaded_csets);

2036

struct task_struct *task;

2038

struct task_struct *task;

2037

int ret;

2039

int ret;

2038

2040

2039

/* look up all src csets */

2041

/* look up all src csets */

2040

down_read(&css_set_rwsem);

2042

down_read(&css_set_rwsem);

2041

rcu_read_lock();

2043

rcu_read_lock();

2042

task = leader;

2044

task = leader;

2043

do {

2045

do {

2044

cgroup_migrate_add_src(task_css_set(task), dst_cgrp,

2046

cgroup_migrate_add_src(task_css_set(task), dst_cgrp,

2045

&preloaded_csets);

2047

&preloaded_csets);

2046

if (!threadgroup)

2048

if (!threadgroup)

2047

break;

2049

break;

2048

} while_each_thread(leader, task);

2050

} while_each_thread(leader, task);

2049

rcu_read_unlock();

2051

rcu_read_unlock();

2050

up_read(&css_set_rwsem);

2052

up_read(&css_set_rwsem);

2051

2053

2052

/* prepare dst csets and commit */

2054

/* prepare dst csets and commit */

2053

ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);

2055

ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);

2054

if (!ret)

2056

if (!ret)

2055

ret = cgroup_migrate(dst_cgrp, leader, threadgroup);

2057

ret = cgroup_migrate(dst_cgrp, leader, threadgroup);

2056

2058

2057

cgroup_migrate_finish(&preloaded_csets);

2059

cgroup_migrate_finish(&preloaded_csets);

2058

return ret;

2060

return ret;

2059

}

2061

}

2060

2062

2061

/*

2063

/*

2062

* Find the task_struct of the task to attach by vpid and pass it along to the

2064

* Find the task_struct of the task to attach by vpid and pass it along to the

2063

* function to attach either it or all tasks in its threadgroup. Will lock

2065

* function to attach either it or all tasks in its threadgroup. Will lock

2064

* cgroup_mutex and threadgroup.

2066

* cgroup_mutex and threadgroup.

2065

*/

2067

*/

2066

static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)

2068

static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)

2067

{

2069

{

2068

struct task_struct *tsk;

2070

struct task_struct *tsk;

2069

const struct cred *cred = current_cred(), *tcred;

2071

const struct cred *cred = current_cred(), *tcred;

2070

int ret;

2072

int ret;

2071

2073

2072

if (!cgroup_lock_live_group(cgrp))

2074

if (!cgroup_lock_live_group(cgrp))

2073

return -ENODEV;

2075

return -ENODEV;

2074

2076

2075

retry_find_task:

2077

retry_find_task:

2076

rcu_read_lock();

2078

rcu_read_lock();

2077

if (pid) {

2079

if (pid) {

2078

tsk = find_task_by_vpid(pid);

2080

tsk = find_task_by_vpid(pid);

2079

if (!tsk) {

2081

if (!tsk) {

2080

rcu_read_unlock();

2082

rcu_read_unlock();

2081

ret = -ESRCH;

2083

ret = -ESRCH;

2082

goto out_unlock_cgroup;

2084

goto out_unlock_cgroup;

2083

}

2085

}

2084

/*

2086

/*

2085

* even if we're attaching all tasks in the thread group, we

2087

* even if we're attaching all tasks in the thread group, we

2086

* only need to check permissions on one of them.

2088

* only need to check permissions on one of them.

2087

*/

2089

*/

2088

tcred = __task_cred(tsk);

2090

tcred = __task_cred(tsk);

2089

if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&

2091

if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&

2090

!uid_eq(cred->euid, tcred->uid) &&

2092

!uid_eq(cred->euid, tcred->uid) &&

2091

!uid_eq(cred->euid, tcred->suid)) {

2093

!uid_eq(cred->euid, tcred->suid)) {

2092

rcu_read_unlock();

2094

rcu_read_unlock();

2093

ret = -EACCES;

2095

ret = -EACCES;

2094

goto out_unlock_cgroup;

2096

goto out_unlock_cgroup;

2095

}

2097

}

2096

} else

2098

} else

2097

tsk = current;

2099

tsk = current;

2098

2100

2099

if (threadgroup)

2101

if (threadgroup)

2100

tsk = tsk->group_leader;

2102

tsk = tsk->group_leader;

2101

2103

2102

/*

2104

/*

2103

* Workqueue threads may acquire PF_NO_SETAFFINITY and become

2105

* Workqueue threads may acquire PF_NO_SETAFFINITY and become

2104

* trapped in a cpuset, or RT worker may be born in a cgroup

2106

* trapped in a cpuset, or RT worker may be born in a cgroup

2105

* with no rt_runtime allocated. Just say no.

2107

* with no rt_runtime allocated. Just say no.

2106

*/

2108

*/

2107

if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {

2109

if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {

2108

ret = -EINVAL;

2110

ret = -EINVAL;

2109

rcu_read_unlock();

2111

rcu_read_unlock();

2110

goto out_unlock_cgroup;

2112

goto out_unlock_cgroup;

2111

}

2113

}

2112

2114

2113

get_task_struct(tsk);

2115

get_task_struct(tsk);

2114

rcu_read_unlock();

2116

rcu_read_unlock();

2115

2117

2116

threadgroup_lock(tsk);

2118

threadgroup_lock(tsk);

2117

if (threadgroup) {

2119

if (threadgroup) {

2118

if (!thread_group_leader(tsk)) {

2120

if (!thread_group_leader(tsk)) {

2119

/*

2121

/*

2120

* a race with de_thread from another thread's exec()

2122

* a race with de_thread from another thread's exec()

2121

* may strip us of our leadership, if this happens,

2123

* may strip us of our leadership, if this happens,

2122

* there is no choice but to throw this task away and

2124

* there is no choice but to throw this task away and

2123

* try again; this is

2125

* try again; this is

2124

* "double-double-toil-and-trouble-check locking".

2126

* "double-double-toil-and-trouble-check locking".

2125

*/

2127

*/

2126

threadgroup_unlock(tsk);

2128

threadgroup_unlock(tsk);

2127

put_task_struct(tsk);

2129

put_task_struct(tsk);

2128

goto retry_find_task;

2130

goto retry_find_task;

2129

}

2131

}

2130

}

2132

}

2131

2133

2132

ret = cgroup_attach_task(cgrp, tsk, threadgroup);

2134

ret = cgroup_attach_task(cgrp, tsk, threadgroup);

2133

2135

2134

threadgroup_unlock(tsk);

2136

threadgroup_unlock(tsk);

2135

2137

2136

put_task_struct(tsk);

2138

put_task_struct(tsk);

2137

out_unlock_cgroup:

2139

out_unlock_cgroup:

2138

mutex_unlock(&cgroup_mutex);

2140

mutex_unlock(&cgroup_mutex);

2139

return ret;

2141

return ret;

2140

}

2142

}

2141

2143

2142

/**

2144

/**

2143

* cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'

2145

* cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'

2144

* @from: attach to all cgroups of a given task

2146

* @from: attach to all cgroups of a given task

2145

* @tsk: the task to be attached

2147

* @tsk: the task to be attached

2146

*/

2148

*/

2147

int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)

2149

int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)

2148

{

2150

{

2149

struct cgroup_root *root;

2151

struct cgroup_root *root;

2150

int retval = 0;

2152

int retval = 0;

2151

2153

2152

mutex_lock(&cgroup_mutex);

2154

mutex_lock(&cgroup_mutex);

2153

for_each_root(root) {

2155

for_each_root(root) {

2154

struct cgroup *from_cgrp;

2156

struct cgroup *from_cgrp;

2155

2157

2156

if (root == &cgrp_dfl_root)

2158

if (root == &cgrp_dfl_root)

2157

continue;

2159

continue;

2158

2160

2159

down_read(&css_set_rwsem);

2161

down_read(&css_set_rwsem);

2160

from_cgrp = task_cgroup_from_root(from, root);

2162

from_cgrp = task_cgroup_from_root(from, root);

2161

up_read(&css_set_rwsem);

2163

up_read(&css_set_rwsem);

2162

2164

2163

retval = cgroup_attach_task(from_cgrp, tsk, false);

2165

retval = cgroup_attach_task(from_cgrp, tsk, false);

2164

if (retval)

2166

if (retval)

2165

break;

2167

break;

2166

}

2168

}

2167

mutex_unlock(&cgroup_mutex);

2169

mutex_unlock(&cgroup_mutex);

2168

2170

2169

return retval;

2171

return retval;

2170

}

2172

}

2171

EXPORT_SYMBOL_GPL(cgroup_attach_task_all);

2173

EXPORT_SYMBOL_GPL(cgroup_attach_task_all);

2172

2174

2173

/* write_u64 handler: attach only the single task identified by @pid */
static int cgroup_tasks_write(struct cgroup_subsys_state *css,
			      struct cftype *cft, u64 pid)
{
	struct cgroup *cgrp = css->cgroup;

	return attach_task_by_pid(cgrp, pid, false);
}

2178

2180

2179

/* write_u64 handler: attach the whole threadgroup identified by @tgid */
static int cgroup_procs_write(struct cgroup_subsys_state *css,
			      struct cftype *cft, u64 tgid)
{
	struct cgroup *cgrp = css->cgroup;

	return attach_task_by_pid(cgrp, tgid, true);
}

2184

2186

2185

static int cgroup_release_agent_write(struct cgroup_subsys_state *css,

2187

static int cgroup_release_agent_write(struct cgroup_subsys_state *css,

2186

struct cftype *cft, char *buffer)

2188

struct cftype *cft, char *buffer)

2187

{

2189

{

2188

struct cgroup_root *root = css->cgroup->root;

2190

struct cgroup_root *root = css->cgroup->root;

2189

2191

2190

BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX);

2192

BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX);

2191

if (!cgroup_lock_live_group(css->cgroup))

2193

if (!cgroup_lock_live_group(css->cgroup))

2192

return -ENODEV;

2194

return -ENODEV;

2193

spin_lock(&release_agent_path_lock);

2195

spin_lock(&release_agent_path_lock);

2194

strlcpy(root->release_agent_path, buffer,

2196

strlcpy(root->release_agent_path, buffer,

2195

sizeof(root->release_agent_path));

2197

sizeof(root->release_agent_path));

2196

spin_unlock(&release_agent_path_lock);

2198

spin_unlock(&release_agent_path_lock);

2197

mutex_unlock(&cgroup_mutex);

2199

mutex_unlock(&cgroup_mutex);

2198

return 0;

2200

return 0;

2199

}

2201

}

2200

2202

2201

static int cgroup_release_agent_show(struct seq_file *seq, void *v)

2203

static int cgroup_release_agent_show(struct seq_file *seq, void *v)

2202

{

2204

{

2203

struct cgroup *cgrp = seq_css(seq)->cgroup;

2205

struct cgroup *cgrp = seq_css(seq)->cgroup;

2204

2206

2205

if (!cgroup_lock_live_group(cgrp))

2207

if (!cgroup_lock_live_group(cgrp))

2206

return -ENODEV;

2208

return -ENODEV;

2207

seq_puts(seq, cgrp->root->release_agent_path);

2209

seq_puts(seq, cgrp->root->release_agent_path);

2208

seq_putc(seq, '\n');

2210

seq_putc(seq, '\n');

2209

mutex_unlock(&cgroup_mutex);

2211

mutex_unlock(&cgroup_mutex);

2210

return 0;

2212

return 0;

2211

}

2213

}

2212

2214

2213

static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)

2215

static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)

2214

{

2216

{

2215

struct cgroup *cgrp = seq_css(seq)->cgroup;

2217

struct cgroup *cgrp = seq_css(seq)->cgroup;

2216

2218

2217

seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));

2219

seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));

2218

return 0;

2220

return 0;

2219

}

2221

}

2220

2222

2221

static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,

2223

static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,

2222

size_t nbytes, loff_t off)

2224

size_t nbytes, loff_t off)

2223

{

2225

{

2224

struct cgroup *cgrp = of->kn->parent->priv;

2226

struct cgroup *cgrp = of->kn->parent->priv;

2225

struct cftype *cft = of->kn->priv;

2227

struct cftype *cft = of->kn->priv;

2226

struct cgroup_subsys_state *css;

2228

struct cgroup_subsys_state *css;

2227

int ret;

2229

int ret;

2228

2230

2229

/*

2231

/*

2230

* kernfs guarantees that a file isn't deleted with operations in

2232

* kernfs guarantees that a file isn't deleted with operations in

2231

* flight, which means that the matching css is and stays alive and

2233

* flight, which means that the matching css is and stays alive and

2232

* doesn't need to be pinned. The RCU locking is not necessary

2234

* doesn't need to be pinned. The RCU locking is not necessary

2233

* either. It's just for the convenience of using cgroup_css().

2235

* either. It's just for the convenience of using cgroup_css().

2234

*/

2236

*/

2235

rcu_read_lock();

2237

rcu_read_lock();

2236

css = cgroup_css(cgrp, cft->ss);

2238

css = cgroup_css(cgrp, cft->ss);

2237

rcu_read_unlock();

2239

rcu_read_unlock();

2238

2240

2239

if (cft->write_string) {

2241

if (cft->write_string) {

2240

ret = cft->write_string(css, cft, strstrip(buf));

2242

ret = cft->write_string(css, cft, strstrip(buf));

2241

} else if (cft->write_u64) {

2243

} else if (cft->write_u64) {

2242

unsigned long long v;

2244

unsigned long long v;

2243

ret = kstrtoull(buf, 0, &v);

2245

ret = kstrtoull(buf, 0, &v);

2244

if (!ret)

2246

if (!ret)

2245

ret = cft->write_u64(css, cft, v);

2247

ret = cft->write_u64(css, cft, v);

2246

} else if (cft->write_s64) {

2248

} else if (cft->write_s64) {

2247

long long v;

2249

long long v;

2248

ret = kstrtoll(buf, 0, &v);

2250

ret = kstrtoll(buf, 0, &v);

2249

if (!ret)

2251

if (!ret)

2250

ret = cft->write_s64(css, cft, v);

2252

ret = cft->write_s64(css, cft, v);

2251

} else if (cft->trigger) {

2253

} else if (cft->trigger) {

2252

ret = cft->trigger(css, (unsigned int)cft->private);

2254

ret = cft->trigger(css, (unsigned int)cft->private);

2253

} else {

2255

} else {

2254

ret = -EINVAL;

2256

ret = -EINVAL;

2255

}

2257

}

2256

2258

2257

return ret ?: nbytes;

2259

return ret ?: nbytes;

2258

}

2260

}

2259

2261

2260

/* seq_file ->start: forward to the cftype's seq_start callback */
static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
{
	struct cftype *cft = seq_cft(seq);

	return cft->seq_start(seq, ppos);
}

2264

2266

2265

/* seq_file ->next: forward to the cftype's seq_next callback */
static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
{
	struct cftype *cft = seq_cft(seq);

	return cft->seq_next(seq, v, ppos);
}

2269

2271

2270

static void cgroup_seqfile_stop(struct seq_file *seq, void *v)

2272

static void cgroup_seqfile_stop(struct seq_file *seq, void *v)

2271

{

2273

{

2272

seq_cft(seq)->seq_stop(seq, v);

2274

seq_cft(seq)->seq_stop(seq, v);

2273

}

2275

}

2274

2276

2275

static int cgroup_seqfile_show(struct seq_file *m, void *arg)

2277

static int cgroup_seqfile_show(struct seq_file *m, void *arg)

2276

{

2278

{

2277

struct cftype *cft = seq_cft(m);

2279

struct cftype *cft = seq_cft(m);

2278

struct cgroup_subsys_state *css = seq_css(m);

2280

struct cgroup_subsys_state *css = seq_css(m);

2279

2281

2280

if (cft->seq_show)

2282

if (cft->seq_show)

2281

return cft->seq_show(m, arg);

2283

return cft->seq_show(m, arg);

2282

2284

2283

if (cft->read_u64)

2285

if (cft->read_u64)

2284

seq_printf(m, "%llu\n", cft->read_u64(css, cft));

2286

seq_printf(m, "%llu\n", cft->read_u64(css, cft));

2285

else if (cft->read_s64)

2287

else if (cft->read_s64)

2286

seq_printf(m, "%lld\n", cft->read_s64(css, cft));

2288

seq_printf(m, "%lld\n", cft->read_s64(css, cft));

2287

else

2289

else

2288

return -EINVAL;

2290

return -EINVAL;

2289

return 0;

2291

return 0;

2290

}

2292

}

2291

2293

2292

static struct kernfs_ops cgroup_kf_single_ops = {

2294

static struct kernfs_ops cgroup_kf_single_ops = {

2293

.atomic_write_len = PAGE_SIZE,

2295

.atomic_write_len = PAGE_SIZE,

2294

.write = cgroup_file_write,

2296

.write = cgroup_file_write,

2295

.seq_show = cgroup_seqfile_show,

2297

.seq_show = cgroup_seqfile_show,

2296

};

2298

};

2297

2299

2298

static struct kernfs_ops cgroup_kf_ops = {

2300

static struct kernfs_ops cgroup_kf_ops = {

2299

.atomic_write_len = PAGE_SIZE,

2301

.atomic_write_len = PAGE_SIZE,

2300

.write = cgroup_file_write,

2302

.write = cgroup_file_write,

2301

.seq_start = cgroup_seqfile_start,

2303

.seq_start = cgroup_seqfile_start,

2302

.seq_next = cgroup_seqfile_next,

2304

.seq_next = cgroup_seqfile_next,

2303

.seq_stop = cgroup_seqfile_stop,

2305

.seq_stop = cgroup_seqfile_stop,

2304

.seq_show = cgroup_seqfile_show,

2306

.seq_show = cgroup_seqfile_show,

2305

};

2307

};

2306

2308

2307

/*

2309

/*

2308

* cgroup_rename - Only allow simple rename of directories in place.

2310

* cgroup_rename - Only allow simple rename of directories in place.

2309

*/

2311

*/

2310

static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,

2312

static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,

2311

const char *new_name_str)

2313

const char *new_name_str)

2312

{

2314

{

2313

struct cgroup *cgrp = kn->priv;

2315

struct cgroup *cgrp = kn->priv;

2314

int ret;

2316

int ret;

2315

2317

2316

if (kernfs_type(kn) != KERNFS_DIR)

2318

if (kernfs_type(kn) != KERNFS_DIR)

2317

return -ENOTDIR;

2319

return -ENOTDIR;

2318

if (kn->parent != new_parent)

2320

if (kn->parent != new_parent)

2319

return -EIO;

2321

return -EIO;

2320

2322

2321

/*

2323

/*

2322

* This isn't a proper migration and its usefulness is very

2324

* This isn't a proper migration and its usefulness is very

2323

* limited. Disallow if sane_behavior.

2325

* limited. Disallow if sane_behavior.

2324

*/

2326

*/

2325

if (cgroup_sane_behavior(cgrp))

2327

if (cgroup_sane_behavior(cgrp))

2326

return -EPERM;

2328

return -EPERM;

2327

2329

2328

/*

2330

/*

2329

* We're gonna grab cgroup_tree_mutex which nests outside kernfs

2331

* We're gonna grab cgroup_tree_mutex which nests outside kernfs

2330

* active_ref. kernfs_rename() doesn't require active_ref

2332

* active_ref. kernfs_rename() doesn't require active_ref

2331

* protection. Break them before grabbing cgroup_tree_mutex.

2333

* protection. Break them before grabbing cgroup_tree_mutex.

2332

*/

2334

*/

2333

kernfs_break_active_protection(new_parent);

2335

kernfs_break_active_protection(new_parent);

2334

kernfs_break_active_protection(kn);

2336

kernfs_break_active_protection(kn);

2335

2337

2336

mutex_lock(&cgroup_tree_mutex);

2338

mutex_lock(&cgroup_tree_mutex);

2337

mutex_lock(&cgroup_mutex);

2339

mutex_lock(&cgroup_mutex);

2338

2340

2339

ret = kernfs_rename(kn, new_parent, new_name_str);

2341

ret = kernfs_rename(kn, new_parent, new_name_str);

2340

2342

2341

mutex_unlock(&cgroup_mutex);

2343

mutex_unlock(&cgroup_mutex);

2342

mutex_unlock(&cgroup_tree_mutex);

2344

mutex_unlock(&cgroup_tree_mutex);

2343

2345

2344

kernfs_unbreak_active_protection(kn);

2346

kernfs_unbreak_active_protection(kn);

2345

kernfs_unbreak_active_protection(new_parent);

2347

kernfs_unbreak_active_protection(new_parent);

2346

return ret;

2348

return ret;

2347

}

2349

}

2348

2350

2349

/* set uid and gid of cgroup dirs and files to that of the creator */

2351

/* set uid and gid of cgroup dirs and files to that of the creator */

2350

static int cgroup_kn_set_ugid(struct kernfs_node *kn)

2352

static int cgroup_kn_set_ugid(struct kernfs_node *kn)

2351

{

2353

{

2352

struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,

2354

struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,

2353

.ia_uid = current_fsuid(),

2355

.ia_uid = current_fsuid(),

2354

.ia_gid = current_fsgid(), };

2356

.ia_gid = current_fsgid(), };

2355

2357

2356

if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&

2358

if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&

2357

gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))

2359

gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))

2358

return 0;

2360

return 0;

2359

2361

2360

return kernfs_setattr(kn, &iattr);

2362

return kernfs_setattr(kn, &iattr);

2361

}

2363

}

2362

2364

2363

static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)

2365

static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)

2364

{

2366

{

2365

char name[CGROUP_FILE_NAME_MAX];

2367

char name[CGROUP_FILE_NAME_MAX];

2366

struct kernfs_node *kn;

2368

struct kernfs_node *kn;

2367

struct lock_class_key *key = NULL;

2369

struct lock_class_key *key = NULL;

2368

int ret;

2370

int ret;

2369

2371

2370

#ifdef CONFIG_DEBUG_LOCK_ALLOC

2372

#ifdef CONFIG_DEBUG_LOCK_ALLOC

2371

key = &cft->lockdep_key;

2373

key = &cft->lockdep_key;

2372

#endif

2374

#endif

2373

kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),

2375

kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),

2374

cgroup_file_mode(cft), 0, cft->kf_ops, cft,

2376

cgroup_file_mode(cft), 0, cft->kf_ops, cft,

2375

NULL, false, key);

2377

NULL, false, key);

2376

if (IS_ERR(kn))

2378

if (IS_ERR(kn))

2377

return PTR_ERR(kn);

2379

return PTR_ERR(kn);

2378

2380

2379

ret = cgroup_kn_set_ugid(kn);

2381

ret = cgroup_kn_set_ugid(kn);

2380

if (ret)

2382

if (ret)

2381

kernfs_remove(kn);

2383

kernfs_remove(kn);

2382

return ret;

2384

return ret;

2383

}

2385

}

2384

2386

2385

/**

2387

/**

2386

* cgroup_addrm_files - add or remove files to a cgroup directory

2388

* cgroup_addrm_files - add or remove files to a cgroup directory

2387

* @cgrp: the target cgroup

2389

* @cgrp: the target cgroup

2388

* @cfts: array of cftypes to be added

2390

* @cfts: array of cftypes to be added

2389

* @is_add: whether to add or remove

2391

* @is_add: whether to add or remove

2390

*

2392

*

2391

* Depending on @is_add, add or remove files defined by @cfts on @cgrp.

2393

* Depending on @is_add, add or remove files defined by @cfts on @cgrp.

2392

* For removals, this function never fails. If addition fails, this

2394

* For removals, this function never fails. If addition fails, this

2393

* function doesn't remove files already added. The caller is responsible

2395

* function doesn't remove files already added. The caller is responsible

2394

* for cleaning up.

2396

* for cleaning up.

2395

*/

2397

*/

2396

static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],

2398

static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],

2397

bool is_add)

2399

bool is_add)

2398

{

2400

{

2399

struct cftype *cft;

2401

struct cftype *cft;

2400

int ret;

2402

int ret;

2401

2403

2402

lockdep_assert_held(&cgroup_tree_mutex);

2404

lockdep_assert_held(&cgroup_tree_mutex);

2403

2405

2404

for (cft = cfts; cft->name[0] != '\0'; cft++) {

2406

for (cft = cfts; cft->name[0] != '\0'; cft++) {

2405

/* does cft->flags tell us to skip this file on @cgrp? */

2407

/* does cft->flags tell us to skip this file on @cgrp? */

2406

if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))

2408

if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))

2407

continue;

2409

continue;

2408

if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))

2410

if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))

2409

continue;

2411

continue;

2410

if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)

2412

if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)

2411

continue;

2413

continue;

2412

if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)

2414

if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)

2413

continue;

2415

continue;

2414

2416

2415

if (is_add) {

2417

if (is_add) {

2416

ret = cgroup_add_file(cgrp, cft);

2418

ret = cgroup_add_file(cgrp, cft);

2417

if (ret) {

2419

if (ret) {

2418

pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",

2420

pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",

2419

cft->name, ret);

2421

cft->name, ret);

2420

return ret;

2422

return ret;

2421

}

2423

}

2422

} else {

2424

} else {

2423

cgroup_rm_file(cgrp, cft);

2425

cgroup_rm_file(cgrp, cft);

2424

}

2426

}

2425

}

2427

}

2426

return 0;

2428

return 0;

2427

}

2429

}

2428

2430

2429

/*
 * Add or remove the files described by @cfts on every live cgroup in the
 * hierarchy that @cfts' subsystem is attached to.
 *
 * @cfts: cftype array; the subsystem is taken from its first entry
 * @is_add: whether to add or remove
 *
 * Does nothing and returns 0 when the subsystem sits on the default root.
 * Stops at the first cgroup_addrm_files() failure and returns its error.
 * After a fully successful addition the root's kernfs node is activated.
 * Must be called with cgroup_tree_mutex held.
 */
static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
	struct cgroup_subsys *ss = cfts[0].ss;
	struct cgroup *root = &ss->root->cgrp;
	struct cgroup_subsys_state *css;
	int ret = 0;

	lockdep_assert_held(&cgroup_tree_mutex);

	/* don't bother if @ss isn't attached */
	if (ss->root == &cgrp_dfl_root)
		return 0;

	/* add/rm files for all cgroups created before */
	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
		struct cgroup *cgrp = css->cgroup;

		if (cgroup_is_dead(cgrp))
			continue;

		ret = cgroup_addrm_files(cgrp, cfts, is_add);
		if (ret)
			break;
	}

	if (is_add && !ret)
		kernfs_activate(root->kn);
	return ret;
}

2459

2461

2460

static void cgroup_exit_cftypes(struct cftype *cfts)

2462

static void cgroup_exit_cftypes(struct cftype *cfts)

2461

{

2463

{

2462

struct cftype *cft;

2464

struct cftype *cft;

2463

2465

2464

for (cft = cfts; cft->name[0] != '\0'; cft++) {

2466

for (cft = cfts; cft->name[0] != '\0'; cft++) {

2465

/* free copy for custom atomic_write_len, see init_cftypes() */

2467

/* free copy for custom atomic_write_len, see init_cftypes() */

2466

if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)

2468

if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)

2467

kfree(cft->kf_ops);

2469

kfree(cft->kf_ops);

2468

cft->kf_ops = NULL;

2470

cft->kf_ops = NULL;

2469

cft->ss = NULL;

2471

cft->ss = NULL;

2470

}

2472

}

2471

}

2473

}

2472

2474

2473

static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)

2475

static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)

2474

{

2476

{

2475

struct cftype *cft;

2477

struct cftype *cft;

2476

2478

2477

for (cft = cfts; cft->name[0] != '\0'; cft++) {

2479

for (cft = cfts; cft->name[0] != '\0'; cft++) {

2478

struct kernfs_ops *kf_ops;

2480

struct kernfs_ops *kf_ops;

2479

2481

2480

WARN_ON(cft->ss || cft->kf_ops);

2482

WARN_ON(cft->ss || cft->kf_ops);

2481

2483

2482

if (cft->seq_start)

2484

if (cft->seq_start)

2483

kf_ops = &cgroup_kf_ops;

2485

kf_ops = &cgroup_kf_ops;

2484

else

2486

else

2485

kf_ops = &cgroup_kf_single_ops;

2487

kf_ops = &cgroup_kf_single_ops;

2486

2488

2487

/*

2489

/*

2488

* Ugh... if @cft wants a custom max_write_len, we need to

2490

* Ugh... if @cft wants a custom max_write_len, we need to

2489

* make a copy of kf_ops to set its atomic_write_len.

2491

* make a copy of kf_ops to set its atomic_write_len.

2490

*/

2492

*/

2491

if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {

2493

if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {

2492

kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);

2494

kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);

2493

if (!kf_ops) {

2495

if (!kf_ops) {

2494

cgroup_exit_cftypes(cfts);

2496

cgroup_exit_cftypes(cfts);

2495

return -ENOMEM;

2497

return -ENOMEM;

2496

}

2498

}

2497

kf_ops->atomic_write_len = cft->max_write_len;

2499

kf_ops->atomic_write_len = cft->max_write_len;

2498

}

2500

}

2499

2501

2500

cft->kf_ops = kf_ops;

2502

cft->kf_ops = kf_ops;

2501

cft->ss = ss;

2503

cft->ss = ss;

2502

}

2504

}

2503

2505

2504

return 0;

2506

return 0;

2505

}

2507

}

2506

2508

2507

static int cgroup_rm_cftypes_locked(struct cftype *cfts)

2509

static int cgroup_rm_cftypes_locked(struct cftype *cfts)

2508

{

2510

{

2509

lockdep_assert_held(&cgroup_tree_mutex);

2511

lockdep_assert_held(&cgroup_tree_mutex);

2510

2512

2511

if (!cfts || !cfts[0].ss)

2513

if (!cfts || !cfts[0].ss)

2512

return -ENOENT;

2514

return -ENOENT;

2513

2515

2514

list_del(&cfts->node);

2516

list_del(&cfts->node);

2515

cgroup_apply_cftypes(cfts, false);

2517

cgroup_apply_cftypes(cfts, false);

2516

cgroup_exit_cftypes(cfts);

2518

cgroup_exit_cftypes(cfts);

2517

return 0;

2519

return 0;

2518

}

2520

}

2519

2521

2520

/**

2522

/**

2521

* cgroup_rm_cftypes - remove an array of cftypes from a subsystem

2523

* cgroup_rm_cftypes - remove an array of cftypes from a subsystem

2522

* @cfts: zero-length name terminated array of cftypes

2524

* @cfts: zero-length name terminated array of cftypes

2523

*

2525

*

2524

* Unregister @cfts. Files described by @cfts are removed from all

2526

* Unregister @cfts. Files described by @cfts are removed from all

2525

* existing cgroups and all future cgroups won't have them either. This

2527

* existing cgroups and all future cgroups won't have them either. This

2526

* function can be called anytime whether @cfts' subsys is attached or not.

2528

* function can be called anytime whether @cfts' subsys is attached or not.

2527

*

2529

*

2528

* Returns 0 on successful unregistration, -ENOENT if @cfts is not

2530

* Returns 0 on successful unregistration, -ENOENT if @cfts is not

2529

* registered.

2531

* registered.

2530

*/

2532

*/

2531

int cgroup_rm_cftypes(struct cftype *cfts)

2533

int cgroup_rm_cftypes(struct cftype *cfts)

2532

{

2534

{

2533

int ret;

2535

int ret;

2534

2536

2535

mutex_lock(&cgroup_tree_mutex);

2537

mutex_lock(&cgroup_tree_mutex);

2536

ret = cgroup_rm_cftypes_locked(cfts);

2538

ret = cgroup_rm_cftypes_locked(cfts);

2537

mutex_unlock(&cgroup_tree_mutex);

2539

mutex_unlock(&cgroup_tree_mutex);

2538

return ret;

2540

return ret;

2539

}

2541

}

2540

2542

2541

/**

2543

/**

2542

* cgroup_add_cftypes - add an array of cftypes to a subsystem

2544

* cgroup_add_cftypes - add an array of cftypes to a subsystem

2543

* @ss: target cgroup subsystem

2545

* @ss: target cgroup subsystem

2544

* @cfts: zero-length name terminated array of cftypes

2546

* @cfts: zero-length name terminated array of cftypes

2545

*

2547

*

2546

* Register @cfts to @ss. Files described by @cfts are created for all

2548

* Register @cfts to @ss. Files described by @cfts are created for all

2547

* existing cgroups to which @ss is attached and all future cgroups will

2549

* existing cgroups to which @ss is attached and all future cgroups will

2548

* have them too. This function can be called anytime whether @ss is

2550

* have them too. This function can be called anytime whether @ss is

2549

* attached or not.

2551

* attached or not.

2550

*

2552

*

2551

* Returns 0 on successful registration, -errno on failure. Note that this

2553

* Returns 0 on successful registration, -errno on failure. Note that this

2552

* function currently returns 0 as long as @cfts registration is successful

2554

* function currently returns 0 as long as @cfts registration is successful

2553

* even if some file creation attempts on existing cgroups fail.

2555

* even if some file creation attempts on existing cgroups fail.

2554

*/

2556

*/

2555

int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)

2557

int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)

2556

{

2558

{

2557

int ret;

2559

int ret;

2558

2560

2559

if (!cfts || cfts[0].name[0] == '\0')

2561

if (!cfts || cfts[0].name[0] == '\0')

2560

return 0;

2562

return 0;

2561

2563

2562

ret = cgroup_init_cftypes(ss, cfts);

2564

ret = cgroup_init_cftypes(ss, cfts);

2563

if (ret)

2565

if (ret)

2564

return ret;

2566

return ret;

2565

2567

2566

mutex_lock(&cgroup_tree_mutex);

2568

mutex_lock(&cgroup_tree_mutex);

2567

2569

2568

list_add_tail(&cfts->node, &ss->cfts);

2570

list_add_tail(&cfts->node, &ss->cfts);

2569

ret = cgroup_apply_cftypes(cfts, true);

2571

ret = cgroup_apply_cftypes(cfts, true);

2570

if (ret)

2572

if (ret)

2571

cgroup_rm_cftypes_locked(cfts);

2573

cgroup_rm_cftypes_locked(cfts);

2572

2574

2573

mutex_unlock(&cgroup_tree_mutex);

2575

mutex_unlock(&cgroup_tree_mutex);

2574

return ret;

2576

return ret;

2575

}

2577

}

2576

2578

2577

/**

2579

/**

2578

* cgroup_task_count - count the number of tasks in a cgroup.

2580

* cgroup_task_count - count the number of tasks in a cgroup.

2579

* @cgrp: the cgroup in question

2581

* @cgrp: the cgroup in question

2580

*

2582

*

2581

* Return the number of tasks in the cgroup.

2583

* Return the number of tasks in the cgroup.

2582

*/

2584

*/

2583

static int cgroup_task_count(const struct cgroup *cgrp)

2585

static int cgroup_task_count(const struct cgroup *cgrp)

2584

{

2586

{

2585

int count = 0;

2587

int count = 0;

2586

struct cgrp_cset_link *link;

2588

struct cgrp_cset_link *link;

2587

2589

2588

down_read(&css_set_rwsem);

2590

down_read(&css_set_rwsem);

2589

list_for_each_entry(link, &cgrp->cset_links, cset_link)

2591

list_for_each_entry(link, &cgrp->cset_links, cset_link)

2590

count += atomic_read(&link->cset->refcount);

2592

count += atomic_read(&link->cset->refcount);

2591

up_read(&css_set_rwsem);

2593

up_read(&css_set_rwsem);

2592

return count;

2594

return count;

2593

}

2595

}

2594

2596

2595

/**

2597

/**

2596

* css_next_child - find the next child of a given css

2598

* css_next_child - find the next child of a given css

2597

* @pos_css: the current position (%NULL to initiate traversal)

2599

* @pos_css: the current position (%NULL to initiate traversal)

2598

* @parent_css: css whose children to walk

2600

* @parent_css: css whose children to walk

2599

*

2601

*

2600

* This function returns the next child of @parent_css and should be called

2602

* This function returns the next child of @parent_css and should be called

2601

* under either cgroup_mutex or RCU read lock. The only requirement is

2603

* under either cgroup_mutex or RCU read lock. The only requirement is

2602

* that @parent_css and @pos_css are accessible. The next sibling is

2604

* that @parent_css and @pos_css are accessible. The next sibling is

2603

* guaranteed to be returned regardless of their states.

2605

* guaranteed to be returned regardless of their states.

2604

*/

2606

*/

2605

struct cgroup_subsys_state *

2607

struct cgroup_subsys_state *

2606

css_next_child(struct cgroup_subsys_state *pos_css,

2608

css_next_child(struct cgroup_subsys_state *pos_css,

2607

struct cgroup_subsys_state *parent_css)

2609

struct cgroup_subsys_state *parent_css)

2608

{

2610

{

2609

struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;

2611

struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;

2610

struct cgroup *cgrp = parent_css->cgroup;

2612

struct cgroup *cgrp = parent_css->cgroup;

2611

struct cgroup *next;

2613

struct cgroup *next;

2612

2614

2613

cgroup_assert_mutexes_or_rcu_locked();

2615

cgroup_assert_mutexes_or_rcu_locked();

2614

2616

2615

/*

2617

/*

2616

* @pos could already have been removed. Once a cgroup is removed,

2618

* @pos could already have been removed. Once a cgroup is removed,

2617

* its ->sibling.next is no longer updated when its next sibling

2619

* its ->sibling.next is no longer updated when its next sibling

2618

* changes. As CGRP_DEAD assertion is serialized and happens

2620

* changes. As CGRP_DEAD assertion is serialized and happens

2619

* before the cgroup is taken off the ->sibling list, if we see it

2621

* before the cgroup is taken off the ->sibling list, if we see it

2620

* unasserted, it's guaranteed that the next sibling hasn't

2622

* unasserted, it's guaranteed that the next sibling hasn't

2621

* finished its grace period even if it's already removed, and thus

2623

* finished its grace period even if it's already removed, and thus

2622

* safe to dereference from this RCU critical section. If

2624

* safe to dereference from this RCU critical section. If

2623

* ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed

2625

* ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed

2624

* to be visible as %true here.

2626

* to be visible as %true here.

2625

*

2627

*

2626

* If @pos is dead, its next pointer can't be dereferenced;

2628

* If @pos is dead, its next pointer can't be dereferenced;

2627

* however, as each cgroup is given a monotonically increasing

2629

* however, as each cgroup is given a monotonically increasing

2628

* unique serial number and always appended to the sibling list,

2630

* unique serial number and always appended to the sibling list,

2629

* the next one can be found by walking the parent's children until

2631

* the next one can be found by walking the parent's children until

2630

* we see a cgroup with higher serial number than @pos's. While

2632

* we see a cgroup with higher serial number than @pos's. While

2631

* this path can be slower, it's taken only when either the current

2633

* this path can be slower, it's taken only when either the current

2632

* cgroup is removed or iteration and removal race.

2634

* cgroup is removed or iteration and removal race.

2633

*/

2635

*/

2634

if (!pos) {

2636

if (!pos) {

2635

next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);

2637

next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);

2636

} else if (likely(!cgroup_is_dead(pos))) {

2638

} else if (likely(!cgroup_is_dead(pos))) {

2637

next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);

2639

next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);

2638

} else {

2640

} else {

2639

list_for_each_entry_rcu(next, &cgrp->children, sibling)

2641

list_for_each_entry_rcu(next, &cgrp->children, sibling)

2640

if (next->serial_nr > pos->serial_nr)

2642

if (next->serial_nr > pos->serial_nr)

2641

break;

2643

break;

2642

}

2644

}

2643

2645

2644

if (&next->sibling == &cgrp->children)

2646

if (&next->sibling == &cgrp->children)

2645

return NULL;

2647

return NULL;

2646

2648

2647

return cgroup_css(next, parent_css->ss);

2649

return cgroup_css(next, parent_css->ss);

2648

}

2650

}

2649

2651

2650

/**

2652

/**

2651

* css_next_descendant_pre - find the next descendant for pre-order walk

2653

* css_next_descendant_pre - find the next descendant for pre-order walk

2652

* @pos: the current position (%NULL to initiate traversal)

2654

* @pos: the current position (%NULL to initiate traversal)

2653

* @root: css whose descendants to walk

2655

* @root: css whose descendants to walk

2654

*

2656

*

2655

* To be used by css_for_each_descendant_pre(). Find the next descendant

2657

* To be used by css_for_each_descendant_pre(). Find the next descendant

2656

* to visit for pre-order traversal of @root's descendants. @root is

2658

* to visit for pre-order traversal of @root's descendants. @root is

2657

* included in the iteration and the first node to be visited.

2659

* included in the iteration and the first node to be visited.

2658

*

2660

*

2659

* While this function requires cgroup_mutex or RCU read locking, it

2661

* While this function requires cgroup_mutex or RCU read locking, it

2660

* doesn't require the whole traversal to be contained in a single critical

2662

* doesn't require the whole traversal to be contained in a single critical

2661

* section. This function will return the correct next descendant as long

2663

* section. This function will return the correct next descendant as long

2662

* as both @pos and @root are accessible and @pos is a descendant of @root.

2664

* as both @pos and @root are accessible and @pos is a descendant of @root.

2663

*/

2665

*/

2664

struct cgroup_subsys_state *

2666

struct cgroup_subsys_state *

2665

css_next_descendant_pre(struct cgroup_subsys_state *pos,

2667

css_next_descendant_pre(struct cgroup_subsys_state *pos,

2666

struct cgroup_subsys_state *root)

2668

struct cgroup_subsys_state *root)

2667

{

2669

{

2668

struct cgroup_subsys_state *next;

2670

struct cgroup_subsys_state *next;

2669

2671

2670

cgroup_assert_mutexes_or_rcu_locked();

2672

cgroup_assert_mutexes_or_rcu_locked();

2671

2673

2672

/* if first iteration, visit @root */

2674

/* if first iteration, visit @root */

2673

if (!pos)

2675

if (!pos)

2674

return root;

2676

return root;

2675

2677

2676

/* visit the first child if exists */

2678

/* visit the first child if exists */

2677

next = css_next_child(NULL, pos);

2679

next = css_next_child(NULL, pos);

2678

if (next)

2680

if (next)

2679

return next;

2681

return next;

2680

2682

2681

/* no child, visit my or the closest ancestor's next sibling */

2683

/* no child, visit my or the closest ancestor's next sibling */

2682

while (pos != root) {

2684

while (pos != root) {

2683

next = css_next_child(pos, css_parent(pos));

2685

next = css_next_child(pos, css_parent(pos));

2684

if (next)

2686

if (next)

2685

return next;

2687

return next;

2686

pos = css_parent(pos);

2688

pos = css_parent(pos);

2687

}

2689

}

2688

2690

2689

return NULL;

2691

return NULL;

2690

}

2692

}

2691

2693

2692

/**

2694

/**

2693

* css_rightmost_descendant - return the rightmost descendant of a css

2695

* css_rightmost_descendant - return the rightmost descendant of a css

2694

* @pos: css of interest

2696

* @pos: css of interest

2695

*

2697

*

2696

* Return the rightmost descendant of @pos. If there's no descendant, @pos

2698

* Return the rightmost descendant of @pos. If there's no descendant, @pos

2697

* is returned. This can be used during pre-order traversal to skip

2699

* is returned. This can be used during pre-order traversal to skip

2698

* subtree of @pos.

2700

* subtree of @pos.

2699

*

2701

*

2700

* While this function requires cgroup_mutex or RCU read locking, it

2702

* While this function requires cgroup_mutex or RCU read locking, it

2701

* doesn't require the whole traversal to be contained in a single critical

2703

* doesn't require the whole traversal to be contained in a single critical

2702

* section. This function will return the correct rightmost descendant as

2704

* section. This function will return the correct rightmost descendant as

2703

* long as @pos is accessible.

2705

* long as @pos is accessible.

2704

*/

2706

*/

2705

struct cgroup_subsys_state *

2707

struct cgroup_subsys_state *

2706

css_rightmost_descendant(struct cgroup_subsys_state *pos)

2708

css_rightmost_descendant(struct cgroup_subsys_state *pos)

2707

{

2709

{

2708

struct cgroup_subsys_state *last, *tmp;

2710

struct cgroup_subsys_state *last, *tmp;

2709

2711

2710

cgroup_assert_mutexes_or_rcu_locked();

2712

cgroup_assert_mutexes_or_rcu_locked();

2711

2713

2712

do {

2714

do {

2713

last = pos;

2715

last = pos;

2714

/* ->prev isn't RCU safe, walk ->next till the end */

2716

/* ->prev isn't RCU safe, walk ->next till the end */

2715

pos = NULL;

2717

pos = NULL;

2716

css_for_each_child(tmp, last)

2718

css_for_each_child(tmp, last)

2717

pos = tmp;

2719

pos = tmp;

2718

} while (pos);

2720

} while (pos);

2719

2721

2720

return last;

2722

return last;

2721

}

2723

}

2722

2724

2723

static struct cgroup_subsys_state *

2725

static struct cgroup_subsys_state *

2724

css_leftmost_descendant(struct cgroup_subsys_state *pos)

2726

css_leftmost_descendant(struct cgroup_subsys_state *pos)

2725

{

2727

{

2726

struct cgroup_subsys_state *last;

2728

struct cgroup_subsys_state *last;

2727

2729

2728

do {

2730

do {

2729

last = pos;

2731

last = pos;

2730

pos = css_next_child(NULL, pos);

2732

pos = css_next_child(NULL, pos);

2731

} while (pos);

2733

} while (pos);

2732

2734

2733

return last;

2735

return last;

2734

}

2736

}

2735

2737

2736

/**

2738

/**

2737

* css_next_descendant_post - find the next descendant for post-order walk

2739

* css_next_descendant_post - find the next descendant for post-order walk

2738

* @pos: the current position (%NULL to initiate traversal)

2740

* @pos: the current position (%NULL to initiate traversal)

2739

* @root: css whose descendants to walk

2741

* @root: css whose descendants to walk

2740

*

2742

*

2741

* To be used by css_for_each_descendant_post(). Find the next descendant

2743

* To be used by css_for_each_descendant_post(). Find the next descendant

2742

* to visit for post-order traversal of @root's descendants. @root is

2744

* to visit for post-order traversal of @root's descendants. @root is

2743

* included in the iteration and the last node to be visited.

2745

* included in the iteration and the last node to be visited.

2744

*

2746

*

2745

* While this function requires cgroup_mutex or RCU read locking, it

2747

* While this function requires cgroup_mutex or RCU read locking, it

2746

* doesn't require the whole traversal to be contained in a single critical

2748

* doesn't require the whole traversal to be contained in a single critical

2747

* section. This function will return the correct next descendant as long

2749

* section. This function will return the correct next descendant as long

2748

* as both @pos and @cgroup are accessible and @pos is a descendant of

2750

* as both @pos and @cgroup are accessible and @pos is a descendant of

2749

* @cgroup.

2751

* @cgroup.

2750

*/

2752

*/

2751

struct cgroup_subsys_state *

2753

struct cgroup_subsys_state *

2752

css_next_descendant_post(struct cgroup_subsys_state *pos,

2754

css_next_descendant_post(struct cgroup_subsys_state *pos,

2753

struct cgroup_subsys_state *root)

2755

struct cgroup_subsys_state *root)

2754

{

2756

{

2755

struct cgroup_subsys_state *next;

2757

struct cgroup_subsys_state *next;

2756

2758

2757

cgroup_assert_mutexes_or_rcu_locked();

2759

cgroup_assert_mutexes_or_rcu_locked();

2758

2760

2759

/* if first iteration, visit leftmost descendant which may be @root */

2761

/* if first iteration, visit leftmost descendant which may be @root */

2760

if (!pos)

2762

if (!pos)

2761

return css_leftmost_descendant(root);

2763

return css_leftmost_descendant(root);

2762

2764

2763

/* if we visited @root, we're done */

2765

/* if we visited @root, we're done */

2764

if (pos == root)

2766

if (pos == root)

2765

return NULL;

2767

return NULL;

2766

2768

2767

/* if there's an unvisited sibling, visit its leftmost descendant */

2769

/* if there's an unvisited sibling, visit its leftmost descendant */

2768

next = css_next_child(pos, css_parent(pos));

2770

next = css_next_child(pos, css_parent(pos));

2769

if (next)

2771

if (next)

2770

return css_leftmost_descendant(next);

2772

return css_leftmost_descendant(next);

2771

2773

2772

/* no sibling left, visit parent */

2774

/* no sibling left, visit parent */

2773

return css_parent(pos);

2775

return css_parent(pos);

2774

}

2776

}

2775

2777

2776

/**

2778

/**

2777

* css_advance_task_iter - advance a task itererator to the next css_set

2779

* css_advance_task_iter - advance a task itererator to the next css_set

2778

* @it: the iterator to advance

2780

* @it: the iterator to advance

2779

*

2781

*

2780

* Advance @it to the next css_set to walk.

2782

* Advance @it to the next css_set to walk.

2781

*/

2783

*/

2782

static void css_advance_task_iter(struct css_task_iter *it)

2784

static void css_advance_task_iter(struct css_task_iter *it)

2783

{

2785

{

2784

struct list_head *l = it->cset_link;

2786

struct list_head *l = it->cset_link;

2785

struct cgrp_cset_link *link;

2787

struct cgrp_cset_link *link;

2786

struct css_set *cset;

2788

struct css_set *cset;

2787

2789

2788

/* Advance to the next non-empty css_set */

2790

/* Advance to the next non-empty css_set */

2789

do {

2791

do {

2790

l = l->next;

2792

l = l->next;

2791

if (l == &it->origin_css->cgroup->cset_links) {

2793

if (l == &it->origin_css->cgroup->cset_links) {

2792

it->cset_link = NULL;

2794

it->cset_link = NULL;

2793

return;

2795

return;

2794

}

2796

}

2795

link = list_entry(l, struct cgrp_cset_link, cset_link);

2797

link = list_entry(l, struct cgrp_cset_link, cset_link);

2796

cset = link->cset;

2798

cset = link->cset;

2797

} while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));

2799

} while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));

2798

2800

2799

it->cset_link = l;

2801

it->cset_link = l;

2800

2802

2801

if (!list_empty(&cset->tasks))

2803

if (!list_empty(&cset->tasks))

2802

it->task = cset->tasks.next;

2804

it->task = cset->tasks.next;

2803

else

2805

else

2804

it->task = cset->mg_tasks.next;

2806

it->task = cset->mg_tasks.next;

2805

}

2807

}

2806

2808

2807

/**

2809

/**

2808

* css_task_iter_start - initiate task iteration

2810

* css_task_iter_start - initiate task iteration

2809

* @css: the css to walk tasks of

2811

* @css: the css to walk tasks of

2810

* @it: the task iterator to use

2812

* @it: the task iterator to use

2811

*

2813

*

2812

* Initiate iteration through the tasks of @css. The caller can call

2814

* Initiate iteration through the tasks of @css. The caller can call

2813

* css_task_iter_next() to walk through the tasks until the function

2815

* css_task_iter_next() to walk through the tasks until the function

2814

* returns NULL. On completion of iteration, css_task_iter_end() must be

2816

* returns NULL. On completion of iteration, css_task_iter_end() must be

2815

* called.

2817

* called.

2816

*

2818

*

2817

* Note that this function acquires a lock which is released when the

2819

* Note that this function acquires a lock which is released when the

2818

* iteration finishes. The caller can't sleep while iteration is in

2820

* iteration finishes. The caller can't sleep while iteration is in

2819

* progress.

2821

* progress.

2820

*/

2822

*/

2821

void css_task_iter_start(struct cgroup_subsys_state *css,

2823

void css_task_iter_start(struct cgroup_subsys_state *css,

2822

struct css_task_iter *it)

2824

struct css_task_iter *it)

2823

__acquires(css_set_rwsem)

2825

__acquires(css_set_rwsem)

2824

{

2826

{

2825

/* no one should try to iterate before mounting cgroups */

2827

/* no one should try to iterate before mounting cgroups */

2826

WARN_ON_ONCE(!use_task_css_set_links);

2828

WARN_ON_ONCE(!use_task_css_set_links);

2827

2829

2828

down_read(&css_set_rwsem);

2830

down_read(&css_set_rwsem);

2829

2831

2830

it->origin_css = css;

2832

it->origin_css = css;

2831

it->cset_link = &css->cgroup->cset_links;

2833

it->cset_link = &css->cgroup->cset_links;

2832

2834

2833

css_advance_task_iter(it);

2835

css_advance_task_iter(it);

2834

}

2836

}

2835

2837

2836

/**

2838

/**

2837

* css_task_iter_next - return the next task for the iterator

2839

* css_task_iter_next - return the next task for the iterator

2838

* @it: the task iterator being iterated

2840

* @it: the task iterator being iterated

2839

*

2841

*

2840

* The "next" function for task iteration. @it should have been

2842

* The "next" function for task iteration. @it should have been

2841

* initialized via css_task_iter_start(). Returns NULL when the iteration

2843

* initialized via css_task_iter_start(). Returns NULL when the iteration

2842

* reaches the end.

2844

* reaches the end.

2843

*/

2845

*/

2844

struct task_struct *css_task_iter_next(struct css_task_iter *it)

2846

struct task_struct *css_task_iter_next(struct css_task_iter *it)

2845

{

2847

{

2846

struct task_struct *res;

2848

struct task_struct *res;

2847

struct list_head *l = it->task;

2849

struct list_head *l = it->task;

2848

struct cgrp_cset_link *link = list_entry(it->cset_link,

2850

struct cgrp_cset_link *link = list_entry(it->cset_link,

2849

struct cgrp_cset_link, cset_link);

2851

struct cgrp_cset_link, cset_link);

2850

2852

2851

/* If the iterator cg is NULL, we have no tasks */

2853

/* If the iterator cg is NULL, we have no tasks */

2852

if (!it->cset_link)

2854

if (!it->cset_link)

2853

return NULL;

2855

return NULL;

2854

res = list_entry(l, struct task_struct, cg_list);

2856

res = list_entry(l, struct task_struct, cg_list);

2855

2857

2856

/*

2858

/*

2857

* Advance iterator to find next entry. cset->tasks is consumed

2859

* Advance iterator to find next entry. cset->tasks is consumed

2858

* first and then ->mg_tasks. After ->mg_tasks, we move onto the

2860

* first and then ->mg_tasks. After ->mg_tasks, we move onto the

2859

* next cset.

2861

* next cset.

2860

*/

2862

*/

2861

l = l->next;

2863

l = l->next;

2862

2864

2863

if (l == &link->cset->tasks)

2865

if (l == &link->cset->tasks)

2864

l = link->cset->mg_tasks.next;

2866

l = link->cset->mg_tasks.next;

2865

2867

2866

if (l == &link->cset->mg_tasks)

2868

if (l == &link->cset->mg_tasks)

2867

css_advance_task_iter(it);

2869

css_advance_task_iter(it);

2868

else

2870

else

2869

it->task = l;

2871

it->task = l;

2870

2872

2871

return res;

2873

return res;

2872

}

2874

}

2873

2875

2874

/**

2876

/**

2875

* css_task_iter_end - finish task iteration

2877

* css_task_iter_end - finish task iteration

2876

* @it: the task iterator to finish

2878

* @it: the task iterator to finish

2877

*

2879

*

2878

* Finish task iteration started by css_task_iter_start().

2880

* Finish task iteration started by css_task_iter_start().

2879

*/

2881

*/

2880

void css_task_iter_end(struct css_task_iter *it)

2882

void css_task_iter_end(struct css_task_iter *it)

2881

__releases(css_set_rwsem)

2883

__releases(css_set_rwsem)

2882

{

2884

{

2883

up_read(&css_set_rwsem);

2885

up_read(&css_set_rwsem);

2884

}

2886

}

2885

2887

2886

/**

2888

/**

2887

* cgroup_trasnsfer_tasks - move tasks from one cgroup to another

2889

* cgroup_trasnsfer_tasks - move tasks from one cgroup to another

2888

* @to: cgroup to which the tasks will be moved

2890

* @to: cgroup to which the tasks will be moved

2889

* @from: cgroup in which the tasks currently reside

2891

* @from: cgroup in which the tasks currently reside

2890

*

2892

*

2891

* Locking rules between cgroup_post_fork() and the migration path

2893

* Locking rules between cgroup_post_fork() and the migration path

2892

* guarantee that, if a task is forking while being migrated, the new child

2894

* guarantee that, if a task is forking while being migrated, the new child

2893

* is guaranteed to be either visible in the source cgroup after the

2895

* is guaranteed to be either visible in the source cgroup after the

2894

* parent's migration is complete or put into the target cgroup. No task

2896

* parent's migration is complete or put into the target cgroup. No task

2895

* can slip out of migration through forking.

2897

* can slip out of migration through forking.

2896

*/

2898

*/

2897

int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)

2899

int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)

2898

{

2900

{

2899

LIST_HEAD(preloaded_csets);

2901

LIST_HEAD(preloaded_csets);

2900

struct cgrp_cset_link *link;

2902

struct cgrp_cset_link *link;

2901

struct css_task_iter it;

2903

struct css_task_iter it;

2902

struct task_struct *task;

2904

struct task_struct *task;

2903

int ret;

2905

int ret;

2904

2906

2905

mutex_lock(&cgroup_mutex);

2907

mutex_lock(&cgroup_mutex);

2906

2908

2907

/* all tasks in @from are being moved, all csets are source */

2909

/* all tasks in @from are being moved, all csets are source */

2908

down_read(&css_set_rwsem);

2910

down_read(&css_set_rwsem);

2909

list_for_each_entry(link, &from->cset_links, cset_link)

2911

list_for_each_entry(link, &from->cset_links, cset_link)

2910

cgroup_migrate_add_src(link->cset, to, &preloaded_csets);

2912

cgroup_migrate_add_src(link->cset, to, &preloaded_csets);

2911

up_read(&css_set_rwsem);

2913

up_read(&css_set_rwsem);

2912

2914

2913

ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);

2915

ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);

2914

if (ret)

2916

if (ret)

2915

goto out_err;

2917

goto out_err;

2916

2918

2917

/*

2919

/*

2918

* Migrate tasks one-by-one until @form is empty. This fails iff

2920

* Migrate tasks one-by-one until @form is empty. This fails iff

2919

* ->can_attach() fails.

2921

* ->can_attach() fails.

2920

*/

2922

*/

2921

do {

2923

do {

2922

css_task_iter_start(&from->dummy_css, &it);

2924

css_task_iter_start(&from->dummy_css, &it);

2923

task = css_task_iter_next(&it);

2925

task = css_task_iter_next(&it);

2924

if (task)

2926

if (task)

2925

get_task_struct(task);

2927

get_task_struct(task);

2926

css_task_iter_end(&it);

2928

css_task_iter_end(&it);

2927

2929

2928

if (task) {

2930

if (task) {

2929

ret = cgroup_migrate(to, task, false);

2931

ret = cgroup_migrate(to, task, false);

2930

put_task_struct(task);

2932

put_task_struct(task);

2931

}

2933

}

2932

} while (task && !ret);

2934

} while (task && !ret);

2933

out_err:

2935

out_err:

2934

cgroup_migrate_finish(&preloaded_csets);

2936

cgroup_migrate_finish(&preloaded_csets);

2935

mutex_unlock(&cgroup_mutex);

2937

mutex_unlock(&cgroup_mutex);

2936

return ret;

2938

return ret;

2937

}

2939

}

2938

2940

2939

/*

2941

/*

2940

* Stuff for reading the 'tasks'/'procs' files.

2942

* Stuff for reading the 'tasks'/'procs' files.

2941

*

2943

*

2942

* Reading this file can return large amounts of data if a cgroup has

2944

* Reading this file can return large amounts of data if a cgroup has

2943

* *lots* of attached tasks. So it may need several calls to read(),

2945

* *lots* of attached tasks. So it may need several calls to read(),

2944

* but we cannot guarantee that the information we produce is correct

2946

* but we cannot guarantee that the information we produce is correct

2945

* unless we produce it entirely atomically.

2947

* unless we produce it entirely atomically.

2946

*

2948

*

2947

*/

2949

*/

2948

2950

2949

/* selects which of the two pid-listing files a pidlist belongs to */
enum cgroup_filetype {
	CGROUP_FILE_PROCS,	/* entries are thread-group ids */
	CGROUP_FILE_TASKS,	/* entries are per-task pids */
};

2954

2956

2955

/*

2957

/*

2956

* A pidlist is a list of pids that virtually represents the contents of one

2958

* A pidlist is a list of pids that virtually represents the contents of one

2957

* of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,

2959

* of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,

2958

* a pair (one each for procs, tasks) for each pid namespace that's relevant

2960

* a pair (one each for procs, tasks) for each pid namespace that's relevant

2959

* to the cgroup.

2961

* to the cgroup.

2960

*/

2962

*/

2961

struct cgroup_pidlist {

2963

struct cgroup_pidlist {

2962

/*

2964

/*

2963

* used to find which pidlist is wanted. doesn't change as long as

2965

* used to find which pidlist is wanted. doesn't change as long as

2964

* this particular list stays in the list.

2966

* this particular list stays in the list.

2965

*/

2967

*/

2966

struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;

2968

struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;

2967

/* array of xids */

2969

/* array of xids */

2968

pid_t *list;

2970

pid_t *list;

2969

/* how many elements the above list has */

2971

/* how many elements the above list has */

2970

int length;

2972

int length;

2971

/* each of these stored in a list by its cgroup */

2973

/* each of these stored in a list by its cgroup */

2972

struct list_head links;

2974

struct list_head links;

2973

/* pointer to the cgroup we belong to, for list removal purposes */

2975

/* pointer to the cgroup we belong to, for list removal purposes */

2974

struct cgroup *owner;

2976

struct cgroup *owner;

2975

/* for delayed destruction */

2977

/* for delayed destruction */

2976

struct delayed_work destroy_dwork;

2978

struct delayed_work destroy_dwork;

2977

};

2979

};

2978

2980

2979

/*

2981

/*

2980

* The following two functions "fix" the issue where there are more pids

2982

* The following two functions "fix" the issue where there are more pids

2981

* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.

2983

* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.

2982

* TODO: replace with a kernel-wide solution to this problem

2984

* TODO: replace with a kernel-wide solution to this problem

2983

*/

2985

*/

2984

#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))

2986

#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))

2985

static void *pidlist_allocate(int count)

2987

static void *pidlist_allocate(int count)

2986

{

2988

{

2987

if (PIDLIST_TOO_LARGE(count))

2989

if (PIDLIST_TOO_LARGE(count))

2988

return vmalloc(count * sizeof(pid_t));

2990

return vmalloc(count * sizeof(pid_t));

2989

else

2991

else

2990

return kmalloc(count * sizeof(pid_t), GFP_KERNEL);

2992

return kmalloc(count * sizeof(pid_t), GFP_KERNEL);

2991

}

2993

}

2992

2994

2993

/* release an array obtained from pidlist_allocate(), whichever allocator it used */
static void pidlist_free(void *p)
{
	if (!is_vmalloc_addr(p)) {
		kfree(p);
		return;
	}

	vfree(p);
}

3000

3002

3001

/*

3003

/*

3002

* Used to destroy all pidlists lingering waiting for destroy timer. None

3004

* Used to destroy all pidlists lingering waiting for destroy timer. None

3003

* should be left afterwards.

3005

* should be left afterwards.

3004

*/

3006

*/

3005

static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)

3007

static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)

3006

{

3008

{

3007

struct cgroup_pidlist *l, *tmp_l;

3009

struct cgroup_pidlist *l, *tmp_l;

3008

3010

3009

mutex_lock(&cgrp->pidlist_mutex);

3011

mutex_lock(&cgrp->pidlist_mutex);

3010

list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)

3012

list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)

3011

mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);

3013

mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);

3012

mutex_unlock(&cgrp->pidlist_mutex);

3014

mutex_unlock(&cgrp->pidlist_mutex);

3013

3015

3014

flush_workqueue(cgroup_pidlist_destroy_wq);

3016

flush_workqueue(cgroup_pidlist_destroy_wq);

3015

BUG_ON(!list_empty(&cgrp->pidlists));

3017

BUG_ON(!list_empty(&cgrp->pidlists));

3016

}

3018

}

3017

3019

3018

static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)

3020

static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)

3019

{

3021

{

3020

struct delayed_work *dwork = to_delayed_work(work);

3022

struct delayed_work *dwork = to_delayed_work(work);

3021

struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,

3023

struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,

3022

destroy_dwork);

3024

destroy_dwork);

3023

struct cgroup_pidlist *tofree = NULL;

3025

struct cgroup_pidlist *tofree = NULL;

3024

3026

3025

mutex_lock(&l->owner->pidlist_mutex);

3027

mutex_lock(&l->owner->pidlist_mutex);

3026

3028

3027

/*

3029

/*

3028

* Destroy iff we didn't get queued again. The state won't change

3030

* Destroy iff we didn't get queued again. The state won't change

3029

* as destroy_dwork can only be queued while locked.

3031

* as destroy_dwork can only be queued while locked.

3030

*/

3032

*/

3031

if (!delayed_work_pending(dwork)) {

3033

if (!delayed_work_pending(dwork)) {

3032

list_del(&l->links);

3034

list_del(&l->links);

3033

pidlist_free(l->list);

3035

pidlist_free(l->list);

3034

put_pid_ns(l->key.ns);

3036

put_pid_ns(l->key.ns);

3035

tofree = l;

3037

tofree = l;

3036

}

3038

}

3037

3039

3038

mutex_unlock(&l->owner->pidlist_mutex);

3040

mutex_unlock(&l->owner->pidlist_mutex);

3039

kfree(tofree);

3041

kfree(tofree);

3040

}

3042

}

3041

3043

3042

/*
 * pidlist_uniq - collapse runs of equal neighbouring entries in a pid array
 * @list: array to compact in place
 * @length: number of entries in @list
 *
 * Only adjacent duplicates are dropped, so a sorted array ends up with
 * unique entries.  Returns the number of entries kept.
 */
static int pidlist_uniq(pid_t *list, int length)
{
	int rd, wr = 1;

	/* nothing to compact in an empty or single-entry array */
	if (length == 0 || length == 1)
		return length;

	/*
	 * Entry 0 is always kept.  @rd scans the remaining entries; @wr is
	 * the slot the next kept entry goes to and doubles as the count.
	 */
	for (rd = 1; rd < length; rd++) {
		if (list[rd] == list[rd - 1])
			continue;
		list[wr++] = list[rd];
	}

	return wr;
}

3071

3073

3072

/*
 * The two pid files - task and cgroup.procs - guaranteed that the result
 * is sorted, which forced this whole pidlist fiasco.  As pid order is
 * different per namespace, each namespace needs a differently sorted list,
 * making it impossible to use, for example, a single rbtree of member
 * tasks sorted by task pointer.  As pidlists can be fairly large,
 * allocating one per open file is dangerous, so cgroup had to implement a
 * shared pool of pidlists keyed by cgroup and namespace.
 *
 * All this extra complexity was caused by the original implementation
 * committing to an entirely unnecessary property.  In the long term, we
 * want to do away with it.  Explicitly scramble the sort order if
 * sane_behavior so that no such expectation exists in the new interface.
 *
 * Scrambling swaps the two bits of every consecutive bit pair, which is a
 * non-identity one-to-one mapping that disturbs sort order sufficiently.
 */
static pid_t pid_fry(pid_t pid)
{
	unsigned even_bits = pid & 0x55555555;	/* bits 0, 2, 4, ... */
	unsigned odd_bits = pid & 0xAAAAAAAA;	/* bits 1, 3, 5, ... */

	return (odd_bits >> 1) | (even_bits << 1);
}

3096

3098

3097

/* scramble @pid with pid_fry() when @cgrp is in sane_behavior mode, else pass it through */
static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
{
	return cgroup_sane_behavior(cgrp) ? pid_fry(pid) : pid;
}

3104

3106

3105

/* sort() comparator: orders two pid_t entries by their numeric value */
static int cmppid(const void *a, const void *b)
{
	const pid_t *lhs = a;
	const pid_t *rhs = b;

	return *lhs - *rhs;
}

3109

3111

3110

/* sort() comparator: orders two pid_t entries by their pid_fry()-scrambled value */
static int fried_cmppid(const void *a, const void *b)
{
	const pid_t *lhs = a;
	const pid_t *rhs = b;

	return pid_fry(*lhs) - pid_fry(*rhs);
}

3114

3116

3115

static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,

3117

static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,

3116

enum cgroup_filetype type)

3118

enum cgroup_filetype type)

3117

{

3119

{

3118

struct cgroup_pidlist *l;

3120

struct cgroup_pidlist *l;

3119

/* don't need task_nsproxy() if we're looking at ourself */

3121

/* don't need task_nsproxy() if we're looking at ourself */

3120

struct pid_namespace *ns = task_active_pid_ns(current);

3122

struct pid_namespace *ns = task_active_pid_ns(current);

3121

3123

3122

lockdep_assert_held(&cgrp->pidlist_mutex);

3124

lockdep_assert_held(&cgrp->pidlist_mutex);

3123

3125

3124

list_for_each_entry(l, &cgrp->pidlists, links)

3126

list_for_each_entry(l, &cgrp->pidlists, links)

3125

if (l->key.type == type && l->key.ns == ns)

3127

if (l->key.type == type && l->key.ns == ns)

3126

return l;

3128

return l;

3127

return NULL;

3129

return NULL;

3128

}

3130

}

3129

3131

3130

/*

3132

/*

3131

* find the appropriate pidlist for our purpose (given procs vs tasks)

3133

* find the appropriate pidlist for our purpose (given procs vs tasks)

3132

* returns with the lock on that pidlist already held, and takes care

3134

* returns with the lock on that pidlist already held, and takes care

3133

* of the use count, or returns NULL with no locks held if we're out of

3135

* of the use count, or returns NULL with no locks held if we're out of

3134

* memory.

3136

* memory.

3135

*/

3137

*/

3136

static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,

3138

static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,

3137

enum cgroup_filetype type)

3139

enum cgroup_filetype type)

3138

{

3140

{

3139

struct cgroup_pidlist *l;

3141

struct cgroup_pidlist *l;

3140

3142

3141

lockdep_assert_held(&cgrp->pidlist_mutex);

3143

lockdep_assert_held(&cgrp->pidlist_mutex);

3142

3144

3143

l = cgroup_pidlist_find(cgrp, type);

3145

l = cgroup_pidlist_find(cgrp, type);

3144

if (l)

3146

if (l)

3145

return l;

3147

return l;

3146

3148

3147

/* entry not found; create a new one */

3149

/* entry not found; create a new one */

3148

l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);

3150

l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);

3149

if (!l)

3151

if (!l)

3150

return l;

3152

return l;

3151

3153

3152

INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);

3154

INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);

3153

l->key.type = type;

3155

l->key.type = type;

3154

/* don't need task_nsproxy() if we're looking at ourself */

3156

/* don't need task_nsproxy() if we're looking at ourself */

3155

l->key.ns = get_pid_ns(task_active_pid_ns(current));

3157

l->key.ns = get_pid_ns(task_active_pid_ns(current));

3156

l->owner = cgrp;

3158

l->owner = cgrp;

3157

list_add(&l->links, &cgrp->pidlists);

3159

list_add(&l->links, &cgrp->pidlists);

3158

return l;

3160

return l;

3159

}

3161

}

3160

3162

3161

/*

3163

/*

3162

* Load a cgroup's pidarray with either procs' tgids or tasks' pids

3164

* Load a cgroup's pidarray with either procs' tgids or tasks' pids

3163

*/

3165

*/

3164

static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,

3166

static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,

3165

struct cgroup_pidlist **lp)

3167

struct cgroup_pidlist **lp)

3166

{

3168

{

3167

pid_t *array;

3169

pid_t *array;

3168

int length;

3170

int length;

3169

int pid, n = 0; /* used for populating the array */

3171

int pid, n = 0; /* used for populating the array */

3170

struct css_task_iter it;

3172

struct css_task_iter it;

3171

struct task_struct *tsk;

3173

struct task_struct *tsk;

3172

struct cgroup_pidlist *l;

3174

struct cgroup_pidlist *l;

3173

3175

3174

lockdep_assert_held(&cgrp->pidlist_mutex);

3176

lockdep_assert_held(&cgrp->pidlist_mutex);

3175

3177

3176

/*

3178

/*

3177

* If cgroup gets more users after we read count, we won't have

3179

* If cgroup gets more users after we read count, we won't have

3178

* enough space - tough. This race is indistinguishable to the

3180

* enough space - tough. This race is indistinguishable to the

3179

* caller from the case that the additional cgroup users didn't

3181

* caller from the case that the additional cgroup users didn't

3180

* show up until sometime later on.

3182

* show up until sometime later on.

3181

*/

3183

*/

3182

length = cgroup_task_count(cgrp);

3184

length = cgroup_task_count(cgrp);

3183

array = pidlist_allocate(length);

3185

array = pidlist_allocate(length);

3184

if (!array)

3186

if (!array)

3185

return -ENOMEM;

3187

return -ENOMEM;

3186

/* now, populate the array */

3188

/* now, populate the array */

3187

css_task_iter_start(&cgrp->dummy_css, &it);

3189

css_task_iter_start(&cgrp->dummy_css, &it);

3188

while ((tsk = css_task_iter_next(&it))) {

3190

while ((tsk = css_task_iter_next(&it))) {

3189

if (unlikely(n == length))

3191

if (unlikely(n == length))

3190

break;

3192

break;

3191

/* get tgid or pid for procs or tasks file respectively */

3193

/* get tgid or pid for procs or tasks file respectively */

3192

if (type == CGROUP_FILE_PROCS)

3194

if (type == CGROUP_FILE_PROCS)

3193

pid = task_tgid_vnr(tsk);

3195

pid = task_tgid_vnr(tsk);

3194

else

3196

else

3195

pid = task_pid_vnr(tsk);

3197

pid = task_pid_vnr(tsk);

3196

if (pid > 0) /* make sure to only use valid results */

3198

if (pid > 0) /* make sure to only use valid results */

3197

array[n++] = pid;

3199

array[n++] = pid;

3198

}

3200

}

3199

css_task_iter_end(&it);

3201

css_task_iter_end(&it);

3200

length = n;

3202

length = n;

3201

/* now sort & (if procs) strip out duplicates */

3203

/* now sort & (if procs) strip out duplicates */

3202

if (cgroup_sane_behavior(cgrp))

3204

if (cgroup_sane_behavior(cgrp))

3203

sort(array, length, sizeof(pid_t), fried_cmppid, NULL);

3205

sort(array, length, sizeof(pid_t), fried_cmppid, NULL);

3204

else

3206

else

3205

sort(array, length, sizeof(pid_t), cmppid, NULL);

3207

sort(array, length, sizeof(pid_t), cmppid, NULL);

3206

if (type == CGROUP_FILE_PROCS)

3208

if (type == CGROUP_FILE_PROCS)

3207

length = pidlist_uniq(array, length);

3209

length = pidlist_uniq(array, length);

3208

3210

3209

l = cgroup_pidlist_find_create(cgrp, type);

3211

l = cgroup_pidlist_find_create(cgrp, type);

3210

if (!l) {

3212

if (!l) {

3211

mutex_unlock(&cgrp->pidlist_mutex);

3213

mutex_unlock(&cgrp->pidlist_mutex);

3212

pidlist_free(array);

3214

pidlist_free(array);

3213

return -ENOMEM;

3215

return -ENOMEM;

3214

}

3216

}

3215

3217

3216

/* store array, freeing old if necessary */

3218

/* store array, freeing old if necessary */

3217

pidlist_free(l->list);

3219

pidlist_free(l->list);

3218

l->list = array;

3220

l->list = array;

3219

l->length = length;

3221

l->length = length;

3220

*lp = l;

3222

*lp = l;

3221

return 0;

3223

return 0;

3222

}

3224

}

3223

3225

3224

/**

3226

/**

3225

* cgroupstats_build - build and fill cgroupstats

3227

* cgroupstats_build - build and fill cgroupstats

3226

* @stats: cgroupstats to fill information into

3228

* @stats: cgroupstats to fill information into

3227

* @dentry: A dentry entry belonging to the cgroup for which stats have

3229

* @dentry: A dentry entry belonging to the cgroup for which stats have

3228

* been requested.

3230

* been requested.

3229

*

3231

*

3230

* Build and fill cgroupstats so that taskstats can export it to user

3232

* Build and fill cgroupstats so that taskstats can export it to user

3231

* space.

3233

* space.

3232

*/

3234

*/

3233

int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)

3235

int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)

3234

{

3236

{

3235

struct kernfs_node *kn = kernfs_node_from_dentry(dentry);

3237

struct kernfs_node *kn = kernfs_node_from_dentry(dentry);

3236

struct cgroup *cgrp;

3238

struct cgroup *cgrp;

3237

struct css_task_iter it;

3239

struct css_task_iter it;

3238

struct task_struct *tsk;

3240

struct task_struct *tsk;

3239

3241

3240

/* it should be kernfs_node belonging to cgroupfs and is a directory */

3242

/* it should be kernfs_node belonging to cgroupfs and is a directory */

3241

if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||

3243

if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||

3242

kernfs_type(kn) != KERNFS_DIR)

3244

kernfs_type(kn) != KERNFS_DIR)

3243

return -EINVAL;

3245

return -EINVAL;

3244

3246

3245

mutex_lock(&cgroup_mutex);

3247

mutex_lock(&cgroup_mutex);

3246

3248

3247

/*

3249

/*

3248

* We aren't being called from kernfs and there's no guarantee on

3250

* We aren't being called from kernfs and there's no guarantee on

3249

* @kn->priv's validity. For this and css_tryget_from_dir(),

3251

* @kn->priv's validity. For this and css_tryget_from_dir(),

3250

* @kn->priv is RCU safe. Let's do the RCU dancing.

3252

* @kn->priv is RCU safe. Let's do the RCU dancing.

3251

*/

3253

*/

3252

rcu_read_lock();

3254

rcu_read_lock();

3253

cgrp = rcu_dereference(kn->priv);

3255

cgrp = rcu_dereference(kn->priv);

3254

if (!cgrp || cgroup_is_dead(cgrp)) {

3256

if (!cgrp || cgroup_is_dead(cgrp)) {

3255

rcu_read_unlock();

3257

rcu_read_unlock();

3256

mutex_unlock(&cgroup_mutex);

3258

mutex_unlock(&cgroup_mutex);

3257

return -ENOENT;

3259

return -ENOENT;

3258

}

3260

}

3259

rcu_read_unlock();

3261

rcu_read_unlock();

3260

3262

3261

css_task_iter_start(&cgrp->dummy_css, &it);

3263

css_task_iter_start(&cgrp->dummy_css, &it);

3262

while ((tsk = css_task_iter_next(&it))) {

3264

while ((tsk = css_task_iter_next(&it))) {

3263

switch (tsk->state) {

3265

switch (tsk->state) {

3264

case TASK_RUNNING:

3266

case TASK_RUNNING:

3265

stats->nr_running++;

3267

stats->nr_running++;

3266

break;

3268

break;

3267

case TASK_INTERRUPTIBLE:

3269

case TASK_INTERRUPTIBLE:

3268

stats->nr_sleeping++;

3270

stats->nr_sleeping++;

3269

break;

3271

break;

3270

case TASK_UNINTERRUPTIBLE:

3272

case TASK_UNINTERRUPTIBLE:

3271

stats->nr_uninterruptible++;

3273

stats->nr_uninterruptible++;

3272

break;

3274

break;

3273

case TASK_STOPPED:

3275

case TASK_STOPPED:

3274

stats->nr_stopped++;

3276

stats->nr_stopped++;

3275

break;

3277

break;

3276

default:

3278

default:

3277

if (delayacct_is_task_waiting_on_io(tsk))

3279

if (delayacct_is_task_waiting_on_io(tsk))

3278

stats->nr_io_wait++;

3280

stats->nr_io_wait++;

3279

break;

3281

break;

3280

}

3282

}

3281

}

3283

}

3282

css_task_iter_end(&it);

3284

css_task_iter_end(&it);

3283

3285

3284

mutex_unlock(&cgroup_mutex);

3286

mutex_unlock(&cgroup_mutex);

3285

return 0;

3287

return 0;

3286

}

3288

}

3287

3289

3288

3290

3289

/*

3291

/*

3290

* seq_file methods for the tasks/procs files. The seq_file position is the

3292

* seq_file methods for the tasks/procs files. The seq_file position is the

3291

* next pid to display; the seq_file iterator is a pointer to the pid

3293

* next pid to display; the seq_file iterator is a pointer to the pid

3292

* in the cgroup->l->list array.

3294

* in the cgroup->l->list array.

3293

*/

3295

*/

3294

3296

3295

/*
 * seq_file ->start() for the tasks/procs files.  Takes
 * @cgrp->pidlist_mutex and returns with it held on every path, including
 * the NULL and ERR_PTR() returns; cgroup_pidlist_stop() is what drops it.
 *
 * Returns a pointer into the pidlist array at the entry to show next, NULL
 * when there is nothing left, or an ERR_PTR() if loading the pidlist
 * failed.
 */
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
{
	/*
	 * Initially we receive a position value that corresponds to
	 * one more than the last pid shown (or 0 on the first call or
	 * after a seek to the start).  Use a binary-search to find the
	 * next pid to display, if any
	 */
	struct kernfs_open_file *of = s->private;
	struct cgroup *cgrp = seq_css(s)->cgroup;
	struct cgroup_pidlist *l;
	enum cgroup_filetype type = seq_cft(s)->private;
	int index = 0, pid = *pos;
	int *iter, ret;

	mutex_lock(&cgrp->pidlist_mutex);

	/*
	 * !NULL @of->priv indicates that this isn't the first start()
	 * after open.  If the matching pidlist is around, we can use that.
	 * Look for it.  Note that @of->priv can't be used directly.  It
	 * could already have been destroyed.
	 */
	if (of->priv)
		of->priv = cgroup_pidlist_find(cgrp, type);

	/*
	 * Either this is the first start() after open or the matching
	 * pidlist has been destroyed inbetween.  Create a new one.
	 */
	if (!of->priv) {
		ret = pidlist_array_load(cgrp, type,
					 (struct cgroup_pidlist **)&of->priv);
		if (ret)
			return ERR_PTR(ret);
	}
	l = of->priv;

	if (pid) {
		int lo = 0, hi = l->length;

		/* search the (possibly scrambled) pid values for @pid */
		while (lo < hi) {
			int mid = (lo + hi) / 2;
			pid_t cur = cgroup_pid_fry(cgrp, l->list[mid]);

			if (cur == pid) {
				lo = mid;
				break;
			}
			if (cur < pid)
				lo = mid + 1;
			else
				hi = mid;
		}
		index = lo;
	}
	/* If we're off the end of the array, we're done */
	if (index >= l->length)
		return NULL;
	/* Update the abstract position to be the actual pid that we found */
	iter = l->list + index;
	*pos = cgroup_pid_fry(cgrp, *iter);
	return iter;
}

3355

3357

3356

static void cgroup_pidlist_stop(struct seq_file *s, void *v)

3358

static void cgroup_pidlist_stop(struct seq_file *s, void *v)

3357

{

3359

{

3358

struct kernfs_open_file *of = s->private;

3360

struct kernfs_open_file *of = s->private;

3359

struct cgroup_pidlist *l = of->priv;

3361

struct cgroup_pidlist *l = of->priv;

3360

3362

3361

if (l)

3363

if (l)

3362

mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,

3364

mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,

3363

CGROUP_PIDLIST_DESTROY_DELAY);

3365

CGROUP_PIDLIST_DESTROY_DELAY);

3364

mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);

3366

mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);

3365

}

3367

}

3366

3368

3367

/*
 * seq_file ->next(): step to the entry after @v in the pidlist array.
 * Returns NULL once past the last entry; otherwise updates *@pos to the
 * (possibly scrambled) pid of the new entry and returns a pointer to it.
 */
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
	struct kernfs_open_file *of = s->private;
	struct cgroup_pidlist *l = of->priv;
	pid_t *next = (pid_t *)v + 1;
	pid_t *end = l->list + l->length;

	/* walked off the end of the array: iteration is over */
	if (next >= end)
		return NULL;

	*pos = cgroup_pid_fry(seq_css(s)->cgroup, *next);
	return next;
}

3385

3387

3386

/* seq_file ->show(): print the pid that @v points at, one per line */
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
	int *pid = v;

	return seq_printf(s, "%d\n", *pid);
}

3390

3392

3391

/*

3393

/*

3392

* seq_operations functions for iterating on pidlists through seq_file -

3394

* seq_operations functions for iterating on pidlists through seq_file -

3393

* independent of whether it's tasks or procs

3395

* independent of whether it's tasks or procs

3394

*/

3396

*/

3395

static const struct seq_operations cgroup_pidlist_seq_operations = {

3397

static const struct seq_operations cgroup_pidlist_seq_operations = {

3396

.start = cgroup_pidlist_start,

3398

.start = cgroup_pidlist_start,

3397

.stop = cgroup_pidlist_stop,

3399

.stop = cgroup_pidlist_stop,

3398

.next = cgroup_pidlist_next,

3400

.next = cgroup_pidlist_next,

3399

.show = cgroup_pidlist_show,

3401

.show = cgroup_pidlist_show,

3400

};

3402

};

3401

3403

3402

static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,

3404

static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,

3403

struct cftype *cft)

3405

struct cftype *cft)

3404

{

3406

{

3405

return notify_on_release(css->cgroup);

3407

return notify_on_release(css->cgroup);

3406

}

3408

}

3407

3409

3408

/*
 * write_u64 handler: CGRP_RELEASABLE is always cleared, then
 * CGRP_NOTIFY_ON_RELEASE is set for a non-zero @val and cleared for zero.
 * Always returns 0.
 */
static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
					  struct cftype *cft, u64 val)
{
	struct cgroup *cgrp = css->cgroup;

	clear_bit(CGRP_RELEASABLE, &cgrp->flags);

	if (!val)
		clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
	else
		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

	return 0;
}

3418

3420

3419

static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,

3421

static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,

3420

struct cftype *cft)

3422

struct cftype *cft)

3421

{

3423

{

3422

return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);

3424

return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);

3423

}

3425

}

3424

3426

3425

/*
 * write_u64 handler: set CGRP_CPUSET_CLONE_CHILDREN for a non-zero @val,
 * clear it for zero.  Always returns 0.
 */
static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
				       struct cftype *cft, u64 val)
{
	struct cgroup *cgrp = css->cgroup;

	if (!val)
		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
	else
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);

	return 0;
}

3434

3436

3435

static struct cftype cgroup_base_files[] = {

3437

static struct cftype cgroup_base_files[] = {

3436

{

3438

{

3437

.name = "cgroup.procs",

3439

.name = "cgroup.procs",

3438

.seq_start = cgroup_pidlist_start,

3440

.seq_start = cgroup_pidlist_start,

3439

.seq_next = cgroup_pidlist_next,

3441

.seq_next = cgroup_pidlist_next,

3440

.seq_stop = cgroup_pidlist_stop,

3442

.seq_stop = cgroup_pidlist_stop,

3441

.seq_show = cgroup_pidlist_show,

3443

.seq_show = cgroup_pidlist_show,

3442

.private = CGROUP_FILE_PROCS,

3444

.private = CGROUP_FILE_PROCS,

3443

.write_u64 = cgroup_procs_write,

3445

.write_u64 = cgroup_procs_write,

3444

.mode = S_IRUGO | S_IWUSR,

3446

.mode = S_IRUGO | S_IWUSR,

3445

},

3447

},

3446

{

3448

{

3447

.name = "cgroup.clone_children",

3449

.name = "cgroup.clone_children",

3448

.flags = CFTYPE_INSANE,

3450

.flags = CFTYPE_INSANE,

3449

.read_u64 = cgroup_clone_children_read,

3451

.read_u64 = cgroup_clone_children_read,

3450

.write_u64 = cgroup_clone_children_write,

3452

.write_u64 = cgroup_clone_children_write,

3451

},

3453

},

3452

{

3454

{

3453

.name = "cgroup.sane_behavior",

3455

.name = "cgroup.sane_behavior",

3454

.flags = CFTYPE_ONLY_ON_ROOT,

3456

.flags = CFTYPE_ONLY_ON_ROOT,

3455

.seq_show = cgroup_sane_behavior_show,

3457

.seq_show = cgroup_sane_behavior_show,

3456

},

3458

},

3457

3459

3458

/*

3460

/*

3459

* Historical crazy stuff. These don't have "cgroup." prefix and

3461

* Historical crazy stuff. These don't have "cgroup." prefix and

3460

* don't exist if sane_behavior. If you're depending on these, be

3462

* don't exist if sane_behavior. If you're depending on these, be

3461

* prepared to be burned.

3463

* prepared to be burned.

3462

*/

3464

*/

3463

{

3465

{

3464

.name = "tasks",

3466

.name = "tasks",

3465

.flags = CFTYPE_INSANE, /* use "procs" instead */

3467

.flags = CFTYPE_INSANE, /* use "procs" instead */

3466

.seq_start = cgroup_pidlist_start,

3468

.seq_start = cgroup_pidlist_start,

3467

.seq_next = cgroup_pidlist_next,

3469

.seq_next = cgroup_pidlist_next,

3468

.seq_stop = cgroup_pidlist_stop,

3470

.seq_stop = cgroup_pidlist_stop,

3469

.seq_show = cgroup_pidlist_show,

3471

.seq_show = cgroup_pidlist_show,

3470

.private = CGROUP_FILE_TASKS,

3472

.private = CGROUP_FILE_TASKS,

3471

.write_u64 = cgroup_tasks_write,

3473

.write_u64 = cgroup_tasks_write,

3472

.mode = S_IRUGO | S_IWUSR,

3474

.mode = S_IRUGO | S_IWUSR,

3473

},

3475

},

3474

{

3476

{

3475

.name = "notify_on_release",

3477

.name = "notify_on_release",

3476

.flags = CFTYPE_INSANE,

3478

.flags = CFTYPE_INSANE,

3477

.read_u64 = cgroup_read_notify_on_release,

3479

.read_u64 = cgroup_read_notify_on_release,

3478

.write_u64 = cgroup_write_notify_on_release,

3480

.write_u64 = cgroup_write_notify_on_release,

3479

},

3481

},

3480

{

3482

{

3481

.name = "release_agent",

3483

.name = "release_agent",

3482

.flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,

3484

.flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,

3483

.seq_show = cgroup_release_agent_show,

3485

.seq_show = cgroup_release_agent_show,

3484

.write_string = cgroup_release_agent_write,

3486

.write_string = cgroup_release_agent_write,

3485

.max_write_len = PATH_MAX - 1,

3487

.max_write_len = PATH_MAX - 1,

3486

},

3488

},

3487

{ } /* terminate */

3489

{ } /* terminate */

3488

};

3490

};

3489

3491

3490

/**

3492

/**

3491

* cgroup_populate_dir - create subsys files in a cgroup directory

3493

* cgroup_populate_dir - create subsys files in a cgroup directory

3492

* @cgrp: target cgroup

3494

* @cgrp: target cgroup

3493

* @subsys_mask: mask of the subsystem ids whose files should be added

3495

* @subsys_mask: mask of the subsystem ids whose files should be added

3494

*

3496

*

3495

* On failure, no file is added.

3497

* On failure, no file is added.

3496

*/

3498

*/

3497

static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)

3499

static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)

3498

{

3500

{

3499

struct cgroup_subsys *ss;

3501

struct cgroup_subsys *ss;

3500

int i, ret = 0;

3502

int i, ret = 0;

3501

3503

3502

/* process cftsets of each subsystem */

3504

/* process cftsets of each subsystem */

3503

for_each_subsys(ss, i) {

3505

for_each_subsys(ss, i) {

3504

struct cftype *cfts;

3506

struct cftype *cfts;

3505

3507

3506

if (!test_bit(i, &subsys_mask))

3508

if (!test_bit(i, &subsys_mask))

3507

continue;

3509

continue;

3508

3510

3509

list_for_each_entry(cfts, &ss->cfts, node) {

3511

list_for_each_entry(cfts, &ss->cfts, node) {

3510

ret = cgroup_addrm_files(cgrp, cfts, true);

3512

ret = cgroup_addrm_files(cgrp, cfts, true);

3511

if (ret < 0)

3513

if (ret < 0)

3512

goto err;

3514

goto err;

3513

}

3515

}

3514

}

3516

}

3515

return 0;

3517

return 0;

3516

err:

3518

err:

3517

cgroup_clear_dir(cgrp, subsys_mask);

3519

cgroup_clear_dir(cgrp, subsys_mask);

3518

return ret;

3520

return ret;

3519

}

3521

}

3520

3522

3521

/*

3523

/*

3522

* css destruction is four-stage process.

3524

* css destruction is four-stage process.

3523

*

3525

*

3524

* 1. Destruction starts. Killing of the percpu_ref is initiated.

3526

* 1. Destruction starts. Killing of the percpu_ref is initiated.

3525

* Implemented in kill_css().

3527

* Implemented in kill_css().

3526

*

3528

*

3527

* 2. When the percpu_ref is confirmed to be visible as killed on all CPUs

3529

* 2. When the percpu_ref is confirmed to be visible as killed on all CPUs

3528

* and thus css_tryget() is guaranteed to fail, the css can be offlined

3530

* and thus css_tryget() is guaranteed to fail, the css can be offlined

3529

* by invoking offline_css(). After offlining, the base ref is put.

3531

* by invoking offline_css(). After offlining, the base ref is put.

3530

* Implemented in css_killed_work_fn().

3532

* Implemented in css_killed_work_fn().

3531

*

3533

*

3532

* 3. When the percpu_ref reaches zero, the only possible remaining

3534

* 3. When the percpu_ref reaches zero, the only possible remaining

3533

* accessors are inside RCU read sections. css_release() schedules the

3535

* accessors are inside RCU read sections. css_release() schedules the

3534

* RCU callback.

3536

* RCU callback.

3535

*

3537

*

3536

* 4. After the grace period, the css can be freed. Implemented in

3538

* 4. After the grace period, the css can be freed. Implemented in

3537

* css_free_work_fn().

3539

* css_free_work_fn().

3538

*

3540

*

3539

* It is actually hairier because both step 2 and 4 require process context

3541

* It is actually hairier because both step 2 and 4 require process context

3540

* and thus involve punting to css->destroy_work adding two additional

3542

* and thus involve punting to css->destroy_work adding two additional

3541

* steps to the already complex sequence.

3543

* steps to the already complex sequence.

3542

*/

3544

*/

3543

static void css_free_work_fn(struct work_struct *work)

3545

static void css_free_work_fn(struct work_struct *work)

3544

{

3546

{

3545

struct cgroup_subsys_state *css =

3547

struct cgroup_subsys_state *css =

3546

container_of(work, struct cgroup_subsys_state, destroy_work);

3548

container_of(work, struct cgroup_subsys_state, destroy_work);

3547

struct cgroup *cgrp = css->cgroup;

3549

struct cgroup *cgrp = css->cgroup;

3548

3550

3549

if (css->parent)

3551

if (css->parent)

3550

css_put(css->parent);

3552

css_put(css->parent);

3551

3553

3552

css->ss->css_free(css);

3554

css->ss->css_free(css);

3553

cgroup_put(cgrp);

3555

cgroup_put(cgrp);

3554

}

3556

}

3555

3557

3556

static void css_free_rcu_fn(struct rcu_head *rcu_head)

3558

static void css_free_rcu_fn(struct rcu_head *rcu_head)

3557

{

3559

{

3558

struct cgroup_subsys_state *css =

3560

struct cgroup_subsys_state *css =

3559

container_of(rcu_head, struct cgroup_subsys_state, rcu_head);

3561

container_of(rcu_head, struct cgroup_subsys_state, rcu_head);

3560

3562

3561

INIT_WORK(&css->destroy_work, css_free_work_fn);

3563

INIT_WORK(&css->destroy_work, css_free_work_fn);

3562

queue_work(cgroup_destroy_wq, &css->destroy_work);

3564

queue_work(cgroup_destroy_wq, &css->destroy_work);

3563

}

3565

}

3564

3566

3565

static void css_release(struct percpu_ref *ref)

3567

static void css_release(struct percpu_ref *ref)

3566

{

3568

{

3567

struct cgroup_subsys_state *css =

3569

struct cgroup_subsys_state *css =

3568

container_of(ref, struct cgroup_subsys_state, refcnt);

3570

container_of(ref, struct cgroup_subsys_state, refcnt);

3569

3571

3570

RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL);

3572

RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL);

3571

call_rcu(&css->rcu_head, css_free_rcu_fn);

3573

call_rcu(&css->rcu_head, css_free_rcu_fn);

3572

}

3574

}

3573

3575

3574

static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,

3576

static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,

3575

struct cgroup *cgrp)

3577

struct cgroup *cgrp)

3576

{

3578

{

3577

css->cgroup = cgrp;

3579

css->cgroup = cgrp;

3578

css->ss = ss;

3580

css->ss = ss;

3579

css->flags = 0;

3581

css->flags = 0;

3580

3582

3581

if (cgrp->parent)

3583

if (cgrp->parent)

3582

css->parent = cgroup_css(cgrp->parent, ss);

3584

css->parent = cgroup_css(cgrp->parent, ss);

3583

else

3585

else

3584

css->flags |= CSS_ROOT;

3586

css->flags |= CSS_ROOT;

3585

3587

3586

BUG_ON(cgroup_css(cgrp, ss));

3588

BUG_ON(cgroup_css(cgrp, ss));

3587

}

3589

}

3588

3590

3589

/* invoke ->css_online() on a new CSS and mark it online if successful */

3591

/* invoke ->css_online() on a new CSS and mark it online if successful */

3590

static int online_css(struct cgroup_subsys_state *css)

3592

static int online_css(struct cgroup_subsys_state *css)

3591

{

3593

{

3592

struct cgroup_subsys *ss = css->ss;

3594

struct cgroup_subsys *ss = css->ss;

3593

int ret = 0;

3595

int ret = 0;

3594

3596

3595

lockdep_assert_held(&cgroup_tree_mutex);

3597

lockdep_assert_held(&cgroup_tree_mutex);

3596

lockdep_assert_held(&cgroup_mutex);

3598

lockdep_assert_held(&cgroup_mutex);

3597

3599

3598

if (ss->css_online)

3600

if (ss->css_online)

3599

ret = ss->css_online(css);

3601

ret = ss->css_online(css);

3600

if (!ret) {

3602

if (!ret) {

3601

css->flags |= CSS_ONLINE;

3603

css->flags |= CSS_ONLINE;

3602

css->cgroup->nr_css++;

3604

css->cgroup->nr_css++;

3603

rcu_assign_pointer(css->cgroup->subsys[ss->id], css);

3605

rcu_assign_pointer(css->cgroup->subsys[ss->id], css);

3604

}

3606

}

3605

return ret;

3607

return ret;

3606

}

3608

}

3607

3609

3608

/* if the CSS is online, invoke ->css_offline() on it and mark it offline */

3610

/* if the CSS is online, invoke ->css_offline() on it and mark it offline */

3609

static void offline_css(struct cgroup_subsys_state *css)

3611

static void offline_css(struct cgroup_subsys_state *css)

3610

{

3612

{

3611

struct cgroup_subsys *ss = css->ss;

3613

struct cgroup_subsys *ss = css->ss;

3612

3614

3613

lockdep_assert_held(&cgroup_tree_mutex);

3615

lockdep_assert_held(&cgroup_tree_mutex);

3614

lockdep_assert_held(&cgroup_mutex);

3616

lockdep_assert_held(&cgroup_mutex);

3615

3617

3616

if (!(css->flags & CSS_ONLINE))

3618

if (!(css->flags & CSS_ONLINE))

3617

return;

3619

return;

3618

3620

3619

if (ss->css_offline)

3621

if (ss->css_offline)

3620

ss->css_offline(css);

3622

ss->css_offline(css);

3621

3623

3622

css->flags &= ~CSS_ONLINE;

3624

css->flags &= ~CSS_ONLINE;

3623

css->cgroup->nr_css--;

3625

css->cgroup->nr_css--;

3624

RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css);

3626

RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css);

3625

}

3627

}

3626

3628

3627

/**

3629

/**

3628

* create_css - create a cgroup_subsys_state

3630

* create_css - create a cgroup_subsys_state

3629

* @cgrp: the cgroup new css will be associated with

3631

* @cgrp: the cgroup new css will be associated with

3630

* @ss: the subsys of new css

3632

* @ss: the subsys of new css

3631

*

3633

*

3632

* Create a new css associated with @cgrp - @ss pair. On success, the new

3634

* Create a new css associated with @cgrp - @ss pair. On success, the new

3633

* css is online and installed in @cgrp with all interface files created.

3635

* css is online and installed in @cgrp with all interface files created.

3634

* Returns 0 on success, -errno on failure.

3636

* Returns 0 on success, -errno on failure.

3635

*/

3637

*/

3636

static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)

3638

static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)

3637

{

3639

{

3638

struct cgroup *parent = cgrp->parent;

3640

struct cgroup *parent = cgrp->parent;

3639

struct cgroup_subsys_state *css;

3641

struct cgroup_subsys_state *css;

3640

int err;

3642

int err;

3641

3643

3642

lockdep_assert_held(&cgroup_mutex);

3644

lockdep_assert_held(&cgroup_mutex);

3643

3645

3644

css = ss->css_alloc(cgroup_css(parent, ss));

3646

css = ss->css_alloc(cgroup_css(parent, ss));

3645

if (IS_ERR(css))

3647

if (IS_ERR(css))

3646

return PTR_ERR(css);

3648

return PTR_ERR(css);

3647

3649

3648

err = percpu_ref_init(&css->refcnt, css_release);

3650

err = percpu_ref_init(&css->refcnt, css_release);

3649

if (err)

3651

if (err)

3650

goto err_free_css;

3652

goto err_free_css;

3651

3653

3652

init_css(css, ss, cgrp);

3654

init_css(css, ss, cgrp);

3653

3655

3654

err = cgroup_populate_dir(cgrp, 1 << ss->id);

3656

err = cgroup_populate_dir(cgrp, 1 << ss->id);

3655

if (err)

3657

if (err)

3656

goto err_free_percpu_ref;

3658

goto err_free_percpu_ref;

3657

3659

3658

err = online_css(css);

3660

err = online_css(css);

3659

if (err)

3661

if (err)

3660

goto err_clear_dir;

3662

goto err_clear_dir;

3661

3663

3662

cgroup_get(cgrp);

3664

cgroup_get(cgrp);

3663

css_get(css->parent);

3665

css_get(css->parent);

3664

3666

3665

cgrp->subsys_mask |= 1 << ss->id;

3667

cgrp->subsys_mask |= 1 << ss->id;

3666

3668

3667

if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&

3669

if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&

3668

parent->parent) {

3670

parent->parent) {

3669

pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",

3671

pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",

3670

current->comm, current->pid, ss->name);

3672

current->comm, current->pid, ss->name);

3671

if (!strcmp(ss->name, "memory"))

3673

if (!strcmp(ss->name, "memory"))

3672

pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");

3674

pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");

3673

ss->warned_broken_hierarchy = true;

3675

ss->warned_broken_hierarchy = true;

3674

}

3676

}

3675

3677

3676

return 0;

3678

return 0;

3677

3679

3678

err_clear_dir:

3680

err_clear_dir:

3679

cgroup_clear_dir(css->cgroup, 1 << css->ss->id);

3681

cgroup_clear_dir(css->cgroup, 1 << css->ss->id);

3680

err_free_percpu_ref:

3682

err_free_percpu_ref:

3681

percpu_ref_cancel_init(&css->refcnt);

3683

percpu_ref_cancel_init(&css->refcnt);

3682

err_free_css:

3684

err_free_css:

3683

ss->css_free(css);

3685

ss->css_free(css);

3684

return err;

3686

return err;

3685

}

3687

}

3686

3688

3687

/**

3689

/**

3688

* cgroup_create - create a cgroup

3690

* cgroup_create - create a cgroup

3689

* @parent: cgroup that will be parent of the new cgroup

3691

* @parent: cgroup that will be parent of the new cgroup

3690

* @name: name of the new cgroup

3692

* @name: name of the new cgroup

3691

* @mode: mode to set on new cgroup

3693

* @mode: mode to set on new cgroup

3692

*/

3694

*/

3693

static long cgroup_create(struct cgroup *parent, const char *name,

3695

static long cgroup_create(struct cgroup *parent, const char *name,

3694

umode_t mode)

3696

umode_t mode)

3695

{

3697

{

3696

struct cgroup *cgrp;

3698

struct cgroup *cgrp;

3697

struct cgroup_root *root = parent->root;

3699

struct cgroup_root *root = parent->root;

3698

int ssid, err;

3700

int ssid, err;

3699

struct cgroup_subsys *ss;

3701

struct cgroup_subsys *ss;

3700

struct kernfs_node *kn;

3702

struct kernfs_node *kn;

3701

3703

3702

/*

3704

/*

3703

* XXX: The default hierarchy isn't fully implemented yet. Block

3705

* XXX: The default hierarchy isn't fully implemented yet. Block

3704

* !root cgroup creation on it for now.

3706

* !root cgroup creation on it for now.

3705

*/

3707

*/

3706

if (root == &cgrp_dfl_root)

3708

if (root == &cgrp_dfl_root)

3707

return -EINVAL;

3709

return -EINVAL;

3708

3710

3709

/* allocate the cgroup and its ID, 0 is reserved for the root */

3711

/* allocate the cgroup and its ID, 0 is reserved for the root */

3710

cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);

3712

cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);

3711

if (!cgrp)

3713

if (!cgrp)

3712

return -ENOMEM;

3714

return -ENOMEM;

3713

3715

3714

mutex_lock(&cgroup_tree_mutex);

3716

mutex_lock(&cgroup_tree_mutex);

3715

3717

3716

/*

3718

/*

3717

* Only live parents can have children. Note that the liveliness

3719

* Only live parents can have children. Note that the liveliness

3718

* check isn't strictly necessary because cgroup_mkdir() and

3720

* check isn't strictly necessary because cgroup_mkdir() and

3719

* cgroup_rmdir() are fully synchronized by i_mutex; however, do it

3721

* cgroup_rmdir() are fully synchronized by i_mutex; however, do it

3720

* anyway so that locking is contained inside cgroup proper and we

3722

* anyway so that locking is contained inside cgroup proper and we

3721

* don't get nasty surprises if we ever grow another caller.

3723

* don't get nasty surprises if we ever grow another caller.

3722

*/

3724

*/

3723

if (!cgroup_lock_live_group(parent)) {

3725

if (!cgroup_lock_live_group(parent)) {

3724

err = -ENODEV;

3726

err = -ENODEV;

3725

goto err_unlock_tree;

3727

goto err_unlock_tree;

3726

}

3728

}

3727

3729

3728

/*

3730

/*

3729

* Temporarily set the pointer to NULL, so idr_find() won't return

3731

* Temporarily set the pointer to NULL, so idr_find() won't return

3730

* a half-baked cgroup.

3732

* a half-baked cgroup.

3731

*/

3733

*/

3732

cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);

3734

cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);

3733

if (cgrp->id < 0) {

3735

if (cgrp->id < 0) {

3734

err = -ENOMEM;

3736

err = -ENOMEM;

3735

goto err_unlock;

3737

goto err_unlock;

3736

}

3738

}

3737

3739

3738

init_cgroup_housekeeping(cgrp);

3740

init_cgroup_housekeeping(cgrp);

3739

3741

3740

cgrp->parent = parent;

3742

cgrp->parent = parent;

3741

cgrp->dummy_css.parent = &parent->dummy_css;

3743

cgrp->dummy_css.parent = &parent->dummy_css;

3742

cgrp->root = parent->root;

3744

cgrp->root = parent->root;

3743

3745

3744

if (notify_on_release(parent))

3746

if (notify_on_release(parent))

3745

set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

3747

set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

3746

3748

3747

if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))

3749

if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))

3748

set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);

3750

set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);

3749

3751

3750

/* create the directory */

3752

/* create the directory */

3751

kn = kernfs_create_dir(parent->kn, name, mode, cgrp);

3753

kn = kernfs_create_dir(parent->kn, name, mode, cgrp);

3752

if (IS_ERR(kn)) {

3754

if (IS_ERR(kn)) {

3753

err = PTR_ERR(kn);

3755

err = PTR_ERR(kn);

3754

goto err_free_id;

3756

goto err_free_id;

3755

}

3757

}

3756

cgrp->kn = kn;

3758

cgrp->kn = kn;

3757

3759

3758

/*

3760

/*

3759

* This extra ref will be put in cgroup_free_fn() and guarantees

3761

* This extra ref will be put in cgroup_free_fn() and guarantees

3760

* that @cgrp->kn is always accessible.

3762

* that @cgrp->kn is always accessible.

3761

*/

3763

*/

3762

kernfs_get(kn);

3764

kernfs_get(kn);

3763

3765

3764

cgrp->serial_nr = cgroup_serial_nr_next++;

3766

cgrp->serial_nr = cgroup_serial_nr_next++;

3765

3767

3766

/* allocation complete, commit to creation */

3768

/* allocation complete, commit to creation */

3767

list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);

3769

list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);

3768

atomic_inc(&root->nr_cgrps);

3770

atomic_inc(&root->nr_cgrps);

3769

cgroup_get(parent);

3771

cgroup_get(parent);

3770

3772

3771

/*

3773

/*

3772

* @cgrp is now fully operational. If something fails after this

3774

* @cgrp is now fully operational. If something fails after this

3773

* point, it'll be released via the normal destruction path.

3775

* point, it'll be released via the normal destruction path.

3774

*/

3776

*/

3775

idr_replace(&root->cgroup_idr, cgrp, cgrp->id);

3777

idr_replace(&root->cgroup_idr, cgrp, cgrp->id);

3776

3778

3777

err = cgroup_kn_set_ugid(kn);

3779

err = cgroup_kn_set_ugid(kn);

3778

if (err)

3780

if (err)

3779

goto err_destroy;

3781

goto err_destroy;

3780

3782

3781

err = cgroup_addrm_files(cgrp, cgroup_base_files, true);

3783

err = cgroup_addrm_files(cgrp, cgroup_base_files, true);

3782

if (err)

3784

if (err)

3783

goto err_destroy;

3785

goto err_destroy;

3784

3786

3785

/* let's create and online css's */

3787

/* let's create and online css's */

3786

for_each_subsys(ss, ssid) {

3788

for_each_subsys(ss, ssid) {

3787

if (root->cgrp.subsys_mask & (1 << ssid)) {

3789

if (root->cgrp.subsys_mask & (1 << ssid)) {

3788

err = create_css(cgrp, ss);

3790

err = create_css(cgrp, ss);

3789

if (err)

3791

if (err)

3790

goto err_destroy;

3792

goto err_destroy;

3791

}

3793

}

3792

}

3794

}

3793

3795

3794

kernfs_activate(kn);

3796

kernfs_activate(kn);

3795

3797

3796

mutex_unlock(&cgroup_mutex);

3798

mutex_unlock(&cgroup_mutex);

3797

mutex_unlock(&cgroup_tree_mutex);

3799

mutex_unlock(&cgroup_tree_mutex);

3798

3800

3799

return 0;

3801

return 0;

3800

3802

3801

err_free_id:

3803

err_free_id:

3802

idr_remove(&root->cgroup_idr, cgrp->id);

3804

idr_remove(&root->cgroup_idr, cgrp->id);

3803

err_unlock:

3805

err_unlock:

3804

mutex_unlock(&cgroup_mutex);

3806

mutex_unlock(&cgroup_mutex);

3805

err_unlock_tree:

3807

err_unlock_tree:

3806

mutex_unlock(&cgroup_tree_mutex);

3808

mutex_unlock(&cgroup_tree_mutex);

3807

kfree(cgrp);

3809

kfree(cgrp);

3808

return err;

3810

return err;

3809

3811

3810

err_destroy:

3812

err_destroy:

3811

cgroup_destroy_locked(cgrp);

3813

cgroup_destroy_locked(cgrp);

3812

mutex_unlock(&cgroup_mutex);

3814

mutex_unlock(&cgroup_mutex);

3813

mutex_unlock(&cgroup_tree_mutex);

3815

mutex_unlock(&cgroup_tree_mutex);

3814

return err;

3816

return err;

3815

}

3817

}

3816

3818

3817

static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,

3819

static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,

3818

umode_t mode)

3820

umode_t mode)

3819

{

3821

{

3820

struct cgroup *parent = parent_kn->priv;

3822

struct cgroup *parent = parent_kn->priv;

3821

int ret;

3823

int ret;

3822

3824

3823

/*

3825

/*

3824

* cgroup_create() grabs cgroup_tree_mutex which nests outside

3826

* cgroup_create() grabs cgroup_tree_mutex which nests outside

3825

* kernfs active_ref and cgroup_create() already synchronizes

3827

* kernfs active_ref and cgroup_create() already synchronizes

3826

* properly against removal through cgroup_lock_live_group().

3828

* properly against removal through cgroup_lock_live_group().

3827

* Break it before calling cgroup_create().

3829

* Break it before calling cgroup_create().

3828

*/

3830

*/

3829

cgroup_get(parent);

3831

cgroup_get(parent);

3830

kernfs_break_active_protection(parent_kn);

3832

kernfs_break_active_protection(parent_kn);

3831

3833

3832

ret = cgroup_create(parent, name, mode);

3834

ret = cgroup_create(parent, name, mode);

3833

3835

3834

kernfs_unbreak_active_protection(parent_kn);

3836

kernfs_unbreak_active_protection(parent_kn);

3835

cgroup_put(parent);

3837

cgroup_put(parent);

3836

return ret;

3838

return ret;

3837

}

3839

}

3838

3840

3839

/*

3841

/*

3840

* This is called when the refcnt of a css is confirmed to be killed.

3842

* This is called when the refcnt of a css is confirmed to be killed.

3841

* css_tryget() is now guaranteed to fail.

3843

* css_tryget() is now guaranteed to fail.

3842

*/

3844

*/

3843

static void css_killed_work_fn(struct work_struct *work)

3845

static void css_killed_work_fn(struct work_struct *work)

3844

{

3846

{

3845

struct cgroup_subsys_state *css =

3847

struct cgroup_subsys_state *css =

3846

container_of(work, struct cgroup_subsys_state, destroy_work);

3848

container_of(work, struct cgroup_subsys_state, destroy_work);

3847

struct cgroup *cgrp = css->cgroup;

3849

struct cgroup *cgrp = css->cgroup;

3848

3850

3849

mutex_lock(&cgroup_tree_mutex);

3851

mutex_lock(&cgroup_tree_mutex);

3850

mutex_lock(&cgroup_mutex);

3852

mutex_lock(&cgroup_mutex);

3851

3853

3852

/*

3854

/*

3853

* css_tryget() is guaranteed to fail now. Tell subsystems to

3855

* css_tryget() is guaranteed to fail now. Tell subsystems to

3854

* initate destruction.

3856

* initate destruction.

3855

*/

3857

*/

3856

offline_css(css);

3858

offline_css(css);

3857

3859

3858

/*

3860

/*

3859

* If @cgrp is marked dead, it's waiting for refs of all css's to

3861

* If @cgrp is marked dead, it's waiting for refs of all css's to

3860

* be disabled before proceeding to the second phase of cgroup

3862

* be disabled before proceeding to the second phase of cgroup

3861

* destruction. If we are the last one, kick it off.

3863

* destruction. If we are the last one, kick it off.

3862

*/

3864

*/

3863

if (!cgrp->nr_css && cgroup_is_dead(cgrp))

3865

if (!cgrp->nr_css && cgroup_is_dead(cgrp))

3864

cgroup_destroy_css_killed(cgrp);

3866

cgroup_destroy_css_killed(cgrp);

3865

3867

3866

mutex_unlock(&cgroup_mutex);

3868

mutex_unlock(&cgroup_mutex);

3867

mutex_unlock(&cgroup_tree_mutex);

3869

mutex_unlock(&cgroup_tree_mutex);

3868

3870

3869

/*

3871

/*

3870

* Put the css refs from kill_css(). Each css holds an extra

3872

* Put the css refs from kill_css(). Each css holds an extra

3871

* reference to the cgroup's dentry and cgroup removal proceeds

3873

* reference to the cgroup's dentry and cgroup removal proceeds

3872

* regardless of css refs. On the last put of each css, whenever

3874

* regardless of css refs. On the last put of each css, whenever

3873

* that may be, the extra dentry ref is put so that dentry

3875

* that may be, the extra dentry ref is put so that dentry

3874

* destruction happens only after all css's are released.

3876

* destruction happens only after all css's are released.

3875

*/

3877

*/

3876

css_put(css);

3878

css_put(css);

3877

}

3879

}

3878

3880

3879

/* css kill confirmation processing requires process context, bounce */

3881

/* css kill confirmation processing requires process context, bounce */

3880

static void css_killed_ref_fn(struct percpu_ref *ref)

3882

static void css_killed_ref_fn(struct percpu_ref *ref)

3881

{

3883

{

3882

struct cgroup_subsys_state *css =

3884

struct cgroup_subsys_state *css =

3883

container_of(ref, struct cgroup_subsys_state, refcnt);

3885

container_of(ref, struct cgroup_subsys_state, refcnt);

3884

3886

3885

INIT_WORK(&css->destroy_work, css_killed_work_fn);

3887

INIT_WORK(&css->destroy_work, css_killed_work_fn);

3886

queue_work(cgroup_destroy_wq, &css->destroy_work);

3888

queue_work(cgroup_destroy_wq, &css->destroy_work);

3887

}

3889

}

3888

3890

3889

static void __kill_css(struct cgroup_subsys_state *css)

3891

static void __kill_css(struct cgroup_subsys_state *css)

3890

{

3892

{

3891

lockdep_assert_held(&cgroup_tree_mutex);

3893

lockdep_assert_held(&cgroup_tree_mutex);

3892

3894

3893

/*

3895

/*

3894

* This must happen before css is disassociated with its cgroup.

3896

* This must happen before css is disassociated with its cgroup.

3895

* See seq_css() for details.

3897

* See seq_css() for details.

3896

*/

3898

*/

3897

cgroup_clear_dir(css->cgroup, 1 << css->ss->id);

3899

cgroup_clear_dir(css->cgroup, 1 << css->ss->id);

3898

3900

3899

/*

3901

/*

3900

* Killing would put the base ref, but we need to keep it alive

3902

* Killing would put the base ref, but we need to keep it alive

3901

* until after ->css_offline().

3903

* until after ->css_offline().

3902

*/

3904

*/

3903

css_get(css);

3905

css_get(css);

3904

3906

3905

/*

3907

/*

3906

* cgroup core guarantees that, by the time ->css_offline() is

3908

* cgroup core guarantees that, by the time ->css_offline() is

3907

* invoked, no new css reference will be given out via

3909

* invoked, no new css reference will be given out via

3908

* css_tryget(). We can't simply call percpu_ref_kill() and

3910

* css_tryget(). We can't simply call percpu_ref_kill() and

3909

* proceed to offlining css's because percpu_ref_kill() doesn't

3911

* proceed to offlining css's because percpu_ref_kill() doesn't

3910

* guarantee that the ref is seen as killed on all CPUs on return.

3912

* guarantee that the ref is seen as killed on all CPUs on return.

3911

*

3913

*

3912

* Use percpu_ref_kill_and_confirm() to get notifications as each

3914

* Use percpu_ref_kill_and_confirm() to get notifications as each

3913

* css is confirmed to be seen as killed on all CPUs.

3915

* css is confirmed to be seen as killed on all CPUs.

3914

*/

3916

*/

3915

percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);

3917

percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);

3916

}

3918

}

3917

3919

3918

/**

3920

/**

3919

* kill_css - destroy a css

3921

* kill_css - destroy a css

3920

* @css: css to destroy

3922

* @css: css to destroy

3921

*

3923

*

3922

* This function initiates destruction of @css by removing cgroup interface

3924

* This function initiates destruction of @css by removing cgroup interface

3923

* files and putting its base reference. ->css_offline() will be invoked

3925

* files and putting its base reference. ->css_offline() will be invoked

3924

* asynchronously once css_tryget() is guaranteed to fail and when the

3926

* asynchronously once css_tryget() is guaranteed to fail and when the

3925

* reference count reaches zero, @css will be released.

3927

* reference count reaches zero, @css will be released.

3926

*/

3928

*/

3927

static void kill_css(struct cgroup_subsys_state *css)

3929

static void kill_css(struct cgroup_subsys_state *css)

3928

{

3930

{

3929

struct cgroup *cgrp = css->cgroup;

3931

struct cgroup *cgrp = css->cgroup;

3930

3932

3931

lockdep_assert_held(&cgroup_tree_mutex);

3933

lockdep_assert_held(&cgroup_tree_mutex);

3932

3934

3933

/* if already killed, noop */

3935

/* if already killed, noop */

3934

if (cgrp->subsys_mask & (1 << css->ss->id)) {

3936

if (cgrp->subsys_mask & (1 << css->ss->id)) {

3935

cgrp->subsys_mask &= ~(1 << css->ss->id);

3937

cgrp->subsys_mask &= ~(1 << css->ss->id);

3936

__kill_css(css);

3938

__kill_css(css);

3937

}

3939

}

3938

}

3940

}

3939

3941

3940

/**

3942

/**

3941

* cgroup_destroy_locked - the first stage of cgroup destruction

3943

* cgroup_destroy_locked - the first stage of cgroup destruction

3942

* @cgrp: cgroup to be destroyed

3944

* @cgrp: cgroup to be destroyed

3943

*

3945

*

3944

* css's make use of percpu refcnts whose killing latency shouldn't be

3946

* css's make use of percpu refcnts whose killing latency shouldn't be

3945

* exposed to userland and are RCU protected. Also, cgroup core needs to

3947

* exposed to userland and are RCU protected. Also, cgroup core needs to

3946

* guarantee that css_tryget() won't succeed by the time ->css_offline() is

3948

* guarantee that css_tryget() won't succeed by the time ->css_offline() is

3947

* invoked. To satisfy all the requirements, destruction is implemented in

3949

* invoked. To satisfy all the requirements, destruction is implemented in

3948

* the following two steps.

3950

* the following two steps.

3949

*

3951

*

3950

* s1. Verify @cgrp can be destroyed and mark it dying. Remove all

3952

* s1. Verify @cgrp can be destroyed and mark it dying. Remove all

3951

* userland visible parts and start killing the percpu refcnts of

3953

* userland visible parts and start killing the percpu refcnts of

3952

* css's. Set up so that the next stage will be kicked off once all

3954

* css's. Set up so that the next stage will be kicked off once all

3953

* the percpu refcnts are confirmed to be killed.

3955

* the percpu refcnts are confirmed to be killed.

3954

*

3956

*

3955

* s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the

3957

* s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the

3956

* rest of destruction. Once all cgroup references are gone, the

3958

* rest of destruction. Once all cgroup references are gone, the

3957

* cgroup is RCU-freed.

3959

* cgroup is RCU-freed.

3958

*

3960

*

3959

* This function implements s1. After this step, @cgrp is gone as far as

3961

* This function implements s1. After this step, @cgrp is gone as far as

3960

* the userland is concerned and a new cgroup with the same name may be

3962

* the userland is concerned and a new cgroup with the same name may be

3961

* created. As cgroup doesn't care about the names internally, this

3963

* created. As cgroup doesn't care about the names internally, this

3962

* doesn't cause any problem.

3964

* doesn't cause any problem.

3963

*/

3965

*/

3964

static int cgroup_destroy_locked(struct cgroup *cgrp)

3966

static int cgroup_destroy_locked(struct cgroup *cgrp)

3965

__releases(&cgroup_mutex) __acquires(&cgroup_mutex)

3967

__releases(&cgroup_mutex) __acquires(&cgroup_mutex)

3966

{

3968

{

3967

struct cgroup *child;

3969

struct cgroup *child;

3968

struct cgroup_subsys_state *css;

3970

struct cgroup_subsys_state *css;

3969

bool empty;

3971

bool empty;

3970

int ssid;

3972

int ssid;

3971

3973

3972

lockdep_assert_held(&cgroup_tree_mutex);

3974

lockdep_assert_held(&cgroup_tree_mutex);

3973

lockdep_assert_held(&cgroup_mutex);

3975

lockdep_assert_held(&cgroup_mutex);

3974

3976

3975

/*

3977

/*

3976

* css_set_rwsem synchronizes access to ->cset_links and prevents

3978

* css_set_rwsem synchronizes access to ->cset_links and prevents

3977

* @cgrp from being removed while put_css_set() is in progress.

3979

* @cgrp from being removed while put_css_set() is in progress.

3978

*/

3980

*/

3979

down_read(&css_set_rwsem);

3981

down_read(&css_set_rwsem);

3980

empty = list_empty(&cgrp->cset_links);

3982

empty = list_empty(&cgrp->cset_links);

3981

up_read(&css_set_rwsem);

3983

up_read(&css_set_rwsem);

3982

if (!empty)

3984

if (!empty)

3983

return -EBUSY;

3985

return -EBUSY;

3984

3986

3985

/*

3987

/*

3986

* Make sure there's no live children. We can't test ->children

3988

* Make sure there's no live children. We can't test ->children

3987

* emptiness as dead children linger on it while being destroyed;

3989

* emptiness as dead children linger on it while being destroyed;

3988

* otherwise, "rmdir parent/child parent" may fail with -EBUSY.

3990

* otherwise, "rmdir parent/child parent" may fail with -EBUSY.

3989

*/

3991

*/

3990

empty = true;

3992

empty = true;

3991

rcu_read_lock();

3993

rcu_read_lock();

3992

list_for_each_entry_rcu(child, &cgrp->children, sibling) {

3994

list_for_each_entry_rcu(child, &cgrp->children, sibling) {

3993

empty = cgroup_is_dead(child);

3995

empty = cgroup_is_dead(child);

3994

if (!empty)

3996

if (!empty)

3995

break;

3997

break;

3996

}

3998

}

3997

rcu_read_unlock();

3999

rcu_read_unlock();

3998

if (!empty)

4000

if (!empty)

3999

return -EBUSY;

4001

return -EBUSY;

4000

4002

4001

/*

4003

/*

4002

* Mark @cgrp dead. This prevents further task migration and child

4004

* Mark @cgrp dead. This prevents further task migration and child

4003

* creation by disabling cgroup_lock_live_group(). Note that

4005

* creation by disabling cgroup_lock_live_group(). Note that

4004

* CGRP_DEAD assertion is depended upon by css_next_child() to

4006

* CGRP_DEAD assertion is depended upon by css_next_child() to

4005

* resume iteration after dropping RCU read lock. See

4007

* resume iteration after dropping RCU read lock. See

4006

* css_next_child() for details.

4008

* css_next_child() for details.

4007

*/

4009

*/

4008

set_bit(CGRP_DEAD, &cgrp->flags);

4010

set_bit(CGRP_DEAD, &cgrp->flags);

4009

4011

4010

/*

4012

/*

4011

* Initiate massacre of all css's. cgroup_destroy_css_killed()

4013

* Initiate massacre of all css's. cgroup_destroy_css_killed()

4012

* will be invoked to perform the rest of destruction once the

4014

* will be invoked to perform the rest of destruction once the

4013

* percpu refs of all css's are confirmed to be killed. This

4015

* percpu refs of all css's are confirmed to be killed. This

4014

* involves removing the subsystem's files, drop cgroup_mutex.

4016

* involves removing the subsystem's files, drop cgroup_mutex.

4015

*/

4017

*/

4016

mutex_unlock(&cgroup_mutex);

4018

mutex_unlock(&cgroup_mutex);

4017

for_each_css(css, ssid, cgrp)

4019

for_each_css(css, ssid, cgrp)

4018

kill_css(css);

4020

kill_css(css);

4019

mutex_lock(&cgroup_mutex);

4021

mutex_lock(&cgroup_mutex);

4020

4022

4021

/* CGRP_DEAD is set, remove from ->release_list for the last time */

4023

/* CGRP_DEAD is set, remove from ->release_list for the last time */

4022

raw_spin_lock(&release_list_lock);

4024

raw_spin_lock(&release_list_lock);

4023

if (!list_empty(&cgrp->release_list))

4025

if (!list_empty(&cgrp->release_list))

4024

list_del_init(&cgrp->release_list);

4026

list_del_init(&cgrp->release_list);

4025

raw_spin_unlock(&release_list_lock);

4027

raw_spin_unlock(&release_list_lock);

4026

4028

4027

/*

4029

/*

4028

* If @cgrp has css's attached, the second stage of cgroup

4030

* If @cgrp has css's attached, the second stage of cgroup

4029

* destruction is kicked off from css_killed_work_fn() after the

4031

* destruction is kicked off from css_killed_work_fn() after the

4030

* refs of all attached css's are killed. If @cgrp doesn't have

4032

* refs of all attached css's are killed. If @cgrp doesn't have

4031

* any css, we kick it off here.

4033

* any css, we kick it off here.

4032

*/

4034

*/

4033

if (!cgrp->nr_css)

4035

if (!cgrp->nr_css)

4034

cgroup_destroy_css_killed(cgrp);

4036

cgroup_destroy_css_killed(cgrp);

4035

4037

4036

/* remove @cgrp directory along with the base files */

4038

/* remove @cgrp directory along with the base files */

4037

mutex_unlock(&cgroup_mutex);

4039

mutex_unlock(&cgroup_mutex);

4038

4040

4039

/*

4041

/*

4040

* There are two control paths which try to determine cgroup from

4042

* There are two control paths which try to determine cgroup from

4041

* dentry without going through kernfs - cgroupstats_build() and

4043

* dentry without going through kernfs - cgroupstats_build() and

4042

* css_tryget_from_dir(). Those are supported by RCU protecting

4044

* css_tryget_from_dir(). Those are supported by RCU protecting

4043

* clearing of cgrp->kn->priv backpointer, which should happen

4045

* clearing of cgrp->kn->priv backpointer, which should happen

4044

* after all files under it have been removed.

4046

* after all files under it have been removed.

4045

*/

4047

*/

4046

kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */

4048

kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */

4047

RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);

4049

RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);

4048

4050

4049

mutex_lock(&cgroup_mutex);

4051

mutex_lock(&cgroup_mutex);

4050

4052

4051

return 0;

4053

return 0;

4052

};

4054

};

4053

4055

4054

/**

4056

/**

4055

* cgroup_destroy_css_killed - the second step of cgroup destruction

4057

* cgroup_destroy_css_killed - the second step of cgroup destruction

4056

* @work: cgroup->destroy_free_work

4058

* @work: cgroup->destroy_free_work

4057

*

4059

*

4058

* This function is invoked from a work item for a cgroup which is being

4060

* This function is invoked from a work item for a cgroup which is being

4059

* destroyed after all css's are offlined and performs the rest of

4061

* destroyed after all css's are offlined and performs the rest of

4060

* destruction. This is the second step of destruction described in the

4062

* destruction. This is the second step of destruction described in the

4061

* comment above cgroup_destroy_locked().

4063

* comment above cgroup_destroy_locked().

4062

*/

4064

*/

4063

static void cgroup_destroy_css_killed(struct cgroup *cgrp)

4065

static void cgroup_destroy_css_killed(struct cgroup *cgrp)

4064

{

4066

{

4065

struct cgroup *parent = cgrp->parent;

4067

struct cgroup *parent = cgrp->parent;

4066

4068

4067

lockdep_assert_held(&cgroup_tree_mutex);

4069

lockdep_assert_held(&cgroup_tree_mutex);

4068

lockdep_assert_held(&cgroup_mutex);

4070

lockdep_assert_held(&cgroup_mutex);

4069

4071

4070

/* delete this cgroup from parent->children */

4072

/* delete this cgroup from parent->children */

4071

list_del_rcu(&cgrp->sibling);

4073

list_del_rcu(&cgrp->sibling);

4072

4074

4073

cgroup_put(cgrp);

4075

cgroup_put(cgrp);

4074

4076

4075

set_bit(CGRP_RELEASABLE, &parent->flags);

4077

set_bit(CGRP_RELEASABLE, &parent->flags);

4076

check_for_release(parent);

4078

check_for_release(parent);

4077

}

4079

}

4078

4080

4079

static int cgroup_rmdir(struct kernfs_node *kn)

4081

static int cgroup_rmdir(struct kernfs_node *kn)

4080

{

4082

{

4081

struct cgroup *cgrp = kn->priv;

4083

struct cgroup *cgrp = kn->priv;

4082

int ret = 0;

4084

int ret = 0;

4083

4085

4084

/*

4086

/*

4085

* This is self-destruction but @kn can't be removed while this

4087

* This is self-destruction but @kn can't be removed while this

4086

* callback is in progress. Let's break active protection. Once

4088

* callback is in progress. Let's break active protection. Once

4087

* the protection is broken, @cgrp can be destroyed at any point.

4089

* the protection is broken, @cgrp can be destroyed at any point.

4088

* Pin it so that it stays accessible.

4090

* Pin it so that it stays accessible.

4089

*/

4091

*/

4090

cgroup_get(cgrp);

4092

cgroup_get(cgrp);

4091

kernfs_break_active_protection(kn);

4093

kernfs_break_active_protection(kn);

4092

4094

4093

mutex_lock(&cgroup_tree_mutex);

4095

mutex_lock(&cgroup_tree_mutex);

4094

mutex_lock(&cgroup_mutex);

4096

mutex_lock(&cgroup_mutex);

4095

4097

4096

/*

4098

/*

4097

* @cgrp might already have been destroyed while we're trying to

4099

* @cgrp might already have been destroyed while we're trying to

4098

* grab the mutexes.

4100

* grab the mutexes.

4099

*/

4101

*/

4100

if (!cgroup_is_dead(cgrp))

4102

if (!cgroup_is_dead(cgrp))

4101

ret = cgroup_destroy_locked(cgrp);

4103

ret = cgroup_destroy_locked(cgrp);

4102

4104

4103

mutex_unlock(&cgroup_mutex);

4105

mutex_unlock(&cgroup_mutex);

4104

mutex_unlock(&cgroup_tree_mutex);

4106

mutex_unlock(&cgroup_tree_mutex);

4105

4107

4106

kernfs_unbreak_active_protection(kn);

4108

kernfs_unbreak_active_protection(kn);

4107

cgroup_put(cgrp);

4109

cgroup_put(cgrp);

4108

return ret;

4110

return ret;

4109

}

4111

}

4110

4112

4111

static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {

4113

static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {

4112

.remount_fs = cgroup_remount,

4114

.remount_fs = cgroup_remount,

4113

.show_options = cgroup_show_options,

4115

.show_options = cgroup_show_options,

4114

.mkdir = cgroup_mkdir,

4116

.mkdir = cgroup_mkdir,

4115

.rmdir = cgroup_rmdir,

4117

.rmdir = cgroup_rmdir,

4116

.rename = cgroup_rename,

4118

.rename = cgroup_rename,

4117

};

4119

};

4118

4120

4119

static void __init cgroup_init_subsys(struct cgroup_subsys *ss)

4121

static void __init cgroup_init_subsys(struct cgroup_subsys *ss)

4120

{

4122

{

4121

struct cgroup_subsys_state *css;

4123

struct cgroup_subsys_state *css;

4122

4124

4123

printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);

4125

printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);

4124

4126

4125

mutex_lock(&cgroup_tree_mutex);

4127

mutex_lock(&cgroup_tree_mutex);

4126

mutex_lock(&cgroup_mutex);

4128

mutex_lock(&cgroup_mutex);

4127

4129

4128

INIT_LIST_HEAD(&ss->cfts);

4130

INIT_LIST_HEAD(&ss->cfts);

4129

4131

4130

/* Create the root cgroup state for this subsystem */

4132

/* Create the root cgroup state for this subsystem */

4131

ss->root = &cgrp_dfl_root;

4133

ss->root = &cgrp_dfl_root;

4132

css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));

4134

css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));

4133

/* We don't handle early failures gracefully */

4135

/* We don't handle early failures gracefully */

4134

BUG_ON(IS_ERR(css));

4136

BUG_ON(IS_ERR(css));

4135

init_css(css, ss, &cgrp_dfl_root.cgrp);

4137

init_css(css, ss, &cgrp_dfl_root.cgrp);

4136

4138

4137

/* Update the init_css_set to contain a subsys

4139

/* Update the init_css_set to contain a subsys

4138

* pointer to this state - since the subsystem is

4140

* pointer to this state - since the subsystem is

4139

* newly registered, all tasks and hence the

4141

* newly registered, all tasks and hence the

4140

* init_css_set is in the subsystem's root cgroup. */

4142

* init_css_set is in the subsystem's root cgroup. */

4141

init_css_set.subsys[ss->id] = css;

4143

init_css_set.subsys[ss->id] = css;

4142

4144

4143

need_forkexit_callback |= ss->fork || ss->exit;

4145

need_forkexit_callback |= ss->fork || ss->exit;

4144

4146

4145

/* At system boot, before all subsystems have been

4147

/* At system boot, before all subsystems have been

4146

* registered, no tasks have been forked, so we don't

4148

* registered, no tasks have been forked, so we don't

4147

* need to invoke fork callbacks here. */

4149

* need to invoke fork callbacks here. */

4148

BUG_ON(!list_empty(&init_task.tasks));

4150

BUG_ON(!list_empty(&init_task.tasks));

4149

4151

4150

BUG_ON(online_css(css));

4152

BUG_ON(online_css(css));

4151

4153

4152

cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id;

4154

cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id;

4153

4155

4154

mutex_unlock(&cgroup_mutex);

4156

mutex_unlock(&cgroup_mutex);

4155

mutex_unlock(&cgroup_tree_mutex);

4157

mutex_unlock(&cgroup_tree_mutex);

4156

}

4158

}

4157

4159

4158

/**

4160

/**

4159

* cgroup_init_early - cgroup initialization at system boot

4161

* cgroup_init_early - cgroup initialization at system boot

4160

*

4162

*

4161

* Initialize cgroups at system boot, and initialize any

4163

* Initialize cgroups at system boot, and initialize any

4162

* subsystems that request early init.

4164

* subsystems that request early init.

4163

*/

4165

*/

4164

int __init cgroup_init_early(void)

4166

int __init cgroup_init_early(void)

4165

{

4167

{

4166

static struct cgroup_sb_opts __initdata opts =

4168

static struct cgroup_sb_opts __initdata opts =

4167

{ .flags = CGRP_ROOT_SANE_BEHAVIOR };

4169

{ .flags = CGRP_ROOT_SANE_BEHAVIOR };

4168

struct cgroup_subsys *ss;

4170

struct cgroup_subsys *ss;

4169

int i;

4171

int i;

4170

4172

4171

init_cgroup_root(&cgrp_dfl_root, &opts);

4173

init_cgroup_root(&cgrp_dfl_root, &opts);

4172

RCU_INIT_POINTER(init_task.cgroups, &init_css_set);

4174

RCU_INIT_POINTER(init_task.cgroups, &init_css_set);

4173

4175

4174

for_each_subsys(ss, i) {

4176

for_each_subsys(ss, i) {

4175

WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,

4177

WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,

4176

"invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",

4178

"invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",

4177

i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,

4179

i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,

4178

ss->id, ss->name);

4180

ss->id, ss->name);

4179

WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,

4181

WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,

4180

"cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);

4182

"cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);

4181

4183

4182

ss->id = i;

4184

ss->id = i;

4183

ss->name = cgroup_subsys_name[i];

4185

ss->name = cgroup_subsys_name[i];

4184

4186

4185

if (ss->early_init)

4187

if (ss->early_init)

4186

cgroup_init_subsys(ss);

4188

cgroup_init_subsys(ss);

4187

}

4189

}

4188

return 0;

4190

return 0;

4189

}

4191

}

4190

4192

4191

/**

4193

/**

4192

* cgroup_init - cgroup initialization

4194

* cgroup_init - cgroup initialization

4193

*

4195

*

4194

* Register cgroup filesystem and /proc file, and initialize

4196

* Register cgroup filesystem and /proc file, and initialize

4195

* any subsystems that didn't request early init.

4197

* any subsystems that didn't request early init.

4196

*/

4198

*/

4197

int __init cgroup_init(void)

4199

int __init cgroup_init(void)

4198

{

4200

{

4199

struct cgroup_subsys *ss;

4201

struct cgroup_subsys *ss;

4200

unsigned long key;

4202

unsigned long key;

4201

int ssid, err;

4203

int ssid, err;

4202

4204

4203

BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));

4205

BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));

4204

4206

4205

mutex_lock(&cgroup_tree_mutex);

4207

mutex_lock(&cgroup_tree_mutex);

4206

mutex_lock(&cgroup_mutex);

4208

mutex_lock(&cgroup_mutex);

4207

4209

4208

/* Add init_css_set to the hash table */

4210

/* Add init_css_set to the hash table */

4209

key = css_set_hash(init_css_set.subsys);

4211

key = css_set_hash(init_css_set.subsys);

4210

hash_add(css_set_table, &init_css_set.hlist, key);

4212

hash_add(css_set_table, &init_css_set.hlist, key);

4211

4213

4212

BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));

4214

BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));

4213

4215

4214

mutex_unlock(&cgroup_mutex);

4216

mutex_unlock(&cgroup_mutex);

4215

mutex_unlock(&cgroup_tree_mutex);

4217

mutex_unlock(&cgroup_tree_mutex);

4216

4218

4217

for_each_subsys(ss, ssid) {

4219

for_each_subsys(ss, ssid) {

4218

if (!ss->early_init)

4220

if (!ss->early_init)

4219

cgroup_init_subsys(ss);

4221

cgroup_init_subsys(ss);

4220

4222

4221

/*

4223

/*

4222

* cftype registration needs kmalloc and can't be done

4224

* cftype registration needs kmalloc and can't be done

4223

* during early_init. Register base cftypes separately.

4225

* during early_init. Register base cftypes separately.

4224

*/

4226

*/

4225

if (ss->base_cftypes)

4227

if (ss->base_cftypes)

4226

WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));

4228

WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));

4227

}

4229

}

4228

4230

4229

cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);

4231

cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);

4230

if (!cgroup_kobj)

4232

if (!cgroup_kobj)

4231

return -ENOMEM;

4233

return -ENOMEM;

4232

4234

4233

err = register_filesystem(&cgroup_fs_type);

4235

err = register_filesystem(&cgroup_fs_type);

4234

if (err < 0) {

4236

if (err < 0) {

4235

kobject_put(cgroup_kobj);

4237

kobject_put(cgroup_kobj);

4236

return err;

4238

return err;

4237

}

4239

}

4238

4240

4239

proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);

4241

proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);

4240

return 0;

4242

return 0;

4241

}

4243

}

4242

4244

4243

static int __init cgroup_wq_init(void)

4245

static int __init cgroup_wq_init(void)

4244

{

4246

{

4245

/*

4247

/*

4246

* There isn't much point in executing destruction path in

4248

* There isn't much point in executing destruction path in

4247

* parallel. Good chunk is serialized with cgroup_mutex anyway.

4249

* parallel. Good chunk is serialized with cgroup_mutex anyway.

4248

* Use 1 for @max_active.

4250

* Use 1 for @max_active.

4249

*

4251

*

4250

* We would prefer to do this in cgroup_init() above, but that

4252

* We would prefer to do this in cgroup_init() above, but that

4251

* is called before init_workqueues(): so leave this until after.

4253

* is called before init_workqueues(): so leave this until after.

4252

*/

4254

*/

4253

cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);

4255

cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);

4254

BUG_ON(!cgroup_destroy_wq);

4256

BUG_ON(!cgroup_destroy_wq);

4255

4257

4256

/*

4258

/*

4257

* Used to destroy pidlists and separate to serve as flush domain.

4259

* Used to destroy pidlists and separate to serve as flush domain.

4258

* Cap @max_active to 1 too.

4260

* Cap @max_active to 1 too.

4259

*/

4261

*/

4260

cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",

4262

cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",

4261

0, 1);

4263

0, 1);

4262

BUG_ON(!cgroup_pidlist_destroy_wq);

4264

BUG_ON(!cgroup_pidlist_destroy_wq);

4263

4265

4264

return 0;

4266

return 0;

4265

}

4267

}

4266

core_initcall(cgroup_wq_init);

4268

core_initcall(cgroup_wq_init);

4267

4269

4268

/*

4270

/*

4269

* proc_cgroup_show()

4271

* proc_cgroup_show()

4270

* - Print task's cgroup paths into seq_file, one line for each hierarchy

4272

* - Print task's cgroup paths into seq_file, one line for each hierarchy

4271

* - Used for /proc/<pid>/cgroup.

4273

* - Used for /proc/<pid>/cgroup.

4272

*/

4274

*/

4273

4275

4274

/* TODO: Use a proper seq_file iterator */

4276

/* TODO: Use a proper seq_file iterator */

4275

int proc_cgroup_show(struct seq_file *m, void *v)

4277

int proc_cgroup_show(struct seq_file *m, void *v)

4276

{

4278

{

4277

struct pid *pid;

4279

struct pid *pid;

4278

struct task_struct *tsk;

4280

struct task_struct *tsk;

4279

char *buf, *path;

4281

char *buf, *path;

4280

int retval;

4282

int retval;

4281

struct cgroup_root *root;

4283

struct cgroup_root *root;

4282

4284

4283

retval = -ENOMEM;

4285

retval = -ENOMEM;

4284

buf = kmalloc(PATH_MAX, GFP_KERNEL);

4286

buf = kmalloc(PATH_MAX, GFP_KERNEL);

4285

if (!buf)

4287

if (!buf)

4286

goto out;

4288

goto out;

4287

4289

4288

retval = -ESRCH;

4290

retval = -ESRCH;

4289

pid = m->private;

4291

pid = m->private;

4290

tsk = get_pid_task(pid, PIDTYPE_PID);

4292

tsk = get_pid_task(pid, PIDTYPE_PID);

4291

if (!tsk)

4293

if (!tsk)

4292

goto out_free;

4294

goto out_free;

4293

4295

4294

retval = 0;

4296

retval = 0;

4295

4297

4296

mutex_lock(&cgroup_mutex);

4298

mutex_lock(&cgroup_mutex);

4297

down_read(&css_set_rwsem);

4299

down_read(&css_set_rwsem);

4298

4300

4299

for_each_root(root) {

4301

for_each_root(root) {

4300

struct cgroup_subsys *ss;

4302

struct cgroup_subsys *ss;

4301

struct cgroup *cgrp;

4303

struct cgroup *cgrp;

4302

int ssid, count = 0;

4304

int ssid, count = 0;

4303

4305

4304

if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)

4306

if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)

4305

continue;

4307

continue;

4306

4308

4307

seq_printf(m, "%d:", root->hierarchy_id);

4309

seq_printf(m, "%d:", root->hierarchy_id);

4308

for_each_subsys(ss, ssid)

4310

for_each_subsys(ss, ssid)

4309

if (root->cgrp.subsys_mask & (1 << ssid))

4311

if (root->cgrp.subsys_mask & (1 << ssid))

4310

seq_printf(m, "%s%s", count++ ? "," : "", ss->name);

4312

seq_printf(m, "%s%s", count++ ? "," : "", ss->name);

4311

if (strlen(root->name))

4313

if (strlen(root->name))

4312

seq_printf(m, "%sname=%s", count ? "," : "",

4314

seq_printf(m, "%sname=%s", count ? "," : "",

4313

root->name);

4315

root->name);

4314

seq_putc(m, ':');

4316

seq_putc(m, ':');

4315

cgrp = task_cgroup_from_root(tsk, root);

4317

cgrp = task_cgroup_from_root(tsk, root);

4316

path = cgroup_path(cgrp, buf, PATH_MAX);

4318

path = cgroup_path(cgrp, buf, PATH_MAX);

4317

if (!path) {

4319

if (!path) {

4318

retval = -ENAMETOOLONG;

4320

retval = -ENAMETOOLONG;

4319

goto out_unlock;

4321

goto out_unlock;

4320

}

4322

}

4321

seq_puts(m, path);

4323

seq_puts(m, path);

4322

seq_putc(m, '\n');

4324

seq_putc(m, '\n');

4323

}

4325

}

4324

4326

4325

out_unlock:

4327

out_unlock:

4326

up_read(&css_set_rwsem);

4328

up_read(&css_set_rwsem);

4327

mutex_unlock(&cgroup_mutex);

4329

mutex_unlock(&cgroup_mutex);

4328

put_task_struct(tsk);

4330

put_task_struct(tsk);

4329

out_free:

4331

out_free:

4330

kfree(buf);

4332

kfree(buf);

4331

out:

4333

out:

4332

return retval;

4334

return retval;

4333

}

4335

}

4334

4336

4335

/* Display information about each subsystem and each hierarchy */

4337

/* Display information about each subsystem and each hierarchy */

4336

static int proc_cgroupstats_show(struct seq_file *m, void *v)

4338

static int proc_cgroupstats_show(struct seq_file *m, void *v)

4337

{

4339

{

4338

struct cgroup_subsys *ss;

4340

struct cgroup_subsys *ss;

4339

int i;

4341

int i;

4340

4342

4341

seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");

4343

seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");

4342

/*

4344

/*

4343

* ideally we don't want subsystems moving around while we do this.

4345

* ideally we don't want subsystems moving around while we do this.

4344

* cgroup_mutex is also necessary to guarantee an atomic snapshot of

4346

* cgroup_mutex is also necessary to guarantee an atomic snapshot of

4345

* subsys/hierarchy state.

4347

* subsys/hierarchy state.

4346

*/

4348

*/

4347

mutex_lock(&cgroup_mutex);

4349

mutex_lock(&cgroup_mutex);

4348

4350

4349

for_each_subsys(ss, i)

4351

for_each_subsys(ss, i)

4350

seq_printf(m, "%s\t%d\t%d\t%d\n",

4352

seq_printf(m, "%s\t%d\t%d\t%d\n",

4351

ss->name, ss->root->hierarchy_id,

4353

ss->name, ss->root->hierarchy_id,

4352

atomic_read(&ss->root->nr_cgrps), !ss->disabled);

4354

atomic_read(&ss->root->nr_cgrps), !ss->disabled);

4353

4355

4354

mutex_unlock(&cgroup_mutex);

4356

mutex_unlock(&cgroup_mutex);

4355

return 0;

4357

return 0;

4356

}

4358

}

4357

4359

4358

static int cgroupstats_open(struct inode *inode, struct file *file)

4360

static int cgroupstats_open(struct inode *inode, struct file *file)

4359

{

4361

{

4360

return single_open(file, proc_cgroupstats_show, NULL);

4362

return single_open(file, proc_cgroupstats_show, NULL);

4361

}

4363

}

4362

4364

4363

static const struct file_operations proc_cgroupstats_operations = {

4365

static const struct file_operations proc_cgroupstats_operations = {

4364

.open = cgroupstats_open,

4366

.open = cgroupstats_open,

4365

.read = seq_read,

4367

.read = seq_read,

4366

.llseek = seq_lseek,

4368

.llseek = seq_lseek,

4367

.release = single_release,

4369

.release = single_release,

4368

};

4370

};

4369

4371

4370

/**

4372

/**

4371

* cgroup_fork - initialize cgroup related fields during copy_process()

4373

* cgroup_fork - initialize cgroup related fields during copy_process()

4372

* @child: pointer to task_struct of forking parent process.

4374

* @child: pointer to task_struct of forking parent process.

4373

*

4375

*

4374

* A task is associated with the init_css_set until cgroup_post_fork()

4376

* A task is associated with the init_css_set until cgroup_post_fork()

4375

* attaches it to the parent's css_set. Empty cg_list indicates that

4377

* attaches it to the parent's css_set. Empty cg_list indicates that

4376

* @child isn't holding reference to its css_set.

4378

* @child isn't holding reference to its css_set.

4377

*/

4379

*/

4378

void cgroup_fork(struct task_struct *child)

4380

void cgroup_fork(struct task_struct *child)

4379

{

4381

{

4380

RCU_INIT_POINTER(child->cgroups, &init_css_set);

4382

RCU_INIT_POINTER(child->cgroups, &init_css_set);

4381

INIT_LIST_HEAD(&child->cg_list);

4383

INIT_LIST_HEAD(&child->cg_list);

4382

}

4384

}

4383

4385

4384

/**

4386

/**

4385

* cgroup_post_fork - called on a new task after adding it to the task list

4387

* cgroup_post_fork - called on a new task after adding it to the task list

4386

* @child: the task in question

4388

* @child: the task in question

4387

*

4389

*

4388

* Adds the task to the list running through its css_set if necessary and

4390

* Adds the task to the list running through its css_set if necessary and

4389

* call the subsystem fork() callbacks. Has to be after the task is

4391

* call the subsystem fork() callbacks. Has to be after the task is

4390

* visible on the task list in case we race with the first call to

4392

* visible on the task list in case we race with the first call to

4391

* cgroup_task_iter_start() - to guarantee that the new task ends up on its

4393

* cgroup_task_iter_start() - to guarantee that the new task ends up on its

4392

* list.

4394

* list.

4393

*/

4395

*/

4394

void cgroup_post_fork(struct task_struct *child)

4396

void cgroup_post_fork(struct task_struct *child)

4395

{

4397

{

4396

struct cgroup_subsys *ss;

4398

struct cgroup_subsys *ss;

4397

int i;

4399

int i;

4398

4400

4399

/*

4401

/*

4400

* This may race against cgroup_enable_task_cg_links(). As that

4402

* This may race against cgroup_enable_task_cg_links(). As that

4401

* function sets use_task_css_set_links before grabbing

4403

* function sets use_task_css_set_links before grabbing

4402

* tasklist_lock and we just went through tasklist_lock to add

4404

* tasklist_lock and we just went through tasklist_lock to add

4403

* @child, it's guaranteed that either we see the set

4405

* @child, it's guaranteed that either we see the set

4404

* use_task_css_set_links or cgroup_enable_task_cg_lists() sees

4406

* use_task_css_set_links or cgroup_enable_task_cg_lists() sees

4405

* @child during its iteration.

4407

* @child during its iteration.

4406

*

4408

*

4407

* If we won the race, @child is associated with %current's

4409

* If we won the race, @child is associated with %current's

4408

* css_set. Grabbing css_set_rwsem guarantees both that the

4410

* css_set. Grabbing css_set_rwsem guarantees both that the

4409

* association is stable, and, on completion of the parent's

4411

* association is stable, and, on completion of the parent's

4410

* migration, @child is visible in the source of migration or

4412

* migration, @child is visible in the source of migration or

4411

* already in the destination cgroup. This guarantee is necessary

4413

* already in the destination cgroup. This guarantee is necessary

4412

* when implementing operations which need to migrate all tasks of

4414

* when implementing operations which need to migrate all tasks of

4413

* a cgroup to another.

4415

* a cgroup to another.

4414

*

4416

*

4415

* Note that if we lose to cgroup_enable_task_cg_links(), @child

4417

* Note that if we lose to cgroup_enable_task_cg_links(), @child

4416

* will remain in init_css_set. This is safe because all tasks are

4418

* will remain in init_css_set. This is safe because all tasks are

4417

* in the init_css_set before cg_links is enabled and there's no

4419

* in the init_css_set before cg_links is enabled and there's no

4418

* operation which transfers all tasks out of init_css_set.

4420

* operation which transfers all tasks out of init_css_set.

4419

*/

4421

*/

4420

if (use_task_css_set_links) {

4422

if (use_task_css_set_links) {

4421

struct css_set *cset;

4423

struct css_set *cset;

4422

4424

4423

down_write(&css_set_rwsem);

4425

down_write(&css_set_rwsem);

4424

cset = task_css_set(current);

4426

cset = task_css_set(current);

4425

if (list_empty(&child->cg_list)) {

4427

if (list_empty(&child->cg_list)) {

4426

rcu_assign_pointer(child->cgroups, cset);

4428

rcu_assign_pointer(child->cgroups, cset);

4427

list_add(&child->cg_list, &cset->tasks);

4429

list_add(&child->cg_list, &cset->tasks);

4428

get_css_set(cset);

4430

get_css_set(cset);

4429

}

4431

}

4430

up_write(&css_set_rwsem);

4432

up_write(&css_set_rwsem);

4431

}

4433

}

4432

4434

4433

/*

4435

/*

4434

* Call ss->fork(). This must happen after @child is linked on

4436

* Call ss->fork(). This must happen after @child is linked on

4435

* css_set; otherwise, @child might change state between ->fork()

4437

* css_set; otherwise, @child might change state between ->fork()

4436

* and addition to css_set.

4438

* and addition to css_set.

4437

*/

4439

*/

4438

if (need_forkexit_callback) {

4440

if (need_forkexit_callback) {

4439

for_each_subsys(ss, i)

4441

for_each_subsys(ss, i)

4440

if (ss->fork)

4442

if (ss->fork)

4441

ss->fork(child);

4443

ss->fork(child);

4442

}

4444

}

4443

}

4445

}

4444

4446

4445

/**

4447

/**

4446

* cgroup_exit - detach cgroup from exiting task

4448

* cgroup_exit - detach cgroup from exiting task

4447

* @tsk: pointer to task_struct of exiting process

4449

* @tsk: pointer to task_struct of exiting process

4448

*

4450

*

4449

* Description: Detach cgroup from @tsk and release it.

4451

* Description: Detach cgroup from @tsk and release it.

4450

*

4452

*

4451

* Note that cgroups marked notify_on_release force every task in

4453

* Note that cgroups marked notify_on_release force every task in

4452

* them to take the global cgroup_mutex mutex when exiting.

4454

* them to take the global cgroup_mutex mutex when exiting.

4453

* This could impact scaling on very large systems. Be reluctant to

4455

* This could impact scaling on very large systems. Be reluctant to

4454

* use notify_on_release cgroups where very high task exit scaling

4456

* use notify_on_release cgroups where very high task exit scaling

4455

* is required on large systems.

4457

* is required on large systems.

4456

*

4458

*

4457

* We set the exiting tasks cgroup to the root cgroup (top_cgroup). We

4459

* We set the exiting tasks cgroup to the root cgroup (top_cgroup). We

4458

* call cgroup_exit() while the task is still competent to handle

4460

* call cgroup_exit() while the task is still competent to handle

4459

* notify_on_release(), then leave the task attached to the root cgroup in

4461

* notify_on_release(), then leave the task attached to the root cgroup in

4460

* each hierarchy for the remainder of its exit. No need to bother with

4462

* each hierarchy for the remainder of its exit. No need to bother with

4461

* init_css_set refcnting. init_css_set never goes away and we can't race

4463

* init_css_set refcnting. init_css_set never goes away and we can't race

4462

* with migration path - PF_EXITING is visible to migration path.

4464

* with migration path - PF_EXITING is visible to migration path.

4463

*/

4465

*/

4464

void cgroup_exit(struct task_struct *tsk)

4466

void cgroup_exit(struct task_struct *tsk)

4465

{

4467

{

4466

struct cgroup_subsys *ss;

4468

struct cgroup_subsys *ss;

4467

struct css_set *cset;

4469

struct css_set *cset;

4468

bool put_cset = false;

4470

bool put_cset = false;

4469

int i;

4471

int i;

4470

4472

4471

/*

4473

/*

4472

* Unlink from @tsk from its css_set. As migration path can't race

4474

* Unlink from @tsk from its css_set. As migration path can't race

4473

* with us, we can check cg_list without grabbing css_set_rwsem.

4475

* with us, we can check cg_list without grabbing css_set_rwsem.

4474

*/

4476

*/

4475

if (!list_empty(&tsk->cg_list)) {

4477

if (!list_empty(&tsk->cg_list)) {

4476

down_write(&css_set_rwsem);

4478

down_write(&css_set_rwsem);

4477

list_del_init(&tsk->cg_list);

4479

list_del_init(&tsk->cg_list);

4478

up_write(&css_set_rwsem);

4480

up_write(&css_set_rwsem);

4479

put_cset = true;

4481

put_cset = true;

4480

}

4482

}

4481

4483

4482

/* Reassign the task to the init_css_set. */

4484

/* Reassign the task to the init_css_set. */

4483

cset = task_css_set(tsk);

4485

cset = task_css_set(tsk);

4484

RCU_INIT_POINTER(tsk->cgroups, &init_css_set);

4486

RCU_INIT_POINTER(tsk->cgroups, &init_css_set);

4485

4487

4486

if (need_forkexit_callback) {

4488

if (need_forkexit_callback) {

4487

/* see cgroup_post_fork() for details */

4489

/* see cgroup_post_fork() for details */

4488

for_each_subsys(ss, i) {

4490

for_each_subsys(ss, i) {

4489

if (ss->exit) {

4491

if (ss->exit) {

4490

struct cgroup_subsys_state *old_css = cset->subsys[i];

4492

struct cgroup_subsys_state *old_css = cset->subsys[i];

4491

struct cgroup_subsys_state *css = task_css(tsk, i);

4493

struct cgroup_subsys_state *css = task_css(tsk, i);

4492

4494

4493

ss->exit(css, old_css, tsk);

4495

ss->exit(css, old_css, tsk);

4494

}

4496

}

4495

}

4497

}

4496

}

4498

}

4497

4499

4498

if (put_cset)

4500

if (put_cset)

4499

put_css_set(cset, true);

4501

put_css_set(cset, true);

4500

}

4502

}

4501

4503

4502

static void check_for_release(struct cgroup *cgrp)

4504

static void check_for_release(struct cgroup *cgrp)

4503

{

4505

{

4504

if (cgroup_is_releasable(cgrp) &&

4506

if (cgroup_is_releasable(cgrp) &&

4505

list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {

4507

list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {

4506

/*

4508

/*

4507

* Control Group is currently removeable. If it's not

4509

* Control Group is currently removeable. If it's not

4508

* already queued for a userspace notification, queue

4510

* already queued for a userspace notification, queue

4509

* it now

4511

* it now

4510

*/

4512

*/

4511

int need_schedule_work = 0;

4513

int need_schedule_work = 0;

4512

4514

4513

raw_spin_lock(&release_list_lock);

4515

raw_spin_lock(&release_list_lock);

4514

if (!cgroup_is_dead(cgrp) &&

4516

if (!cgroup_is_dead(cgrp) &&

4515

list_empty(&cgrp->release_list)) {

4517

list_empty(&cgrp->release_list)) {

4516

list_add(&cgrp->release_list, &release_list);

4518

list_add(&cgrp->release_list, &release_list);

4517

need_schedule_work = 1;

4519

need_schedule_work = 1;

4518

}

4520

}

4519

raw_spin_unlock(&release_list_lock);

4521

raw_spin_unlock(&release_list_lock);

4520

if (need_schedule_work)

4522

if (need_schedule_work)

4521

schedule_work(&release_agent_work);

4523

schedule_work(&release_agent_work);

4522

}

4524

}

4523

}

4525

}

4524

4526

4525

/*

4527

/*

4526

* Notify userspace when a cgroup is released, by running the

4528

* Notify userspace when a cgroup is released, by running the

4527

* configured release agent with the name of the cgroup (path

4529

* configured release agent with the name of the cgroup (path

4528

* relative to the root of cgroup file system) as the argument.

4530

* relative to the root of cgroup file system) as the argument.

4529

*

4531

*

4530

* Most likely, this user command will try to rmdir this cgroup.

4532

* Most likely, this user command will try to rmdir this cgroup.

4531

*

4533

*

4532

* This races with the possibility that some other task will be

4534

* This races with the possibility that some other task will be

4533

* attached to this cgroup before it is removed, or that some other

4535

* attached to this cgroup before it is removed, or that some other

4534

* user task will 'mkdir' a child cgroup of this cgroup. That's ok.

4536

* user task will 'mkdir' a child cgroup of this cgroup. That's ok.

4535

* The presumed 'rmdir' will fail quietly if this cgroup is no longer

4537

* The presumed 'rmdir' will fail quietly if this cgroup is no longer

4536

* unused, and this cgroup will be reprieved from its death sentence,

4538

* unused, and this cgroup will be reprieved from its death sentence,

4537

* to continue to serve a useful existence. Next time it's released,

4539

* to continue to serve a useful existence. Next time it's released,

4538

* we will get notified again, if it still has 'notify_on_release' set.

4540

* we will get notified again, if it still has 'notify_on_release' set.

4539

*

4541

*

4540

* The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which

4542

* The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which

4541

* means only wait until the task is successfully execve()'d. The

4543

* means only wait until the task is successfully execve()'d. The

4542

* separate release agent task is forked by call_usermodehelper(),

4544

* separate release agent task is forked by call_usermodehelper(),

4543

* then control in this thread returns here, without waiting for the

4545

* then control in this thread returns here, without waiting for the

4544

* release agent task. We don't bother to wait because the caller of

4546

* release agent task. We don't bother to wait because the caller of

4545

* this routine has no use for the exit status of the release agent

4547

* this routine has no use for the exit status of the release agent

4546

* task, so no sense holding our caller up for that.

4548

* task, so no sense holding our caller up for that.

4547

*/

4549

*/

4548

static void cgroup_release_agent(struct work_struct *work)

4550

static void cgroup_release_agent(struct work_struct *work)

4549

{

4551

{

4550

BUG_ON(work != &release_agent_work);

4552

BUG_ON(work != &release_agent_work);

4551

mutex_lock(&cgroup_mutex);

4553

mutex_lock(&cgroup_mutex);

4552

raw_spin_lock(&release_list_lock);

4554

raw_spin_lock(&release_list_lock);

4553

while (!list_empty(&release_list)) {

4555

while (!list_empty(&release_list)) {

4554

char *argv[3], *envp[3];

4556

char *argv[3], *envp[3];

4555

int i;

4557

int i;

4556

char *pathbuf = NULL, *agentbuf = NULL, *path;

4558

char *pathbuf = NULL, *agentbuf = NULL, *path;

4557

struct cgroup *cgrp = list_entry(release_list.next,

4559

struct cgroup *cgrp = list_entry(release_list.next,

4558

struct cgroup,

4560

struct cgroup,

4559

release_list);

4561

release_list);

4560

list_del_init(&cgrp->release_list);

4562

list_del_init(&cgrp->release_list);

4561

raw_spin_unlock(&release_list_lock);

4563

raw_spin_unlock(&release_list_lock);

4562

pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);

4564

pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);

4563

if (!pathbuf)

4565

if (!pathbuf)

4564

goto continue_free;

4566

goto continue_free;

4565

path = cgroup_path(cgrp, pathbuf, PATH_MAX);

4567

path = cgroup_path(cgrp, pathbuf, PATH_MAX);

4566

if (!path)

4568

if (!path)

4567

goto continue_free;

4569

goto continue_free;

4568

agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);

4570

agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);

4569

if (!agentbuf)

4571

if (!agentbuf)

4570

goto continue_free;

4572

goto continue_free;

4571

4573

4572

i = 0;

4574

i = 0;

4573

argv[i++] = agentbuf;

4575

argv[i++] = agentbuf;

4574

argv[i++] = path;

4576

argv[i++] = path;

4575

argv[i] = NULL;

4577

argv[i] = NULL;

4576

4578

4577

i = 0;

4579

i = 0;

4578

/* minimal command environment */

4580

/* minimal command environment */

4579

envp[i++] = "HOME=/";

4581

envp[i++] = "HOME=/";

4580

envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";

4582

envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";

4581

envp[i] = NULL;

4583

envp[i] = NULL;

4582

4584

4583

/* Drop the lock while we invoke the usermode helper,

4585

/* Drop the lock while we invoke the usermode helper,

4584

* since the exec could involve hitting disk and hence

4586

* since the exec could involve hitting disk and hence

4585

* be a slow process */

4587

* be a slow process */

4586

mutex_unlock(&cgroup_mutex);

4588

mutex_unlock(&cgroup_mutex);

4587

call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);

4589

call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);

4588

mutex_lock(&cgroup_mutex);

4590

mutex_lock(&cgroup_mutex);

4589

continue_free:

4591

continue_free:

4590

kfree(pathbuf);

4592

kfree(pathbuf);

4591

kfree(agentbuf);

4593

kfree(agentbuf);

4592

raw_spin_lock(&release_list_lock);

4594

raw_spin_lock(&release_list_lock);

4593

}

4595

}

4594

raw_spin_unlock(&release_list_lock);

4596

raw_spin_unlock(&release_list_lock);

4595

mutex_unlock(&cgroup_mutex);

4597

mutex_unlock(&cgroup_mutex);

4596

}

4598

}

4597

4599

4598

static int __init cgroup_disable(char *str)

4600

static int __init cgroup_disable(char *str)

4599

{

4601

{

4600

struct cgroup_subsys *ss;

4602

struct cgroup_subsys *ss;

4601

char *token;

4603

char *token;

4602

int i;

4604

int i;

4603

4605

4604

while ((token = strsep(&str, ",")) != NULL) {

4606

while ((token = strsep(&str, ",")) != NULL) {

4605

if (!*token)

4607

if (!*token)

4606

continue;

4608

continue;

4607

4609

4608

for_each_subsys(ss, i) {

4610

for_each_subsys(ss, i) {

4609

if (!strcmp(token, ss->name)) {

4611

if (!strcmp(token, ss->name)) {

4610

ss->disabled = 1;

4612

ss->disabled = 1;

4611

printk(KERN_INFO "Disabling %s control group"

4613

printk(KERN_INFO "Disabling %s control group"

4612

" subsystem\n", ss->name);

4614

" subsystem\n", ss->name);

4613

break;

4615

break;

4614

}

4616

}

4615

}

4617

}

4616

}

4618

}

4617

return 1;

4619

return 1;

4618

}

4620

}

4619

__setup("cgroup_disable=", cgroup_disable);

4621

__setup("cgroup_disable=", cgroup_disable);

4620

4622

4621

/**

4623

/**

4622

* css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir

4624

* css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir

4623

* @dentry: directory dentry of interest

4625

* @dentry: directory dentry of interest

4624

* @ss: subsystem of interest

4626

* @ss: subsystem of interest

4625

*

4627

*

4626

* If @dentry is a directory for a cgroup which has @ss enabled on it, try

4628

* If @dentry is a directory for a cgroup which has @ss enabled on it, try

4627

* to get the corresponding css and return it. If such css doesn't exist

4629

* to get the corresponding css and return it. If such css doesn't exist

4628

* or can't be pinned, an ERR_PTR value is returned.

4630

* or can't be pinned, an ERR_PTR value is returned.

4629

*/

4631

*/

4630

struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,

4632

struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,

4631

struct cgroup_subsys *ss)

4633

struct cgroup_subsys *ss)

4632

{

4634

{

4633

struct kernfs_node *kn = kernfs_node_from_dentry(dentry);

4635

struct kernfs_node *kn = kernfs_node_from_dentry(dentry);

4634

struct cgroup_subsys_state *css = NULL;

4636

struct cgroup_subsys_state *css = NULL;

4635

struct cgroup *cgrp;

4637

struct cgroup *cgrp;

4636

4638

4637

/* is @dentry a cgroup dir? */

4639

/* is @dentry a cgroup dir? */

4638

if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||

4640

if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||

4639

kernfs_type(kn) != KERNFS_DIR)

4641

kernfs_type(kn) != KERNFS_DIR)

4640

return ERR_PTR(-EBADF);

4642

return ERR_PTR(-EBADF);

4641

4643

4642

rcu_read_lock();

4644

rcu_read_lock();

4643

4645

4644

/*

4646

/*

4645

* This path doesn't originate from kernfs and @kn could already

4647

* This path doesn't originate from kernfs and @kn could already

4646

* have been or be removed at any point. @kn->priv is RCU

4648

* have been or be removed at any point. @kn->priv is RCU

4647

* protected for this access. See destroy_locked() for details.

4649

* protected for this access. See destroy_locked() for details.

4648

*/

4650

*/

4649

cgrp = rcu_dereference(kn->priv);

4651

cgrp = rcu_dereference(kn->priv);

4650

if (cgrp)

4652

if (cgrp)

4651

css = cgroup_css(cgrp, ss);

4653

css = cgroup_css(cgrp, ss);

4652

4654

4653

if (!css || !css_tryget(css))

4655

if (!css || !css_tryget(css))

4654

css = ERR_PTR(-ENOENT);

4656

css = ERR_PTR(-ENOENT);

4655

4657

4656

rcu_read_unlock();

4658

rcu_read_unlock();

4657

return css;

4659

return css;

4658

}

4660

}

4659

4661

4660

/**

4662

/**

4661

* css_from_id - lookup css by id

4663

* css_from_id - lookup css by id

4662

* @id: the cgroup id

4664

* @id: the cgroup id

4663

* @ss: cgroup subsys to be looked into

4665

* @ss: cgroup subsys to be looked into

4664

*

4666

*

4665

* Returns the css if there's valid one with @id, otherwise returns NULL.

4667

* Returns the css if there's valid one with @id, otherwise returns NULL.

4666

* Should be called under rcu_read_lock().

4668

* Should be called under rcu_read_lock().

4667

*/

4669

*/

4668

struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)

4670

struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)

4669

{

4671

{

4670

struct cgroup *cgrp;

4672

struct cgroup *cgrp;

4671

4673

4672

cgroup_assert_mutexes_or_rcu_locked();

4674

cgroup_assert_mutexes_or_rcu_locked();

4673

4675

4674

cgrp = idr_find(&ss->root->cgroup_idr, id);

4676

cgrp = idr_find(&ss->root->cgroup_idr, id);

4675

if (cgrp)

4677

if (cgrp)

4676

return cgroup_css(cgrp, ss);

4678

return cgroup_css(cgrp, ss);

4677

return NULL;

4679

return NULL;

4678

}

4680

}

4679

4681

4680

#ifdef CONFIG_CGROUP_DEBUG

4682

#ifdef CONFIG_CGROUP_DEBUG

4681

static struct cgroup_subsys_state *

4683

static struct cgroup_subsys_state *

4682

debug_css_alloc(struct cgroup_subsys_state *parent_css)

4684

debug_css_alloc(struct cgroup_subsys_state *parent_css)

4683

{

4685

{

4684

struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

4686

struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

4685

4687

4686

if (!css)

4688

if (!css)

4687

return ERR_PTR(-ENOMEM);

4689

return ERR_PTR(-ENOMEM);

4688

4690

4689

return css;

4691

return css;

4690

}

4692

}

4691

4693

4692

/* css_free callback of the debug subsystem: release what debug_css_alloc() allocated. */
static void debug_css_free(struct cgroup_subsys_state *css)
{
	kfree(css);
}

4696

4698

4697

static u64 debug_taskcount_read(struct cgroup_subsys_state *css,

4699

static u64 debug_taskcount_read(struct cgroup_subsys_state *css,

4698

struct cftype *cft)

4700

struct cftype *cft)

4699

{

4701

{

4700

return cgroup_task_count(css->cgroup);

4702

return cgroup_task_count(css->cgroup);

4701

}

4703

}

4702

4704

4703

static u64 current_css_set_read(struct cgroup_subsys_state *css,

4705

static u64 current_css_set_read(struct cgroup_subsys_state *css,

4704

struct cftype *cft)

4706

struct cftype *cft)

4705

{

4707

{

4706

return (u64)(unsigned long)current->cgroups;

4708

return (u64)(unsigned long)current->cgroups;

4707

}

4709

}

4708

4710

4709

static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,

4711

static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,

4710

struct cftype *cft)

4712

struct cftype *cft)

4711

{

4713

{

4712

u64 count;

4714

u64 count;

4713

4715

4714

rcu_read_lock();

4716

rcu_read_lock();

4715

count = atomic_read(&task_css_set(current)->refcount);

4717

count = atomic_read(&task_css_set(current)->refcount);

4716

rcu_read_unlock();

4718

rcu_read_unlock();

4717

return count;

4719

return count;

4718

}

4720

}

4719

4721

4720

static int current_css_set_cg_links_read(struct seq_file *seq, void *v)

4722

static int current_css_set_cg_links_read(struct seq_file *seq, void *v)

4721

{

4723

{

4722

struct cgrp_cset_link *link;

4724

struct cgrp_cset_link *link;

4723

struct css_set *cset;

4725

struct css_set *cset;

4724

char *name_buf;

4726

char *name_buf;

4725

4727

4726

name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);

4728

name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);

4727

if (!name_buf)

4729

if (!name_buf)

4728

return -ENOMEM;

4730

return -ENOMEM;

4729

4731

4730

down_read(&css_set_rwsem);

4732

down_read(&css_set_rwsem);

4731

rcu_read_lock();

4733

rcu_read_lock();

4732

cset = rcu_dereference(current->cgroups);

4734

cset = rcu_dereference(current->cgroups);

4733

list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {

4735

list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {

4734

struct cgroup *c = link->cgrp;

4736

struct cgroup *c = link->cgrp;

4735

4737

4736

cgroup_name(c, name_buf, NAME_MAX + 1);

4738

cgroup_name(c, name_buf, NAME_MAX + 1);

4737

seq_printf(seq, "Root %d group %s\n",

4739

seq_printf(seq, "Root %d group %s\n",

4738

c->root->hierarchy_id, name_buf);

4740

c->root->hierarchy_id, name_buf);

4739

}

4741

}

4740

rcu_read_unlock();

4742

rcu_read_unlock();

4741

up_read(&css_set_rwsem);

4743

up_read(&css_set_rwsem);

4742

kfree(name_buf);

4744

kfree(name_buf);

4743

return 0;

4745

return 0;

4744

}

4746

}

4745

4747

4746

#define MAX_TASKS_SHOWN_PER_CSS 25

4748

#define MAX_TASKS_SHOWN_PER_CSS 25

4747

static int cgroup_css_links_read(struct seq_file *seq, void *v)

4749

static int cgroup_css_links_read(struct seq_file *seq, void *v)

4748

{

4750

{

4749

struct cgroup_subsys_state *css = seq_css(seq);

4751

struct cgroup_subsys_state *css = seq_css(seq);

4750

struct cgrp_cset_link *link;

4752

struct cgrp_cset_link *link;

4751

4753

4752

down_read(&css_set_rwsem);

4754

down_read(&css_set_rwsem);

4753

list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {

4755

list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {

4754

struct css_set *cset = link->cset;

4756

struct css_set *cset = link->cset;

4755

struct task_struct *task;

4757

struct task_struct *task;

4756

int count = 0;

4758

int count = 0;

4757

4759

4758

seq_printf(seq, "css_set %p\n", cset);

4760

seq_printf(seq, "css_set %p\n", cset);

4759

4761

4760

list_for_each_entry(task, &cset->tasks, cg_list) {

4762

list_for_each_entry(task, &cset->tasks, cg_list) {

4761

if (count++ > MAX_TASKS_SHOWN_PER_CSS)

4763

if (count++ > MAX_TASKS_SHOWN_PER_CSS)

4762

goto overflow;

4764

goto overflow;

4763

seq_printf(seq, " task %d\n", task_pid_vnr(task));

4765

seq_printf(seq, " task %d\n", task_pid_vnr(task));

4764

}

4766

}

4765

4767

4766

list_for_each_entry(task, &cset->mg_tasks, cg_list) {

4768

list_for_each_entry(task, &cset->mg_tasks, cg_list) {

4767

if (count++ > MAX_TASKS_SHOWN_PER_CSS)

4769

if (count++ > MAX_TASKS_SHOWN_PER_CSS)

4768

goto overflow;

4770

goto overflow;

4769

seq_printf(seq, " task %d\n", task_pid_vnr(task));

4771

seq_printf(seq, " task %d\n", task_pid_vnr(task));

4770

}

4772

}

4771

continue;

4773

continue;

4772

overflow:

4774

overflow:

4773

seq_puts(seq, " ...\n");

4775

seq_puts(seq, " ...\n");

4774

}

4776

}

4775

up_read(&css_set_rwsem);

4777

up_read(&css_set_rwsem);

4776

return 0;

4778

return 0;

4777

}

4779

}

4778

4780

4779

static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)

4781

static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)

4780

{

4782

{

4781

return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);

4783

return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);

4782

}

4784

}

4783

4785

4784

static struct cftype debug_files[] = {

4786

static struct cftype debug_files[] = {

4785

{

4787

{

4786

.name = "taskcount",

4788

.name = "taskcount",

4787

.read_u64 = debug_taskcount_read,

4789

.read_u64 = debug_taskcount_read,

4788

},

4790

},

4789

4791

4790

{

4792

{

4791

.name = "current_css_set",

4793

.name = "current_css_set",

4792

.read_u64 = current_css_set_read,

4794

.read_u64 = current_css_set_read,

4793

},

4795

},

4794

4796

4795

{

4797

{

4796

.name = "current_css_set_refcount",

4798

.name = "current_css_set_refcount",

4797

.read_u64 = current_css_set_refcount_read,

4799

.read_u64 = current_css_set_refcount_read,

4798

},

4800

},

4799

4801

4800

{

4802

{

4801

.name = "current_css_set_cg_links",

4803

.name = "current_css_set_cg_links",

4802

.seq_show = current_css_set_cg_links_read,

4804

.seq_show = current_css_set_cg_links_read,

4803

},

4805

},

4804

4806

4805

{

4807

{

4806

.name = "cgroup_css_links",

4808

.name = "cgroup_css_links",

4807

.seq_show = cgroup_css_links_read,

4809

.seq_show = cgroup_css_links_read,

4808

},

4810

},

4809

4811

4810

{

4812

{

4811

.name = "releasable",

4813

.name = "releasable",

4812

.read_u64 = releasable_read,

4814

.read_u64 = releasable_read,

4813

},

4815

},

4814

4816

4815

{ } /* terminate */

4817

{ } /* terminate */

4816

};

4818

};

4817

4819

4818

struct cgroup_subsys debug_cgrp_subsys = {

4820

struct cgroup_subsys debug_cgrp_subsys = {

4819

.css_alloc = debug_css_alloc,

4821

.css_alloc = debug_css_alloc,

4820

.css_free = debug_css_free,

4822

.css_free = debug_css_free,

4821

.base_cftypes = debug_files,

4823

.base_cftypes = debug_files,

4822

};

4824

};

4823

#endif /* CONFIG_CGROUP_DEBUG */

4825

#endif /* CONFIG_CGROUP_DEBUG */

4824

4826

GITLAB

Merge branch 'master' of http://git.kernel.org/pub/scm/linux/kernel/git/torvalds…

 /*
  * fs/kernfs/mount.c - kernfs mount implementation
  *
  * Copyright (c) 2001-3 Patrick Mochel
  * Copyright (c) 2007 SUSE Linux Products GmbH
  * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
  *
  * This file is released under the GPLv2.
  */
 #include <linux/fs.h>
 #include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include "kernfs-internal.h"
 struct kmem_cache *kernfs_node_cache;
 /*
  * super_operations->remount_fs hook: hand the request to the root's
  * optional remount_fs syscall op.  Returns 0 when no such op is installed.
  */
 static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data)
 {
 	struct kernfs_root *kroot = kernfs_info(sb)->root;
 	struct kernfs_syscall_ops *ops = kroot->syscall_ops;
 	if (!ops || !ops->remount_fs)
 		return 0;
 	return ops->remount_fs(kroot, flags, data);
 }
 /*
  * super_operations->show_options hook: hand the request to the root's
  * optional show_options syscall op.  Returns 0 when no such op is installed.
  */
 static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
 {
 	struct kernfs_root *kroot = kernfs_root(dentry->d_fsdata);
 	struct kernfs_syscall_ops *ops = kroot->syscall_ops;
 	if (!ops || !ops->show_options)
 		return 0;
 	return ops->show_options(sf, kroot);
 }
 /*
  * Superblock operations installed on kernfs superblocks.  remount_fs and
  * show_options go through the kernfs_sop_*() wrappers, which forward to the
  * root's optional kernfs_syscall_ops.  kernfs_root_from_sb() compares
  * sb->s_op against the address of this table to recognize kernfs
  * superblocks.
  */
 const struct super_operations kernfs_sops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
 	.evict_inode	= kernfs_evict_inode,
 	.remount_fs	= kernfs_sop_remount_fs,
 	.show_options	= kernfs_sop_show_options,
 };
 /**
  * kernfs_root_from_sb - look up the kernfs_root behind a super_block
  * @sb: the super_block to inspect
  *
  * @sb is treated as a kernfs super_block when its s_op points at
  * kernfs_sops.  Returns the associated kernfs_root in that case and
  * %NULL otherwise.
  */
 struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
 {
 	return sb->s_op == &kernfs_sops ? kernfs_info(sb)->root : NULL;
 }
-static int kernfs_fill_super(struct super_block *sb)
+static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
 {
 	struct kernfs_super_info *info = kernfs_info(sb);
 	struct inode *inode;
 	struct dentry *root;
 	sb->s_blocksize = PAGE_CACHE_SIZE;
 	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
-	sb->s_magic = SYSFS_MAGIC;
+	sb->s_magic = magic;
 	sb->s_op = &kernfs_sops;
 	sb->s_time_gran = 1;
 	/* get root inode, initialize and unlock it */
 	mutex_lock(&kernfs_mutex);
 	inode = kernfs_get_inode(sb, info->root->kn);
 	mutex_unlock(&kernfs_mutex);
 	if (!inode) {
 		pr_debug("kernfs: could not get root inode\n");
 		return -ENOMEM;
 	}
 	/* instantiate and link root dentry */
 	root = d_make_root(inode);
 	if (!root) {
 		pr_debug("%s: could not get root dentry!\n", __func__);
 		return -ENOMEM;
 	}
 	kernfs_get(info->root->kn);
 	root->d_fsdata = info->root->kn;
 	sb->s_root = root;
 	sb->s_d_op = &kernfs_dops;
 	return 0;
 }
 /*
  * Test callback handed to sget(): an existing super_block matches only if
  * both its hierarchy root and its namespace tag equal the requested ones.
  */
 static int kernfs_test_super(struct super_block *sb, void *data)
 {
 	struct kernfs_super_info *wanted = data;
 	struct kernfs_super_info *existing = kernfs_info(sb);
 	if (existing->root != wanted->root)
 		return 0;
 	return existing->ns == wanted->ns;
 }
 /*
  * Set callback handed to sget(): set up an anonymous super_block and, on
  * success, attach @data as its s_fs_info.  Returns set_anon_super()'s result.
  */
 static int kernfs_set_super(struct super_block *sb, void *data)
 {
 	int ret = set_anon_super(sb, data);
 	if (ret)
 		return ret;
 	sb->s_fs_info = data;
 	return 0;
 }
 /**
  * kernfs_super_ns - determine the namespace tag of a kernfs super_block
  * @sb: super_block of interest
  *
  * Return the namespace tag associated with kernfs super_block @sb.
  */
 const void *kernfs_super_ns(struct super_block *sb)
 {
 	struct kernfs_super_info *info = kernfs_info(sb);
 	return info->ns;
 }
 /**
  * kernfs_mount_ns - kernfs mount helper
  * @fs_type: file_system_type of the fs being mounted
  * @flags: mount flags specified for the mount
  * @root: kernfs_root of the hierarchy being mounted
+ * @magic: file system specific magic number
  * @new_sb_created: tell the caller if we allocated a new superblock
  * @ns: optional namespace tag of the mount
  *
  * This is to be called from each kernfs user's file_system_type->mount()
  * implementation, which should pass through the specified @fs_type and
  * @flags, and specify the hierarchy and namespace tag to mount via @root
  * and @ns, respectively.
  *
  * The return value can be passed to the vfs layer verbatim.
  */
 struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
-			       struct kernfs_root *root, bool *new_sb_created,
+				struct kernfs_root *root, unsigned long magic,
-			       const void *ns)
+				bool *new_sb_created, const void *ns)
 {
 	struct super_block *sb;
 	struct kernfs_super_info *info;
 	int error;
 	info = kzalloc(sizeof(*info), GFP_KERNEL);
 	if (!info)
 		return ERR_PTR(-ENOMEM);
 	info->root = root;
 	info->ns = ns;
 	sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info);
 	if (IS_ERR(sb) || sb->s_fs_info != info)
 		kfree(info);
 	if (IS_ERR(sb))
 		return ERR_CAST(sb);
 	if (new_sb_created)
 		*new_sb_created = !sb->s_root;
 	if (!sb->s_root) {
-		error = kernfs_fill_super(sb);
+		error = kernfs_fill_super(sb, magic);
 		if (error) {
 			deactivate_locked_super(sb);
 			return ERR_PTR(error);
 		}
 		sb->s_flags |= MS_ACTIVE;
 	}
 	return dget(sb->s_root);
 }
 /**
  * kernfs_kill_sb - kill_sb for kernfs
  * @sb: super_block being killed
  *
  * Usable as file_system_type->kill_sb() as is.  A kernfs user that needs
  * additional cleanup can supply its own kill_sb() and call this function
  * as the last step.
  */
 void kernfs_kill_sb(struct super_block *sb)
 {
 	struct kernfs_node *kn = sb->s_root->d_fsdata;
 	struct kernfs_super_info *sb_info = kernfs_info(sb);
 	/*
 	 * Take the superblock off fs_supers/s_instances first so that it
 	 * can no longer be found, and only then free its kernfs_super_info.
 	 */
 	kill_anon_super(sb);
 	kfree(sb_info);
 	/* release the node that was reachable through s_root->d_fsdata */
 	kernfs_put(kn);
 }
 /*
  * Boot-time setup for kernfs: create the slab cache used for struct
  * kernfs_node and run the inode-side initialization.  The cache is created
  * with SLAB_PANIC and its return value is not checked here.
  */
 void __init kernfs_init(void)
 {
 	kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
 					      sizeof(struct kernfs_node),
 					      0, SLAB_PANIC, NULL);
 	kernfs_inode_init();
 }

 /*
  * fs/sysfs/mount.c - operations for initializing and mounting sysfs
  *
  * Copyright (c) 2001-3 Patrick Mochel
  * Copyright (c) 2007 SUSE Linux Products GmbH
  * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
  *
  * This file is released under the GPLv2.
  *
  * Please see Documentation/filesystems/sysfs.txt for more information.
  */
 #define DEBUG
 #include <linux/fs.h>
+#include <linux/magic.h>
 #include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/user_namespace.h>
 #include "sysfs.h"
 static struct kernfs_root *sysfs_root;
 struct kernfs_node *sysfs_root_kn;
 static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
 	struct dentry *root;
 	void *ns;
 	bool new_sb;
 	if (!(flags & MS_KERNMOUNT)) {
 		if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
 			return ERR_PTR(-EPERM);
 		if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
 			return ERR_PTR(-EPERM);
 	}
 	ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
-	root = kernfs_mount_ns(fs_type, flags, sysfs_root, &new_sb, ns);
+	root = kernfs_mount_ns(fs_type, flags, sysfs_root,
+				SYSFS_MAGIC, &new_sb, ns);
 	if (IS_ERR(root) || !new_sb)
 		kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
 	return root;
 }
 /* kill_sb for sysfs: tear down the kernfs superblock, then drop its ns tag. */
 static void sysfs_kill_sb(struct super_block *sb)
 {
 	/* read the tag first; kernfs_kill_sb() frees the info that holds it */
 	void *tag = (void *)kernfs_super_ns(sb);
 	kernfs_kill_sb(sb);
 	kobj_ns_drop(KOBJ_NS_TYPE_NET, tag);
 }
 /*
  * The "sysfs" file system type, registered by sysfs_init().  Mounting goes
  * through sysfs_mount() and superblock teardown through sysfs_kill_sb().
  */
 static struct file_system_type sysfs_fs_type = {
 	.name		= "sysfs",
 	.mount		= sysfs_mount,
 	.kill_sb	= sysfs_kill_sb,
 	.fs_flags	= FS_USERNS_MOUNT,
 };
 /*
  * Create the sysfs kernfs hierarchy and register the sysfs file system.
  * Returns 0 on success or a negative errno; if registration fails the
  * freshly created root is destroyed again.
  */
 int __init sysfs_init(void)
 {
 	int ret;
 	sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
 					NULL);
 	if (IS_ERR(sysfs_root))
 		return PTR_ERR(sysfs_root);
 	sysfs_root_kn = sysfs_root->kn;
 	ret = register_filesystem(&sysfs_fs_type);
 	if (ret)
 		kernfs_destroy_root(sysfs_root);
 	return ret;
 }

 /*
  * kernfs.h - pseudo filesystem decoupled from vfs locking
  *
  * This file is released under the GPLv2.
  */
 #ifndef __LINUX_KERNFS_H
 #define __LINUX_KERNFS_H
 #include <linux/kernel.h>
 #include <linux/err.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
 #include <linux/idr.h>
 #include <linux/lockdep.h>
 #include <linux/rbtree.h>
 #include <linux/atomic.h>
 #include <linux/wait.h>
 struct file;
 struct dentry;
 struct iattr;
 struct seq_file;
 struct vm_area_struct;
 struct super_block;
 struct file_system_type;
 struct kernfs_open_node;
 struct kernfs_iattrs;
 /*
  * Node type bits.  They live in the low bits of kernfs_node->flags and are
  * extracted with KERNFS_TYPE_MASK (see kernfs_type()).
  */
 enum kernfs_node_type {
 	KERNFS_DIR		= 0x0001,
 	KERNFS_FILE		= 0x0002,
 	KERNFS_LINK		= 0x0004,
 };
 #define KERNFS_TYPE_MASK	0x000f
 #define KERNFS_FLAG_MASK	~KERNFS_TYPE_MASK
 /*
  * Per-node flag bits.  All values lie outside KERNFS_TYPE_MASK, so they
  * share kernfs_node->flags with the type bits.  KERNFS_NS is set by
  * kernfs_enable_ns() and tested by kernfs_ns_enabled().
  */
 enum kernfs_node_flag {
 	KERNFS_ACTIVATED	= 0x0010,
 	KERNFS_NS		= 0x0020,
 	KERNFS_HAS_SEQ_SHOW	= 0x0040,
 	KERNFS_HAS_MMAP		= 0x0080,
 	KERNFS_LOCKDEP		= 0x0100,
 	KERNFS_STATIC_NAME	= 0x0200,
 	KERNFS_SUICIDAL		= 0x0400,
 	KERNFS_SUICIDED		= 0x0800,
 };
 /* @flags for kernfs_create_root() */
 enum kernfs_root_flag {
 	/*
 	 * kernfs_nodes are created in the deactivated state and invisible.
 	 * They require explicit kernfs_activate() to become visible.  This
 	 * can be used to make related nodes become visible atomically
 	 * after all nodes are created successfully.
 	 */
 	KERNFS_ROOT_CREATE_DEACTIVATED		= 0x0001,
 	/*
 	 * For regular files, if the opener has CAP_DAC_OVERRIDE, open(2)
 	 * succeeds regardless of the RW permissions.  sysfs had an extra
 	 * layer of enforcement where open(2) fails with -EACCES regardless
 	 * of CAP_DAC_OVERRIDE if the permission doesn't have the
 	 * respective read or write access at all (none of S_IRUGO or
 	 * S_IWUGO) or the respective operation isn't implemented.  The
 	 * following flag enables that behavior.
 	 */
 	KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK	= 0x0002,
 };
 /* type-specific structures for kernfs_node union members */
 struct kernfs_elem_dir {
 	unsigned long		subdirs;
 	/* children rbtree starts here and goes through kn->rb */
 	struct rb_root		children;
 	/*
 	 * The kernfs hierarchy this directory belongs to.  This fits
 	 * better directly in kernfs_node but is here to save space.
 	 */
 	struct kernfs_root	*root;
 };
 struct kernfs_elem_symlink {
 	struct kernfs_node	*target_kn;
 };
 struct kernfs_elem_attr {
 	const struct kernfs_ops	*ops;
 	struct kernfs_open_node	*open;
 	loff_t			size;
 };
 /*
  * kernfs_node - the building block of kernfs hierarchy.  Each and every
  * kernfs node is represented by single kernfs_node.  Most fields are
  * private to kernfs and shouldn't be accessed directly by kernfs users.
  *
  * As long as count reference is held, the kernfs_node itself is
  * accessible.  Dereferencing elem or any other outer entity requires
  * active reference.
  */
 struct kernfs_node {
 	atomic_t		count;
 	atomic_t		active;
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map	dep_map;
 #endif
 	/*
 	 * Use kernfs_get_parent() and kernfs_name/path() instead of
 	 * accessing the following two fields directly.  If the node is
 	 * never moved to a different parent, it is safe to access the
 	 * parent directly.
 	 */
 	struct kernfs_node	*parent;
 	const char		*name;
 	struct rb_node		rb;
 	const void		*ns;	/* namespace tag */
 	unsigned int		hash;	/* ns + name hash */
 	union {
 		struct kernfs_elem_dir		dir;
 		struct kernfs_elem_symlink	symlink;
 		struct kernfs_elem_attr		attr;
 	};
 	void			*priv;
 	unsigned short		flags;
 	umode_t			mode;
 	unsigned int		ino;
 	struct kernfs_iattrs	*iattr;
 };
 /*
  * kernfs_syscall_ops may be specified on kernfs_create_root() to support
  * syscalls.  These optional callbacks are invoked on the matching syscalls
  * and can perform any kernfs operations which don't necessarily have to be
  * the exact operation requested.  An active reference is held for each
  * kernfs_node parameter.
  */
 struct kernfs_syscall_ops {
 	int (*remount_fs)(struct kernfs_root *root, int *flags, char *data);
 	int (*show_options)(struct seq_file *sf, struct kernfs_root *root);
 	int (*mkdir)(struct kernfs_node *parent, const char *name,
 		     umode_t mode);
 	int (*rmdir)(struct kernfs_node *kn);
 	int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent,
 		      const char *new_name);
 };
 struct kernfs_root {
 	/* published fields */
 	struct kernfs_node	*kn;
 	unsigned int		flags;	/* KERNFS_ROOT_* flags */
 	/* private fields, do not use outside kernfs proper */
 	struct ida		ino_ida;
 	struct kernfs_syscall_ops *syscall_ops;
 	wait_queue_head_t	deactivate_waitq;
 };
 struct kernfs_open_file {
 	/* published fields */
 	struct kernfs_node	*kn;
 	struct file		*file;
 	void			*priv;
 	/* private fields, do not use outside kernfs proper */
 	struct mutex		mutex;
 	int			event;
 	struct list_head	list;
 	size_t			atomic_write_len;
 	bool			mmapped;
 	const struct vm_operations_struct *vm_ops;
 };
 struct kernfs_ops {
 	/*
 	 * Read is handled by either seq_file or raw_read().
 	 *
 	 * If seq_show() is present, seq_file path is active.  Other seq
 	 * operations are optional and if not implemented, the behavior is
 	 * equivalent to single_open().  @sf->private points to the
 	 * associated kernfs_open_file.
 	 *
 	 * read() is bounced through kernel buffer and a read larger than
 	 * PAGE_SIZE results in partial operation of PAGE_SIZE.
 	 */
 	int (*seq_show)(struct seq_file *sf, void *v);
 	void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
 	void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
 	void (*seq_stop)(struct seq_file *sf, void *v);
 	ssize_t (*read)(struct kernfs_open_file *of, char *buf, size_t bytes,
 			loff_t off);
 	/*
 	 * write() is bounced through kernel buffer.  If atomic_write_len
 	 * is not set, a write larger than PAGE_SIZE results in partial
 	 * operations of PAGE_SIZE chunks.  If atomic_write_len is set,
 	 * writes up to the specified size are executed atomically but
 	 * larger ones are rejected with -E2BIG.
 	 */
 	size_t atomic_write_len;
 	ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes,
 			 loff_t off);
 	int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma);
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lock_class_key	lockdep_key;
 #endif
 };
 #ifdef CONFIG_KERNFS
 /* Return the type bits (those within KERNFS_TYPE_MASK) of @kn->flags. */
 static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn)
 {
 	unsigned short type_bits = kn->flags & KERNFS_TYPE_MASK;
 	return type_bits;
 }
 /**
  * kernfs_enable_ns - turn on namespace support under a directory
  * @kn: directory of interest, should be empty
  *
  * Meant to be called right after @kn has been created.  Once enabled, all
  * children of @kn must carry non-NULL namespace tags and only those whose
  * tag matches the super_block's tag will be visible.  Warns once if @kn
  * is not a directory or already has children.
  */
 static inline void kernfs_enable_ns(struct kernfs_node *kn)
 {
 	WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR);
 	WARN_ON_ONCE(!RB_EMPTY_ROOT(&kn->dir.children));
 	kn->flags = kn->flags | KERNFS_NS;
 }
 /**
  * kernfs_ns_enabled - test whether namespace is enabled
  * @kn: the node to test
  *
  * Returns %true if KERNFS_NS is set on @kn, i.e. namespace filtering is
  * enabled for the children of @kn.
  */
 static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
 {
 	return (kn->flags & KERNFS_NS) != 0;
 }
 int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen);
 char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
 				size_t buflen);
 void pr_cont_kernfs_name(struct kernfs_node *kn);
 void pr_cont_kernfs_path(struct kernfs_node *kn);
 struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn);
 struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
 					   const char *name, const void *ns);
 void kernfs_get(struct kernfs_node *kn);
 void kernfs_put(struct kernfs_node *kn);
 struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry);
 struct kernfs_root *kernfs_root_from_sb(struct super_block *sb);
 struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
 				       unsigned int flags, void *priv);
 void kernfs_destroy_root(struct kernfs_root *root);
 struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
 					 const char *name, umode_t mode,
 					 void *priv, const void *ns);
 struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
 					 const char *name,
 					 umode_t mode, loff_t size,
 					 const struct kernfs_ops *ops,
 					 void *priv, const void *ns,
 					 bool name_is_static,
 					 struct lock_class_key *key);
 struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
 				       const char *name,
 				       struct kernfs_node *target);
 void kernfs_activate(struct kernfs_node *kn);
 void kernfs_remove(struct kernfs_node *kn);
 void kernfs_break_active_protection(struct kernfs_node *kn);
 void kernfs_unbreak_active_protection(struct kernfs_node *kn);
 bool kernfs_remove_self(struct kernfs_node *kn);
 int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
 			     const void *ns);
 int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
 		     const char *new_name, const void *new_ns);
 int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
 void kernfs_notify(struct kernfs_node *kn);
 const void *kernfs_super_ns(struct super_block *sb);
 struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
-			       struct kernfs_root *root, bool *new_sb_created,
+			       struct kernfs_root *root, unsigned long magic,
-			       const void *ns);
+			       bool *new_sb_created, const void *ns);
 void kernfs_kill_sb(struct super_block *sb);
 void kernfs_init(void);
 #else	/* CONFIG_KERNFS */
 static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn)
 { return 0; }	/* whatever */
 static inline void kernfs_enable_ns(struct kernfs_node *kn) { }
 static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
 { return false; }
 static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
 { return -ENOSYS; }
 static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
 					      size_t buflen)
 { return NULL; }
 static inline void pr_cont_kernfs_name(struct kernfs_node *kn) { }
 static inline void pr_cont_kernfs_path(struct kernfs_node *kn) { }
 static inline struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
 { return NULL; }
 static inline struct kernfs_node *
 kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name,
 		       const void *ns)
 { return NULL; }
 static inline void kernfs_get(struct kernfs_node *kn) { }
 static inline void kernfs_put(struct kernfs_node *kn) { }
 static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
 { return NULL; }
 static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
 { return NULL; }
 static inline struct kernfs_root *
 kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags,
 		   void *priv)
 { return ERR_PTR(-ENOSYS); }
 static inline void kernfs_destroy_root(struct kernfs_root *root) { }
 static inline struct kernfs_node *
 kernfs_create_dir_ns(struct kernfs_node *parent, const char *name,
 		     umode_t mode, void *priv, const void *ns)
 { return ERR_PTR(-ENOSYS); }
 static inline struct kernfs_node *
 __kernfs_create_file(struct kernfs_node *parent, const char *name,
 		     umode_t mode, loff_t size, const struct kernfs_ops *ops,
 		     void *priv, const void *ns, bool name_is_static,
 		     struct lock_class_key *key)
 { return ERR_PTR(-ENOSYS); }
 static inline struct kernfs_node *
 kernfs_create_link(struct kernfs_node *parent, const char *name,
 		   struct kernfs_node *target)
 { return ERR_PTR(-ENOSYS); }
 static inline void kernfs_activate(struct kernfs_node *kn) { }
 static inline void kernfs_remove(struct kernfs_node *kn) { }
 static inline bool kernfs_remove_self(struct kernfs_node *kn)
 { return false; }
 static inline int kernfs_remove_by_name_ns(struct kernfs_node *kn,
 					   const char *name, const void *ns)
 { return -ENOSYS; }
 static inline int kernfs_rename_ns(struct kernfs_node *kn,
 				   struct kernfs_node *new_parent,
 				   const char *new_name, const void *new_ns)
 { return -ENOSYS; }
 static inline int kernfs_setattr(struct kernfs_node *kn,
 				 const struct iattr *iattr)
 { return -ENOSYS; }
 static inline void kernfs_notify(struct kernfs_node *kn) { }
 static inline const void *kernfs_super_ns(struct super_block *sb)
 { return NULL; }
 static inline struct dentry *
 kernfs_mount_ns(struct file_system_type *fs_type, int flags,
-		struct kernfs_root *root, bool *new_sb_created, const void *ns)
+		struct kernfs_root *root, unsigned long magic,
+		bool *new_sb_created, const void *ns)
 { return ERR_PTR(-ENOSYS); }
 static inline void kernfs_kill_sb(struct super_block *sb) { }
 static inline void kernfs_init(void) { }
 #endif	/* CONFIG_KERNFS */
 /* kernfs_find_and_get_ns() with a %NULL namespace tag. */
 static inline struct kernfs_node *
 kernfs_find_and_get(struct kernfs_node *kn, const char *name)
 {
 	return kernfs_find_and_get_ns(kn, name, NULL);
 }
 /* kernfs_create_dir_ns() with a %NULL namespace tag. */
 static inline struct kernfs_node *
 kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode,
 		  void *priv)
 {
 	return kernfs_create_dir_ns(parent, name, mode, priv, NULL);
 }
 /*
  * Wrapper around __kernfs_create_file() that passes name_is_static as
  * false.  With CONFIG_DEBUG_LOCK_ALLOC the lockdep key embedded in @ops is
  * passed along; otherwise the key argument is %NULL.
  */
 static inline struct kernfs_node *
 kernfs_create_file_ns(struct kernfs_node *parent, const char *name,
 		      umode_t mode, loff_t size, const struct kernfs_ops *ops,
 		      void *priv, const void *ns)
 {
 	struct lock_class_key *key = NULL;
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	/* the cast drops the const that @ops carries */
 	key = (struct lock_class_key *)&ops->lockdep_key;
 #endif
 	return __kernfs_create_file(parent, name, mode, size, ops, priv, ns,
 				    false, key);
 }
 /* kernfs_create_file_ns() with a %NULL namespace tag. */
 static inline struct kernfs_node *
 kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode,
 		   loff_t size, const struct kernfs_ops *ops, void *priv)
 {
 	return kernfs_create_file_ns(parent, name, mode, size, ops, priv, NULL);
 }
 /* kernfs_remove_by_name_ns() with a %NULL namespace tag. */
 static inline int kernfs_remove_by_name(struct kernfs_node *parent,
 					const char *name)
 {
 	return kernfs_remove_by_name_ns(parent, name, NULL);
 }
 /* kernfs_rename_ns() with a %NULL new namespace tag. */
 static inline int kernfs_rename(struct kernfs_node *kn,
 				struct kernfs_node *new_parent,
 				const char *new_name)
 {
 	return kernfs_rename_ns(kn, new_parent, new_name, NULL);
 }
 static inline struct dentry *
 kernfs_mount(struct file_system_type *fs_type, int flags,
-	     struct kernfs_root *root, bool *new_sb_created)
+		struct kernfs_root *root, unsigned long magic,
+		bool *new_sb_created)
 {
-	return kernfs_mount_ns(fs_type, flags, root, new_sb_created, NULL);
+	return kernfs_mount_ns(fs_type, flags, root,
+				magic, new_sb_created, NULL);
 }
 #endif	/* __LINUX_KERNFS_H */

 /*
  *  Generic process-grouping system.
  *
  *  Based originally on the cpuset system, extracted by Paul Menage
  *  Copyright (C) 2006 Google, Inc
  *
  *  Notifications support
  *  Copyright (C) 2009 Nokia Corporation
  *  Author: Kirill A. Shutemov
  *
  *  Copyright notices from the original cpuset code:
  *  --------------------------------------------------
  *  Copyright (C) 2003 BULL SA.
  *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
  *
  *  Portions derived from Patrick Mochel's sysfs code.
  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
  *
  *  2003-10-10 Written by Simon Derr.
  *  2003-10-22 Updates by Stephen Hemminger.
  *  2004 May-July Rework by Paul Jackson.
  *  ---------------------------------------------------
  *
  *  This file is subject to the terms and conditions of the GNU General Public
  *  License.  See the file COPYING in the main directory of the Linux
  *  distribution for more details.
  */
 #include <linux/cgroup.h>
 #include <linux/cred.h>
 #include <linux/ctype.h>
 #include <linux/errno.h>
 #include <linux/init_task.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
+#include <linux/magic.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
 #include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/rwsem.h>
 #include <linux/string.h>
 #include <linux/sort.h>
 #include <linux/kmod.h>
 #include <linux/delayacct.h>
 #include <linux/cgroupstats.h>
 #include <linux/hashtable.h>
 #include <linux/pid_namespace.h>
 #include <linux/idr.h>
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 #include <linux/kthread.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
 /*
  * pidlists linger the following amount before being destroyed.  The goal
  * is avoiding frequent destruction in the middle of consecutive read calls.
  * Expiring in the middle is a performance problem not a correctness one.
  * 1 sec should be enough.
  */
 #define CGROUP_PIDLIST_DESTROY_DELAY	HZ
 #define CGROUP_FILE_NAME_MAX		(MAX_CGROUP_TYPE_NAMELEN +	\
 					 MAX_CFTYPE_NAME + 2)
 /*
  * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
  * creation/removal and hierarchy changing operations including cgroup
  * creation, removal, css association and controller rebinding.  This outer
  * lock is needed mainly to resolve the circular dependency between kernfs
  * active ref and cgroup_mutex.  cgroup_tree_mutex nests above both.
  */
 static DEFINE_MUTEX(cgroup_tree_mutex);
 /*
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
  *
  * css_set_rwsem protects task->cgroups pointer, the list of css_set
  * objects, and the chain of tasks off each css_set.
  *
  * These locks are exported if CONFIG_PROVE_RCU so that accessors in
  * cgroup.h can use them for lockdep annotations.
  */
 #ifdef CONFIG_PROVE_RCU
 DEFINE_MUTEX(cgroup_mutex);
 DECLARE_RWSEM(css_set_rwsem);
 EXPORT_SYMBOL_GPL(cgroup_mutex);
 EXPORT_SYMBOL_GPL(css_set_rwsem);
 #else
 static DEFINE_MUTEX(cgroup_mutex);
 static DECLARE_RWSEM(css_set_rwsem);
 #endif
 /*
  * Protects cgroup_subsys->release_agent_path.  Modifying it also requires
  * cgroup_mutex.  Reading requires either cgroup_mutex or this spinlock.
  */
 static DEFINE_SPINLOCK(release_agent_path_lock);
 #define cgroup_assert_mutexes_or_rcu_locked()				\
 	rcu_lockdep_assert(rcu_read_lock_held() ||			\
 			   lockdep_is_held(&cgroup_tree_mutex) ||	\
 			   lockdep_is_held(&cgroup_mutex),		\
 			   "cgroup_[tree_]mutex or RCU read lock required");
 /*
  * cgroup destruction makes heavy use of work items and there can be a lot
  * of concurrent destructions.  Use a separate workqueue so that cgroup
  * destruction work items don't end up filling up max_active of system_wq
  * which may lead to deadlock.
  */
 static struct workqueue_struct *cgroup_destroy_wq;
 /*
  * pidlist destructions need to be flushed on cgroup destruction.  Use a
  * separate workqueue as flush domain.
  */
 static struct workqueue_struct *cgroup_pidlist_destroy_wq;
 /* generate an array of cgroup subsystem pointers */
 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
 static struct cgroup_subsys *cgroup_subsys[] = {
 #include <linux/cgroup_subsys.h>
 };
 #undef SUBSYS
 /* array of cgroup subsystem names */
 #define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
 static const char *cgroup_subsys_name[] = {
 #include <linux/cgroup_subsys.h>
 };
 #undef SUBSYS
 /*
  * The default hierarchy, reserved for the subsystems that are otherwise
  * unattached - it never has more than a single cgroup, and all tasks are
  * part of that cgroup.
  */
 struct cgroup_root cgrp_dfl_root;
 /*
  * The default hierarchy always exists but is hidden until mounted for the
  * first time.  This is for backward compatibility.
  */
 static bool cgrp_dfl_root_visible;
 /* The list of hierarchy roots */
 static LIST_HEAD(cgroup_roots);
 static int cgroup_root_count;
 /* hierarchy ID allocation and mapping, protected by cgroup_mutex */
 static DEFINE_IDR(cgroup_hierarchy_idr);
 /*
  * Assign a monotonically increasing serial number to cgroups.  It
  * guarantees cgroups with bigger numbers are newer than those with smaller
  * numbers.  Also, as cgroups are always appended to the parent's
  * ->children list, it guarantees that sibling cgroups are always sorted in
  * the ascending serial number order on the list.  Protected by
  * cgroup_mutex.
  */
 static u64 cgroup_serial_nr_next = 1;
 /* This flag indicates whether tasks in the fork and exit paths should
  * check for fork/exit handlers to call. This avoids us having to do
  * extra work in the fork/exit path if none of the subsystems need to
  * be called.
  */
 static int need_forkexit_callback __read_mostly;
 static struct cftype cgroup_base_files[];
 static void cgroup_put(struct cgroup *cgrp);
 static int rebind_subsystems(struct cgroup_root *dst_root,
 			     unsigned long ss_mask);
 static void cgroup_destroy_css_killed(struct cgroup *cgrp);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
 /**
  * cgroup_css - obtain a cgroup's css for the specified subsystem
  * @cgrp: the cgroup of interest
  * @ss: the subsystem of interest (%NULL returns the dummy_css)
  *
  * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  The
  * lookup is an RCU dereference whose lockdep check accepts
  * cgroup_tree_mutex or cgroup_mutex being held, so callers must hold one
  * of those or be inside rcu_read_lock().  The caller is responsible for
  * pinning the returned css if it wants to keep using it outside the said
  * locks.  May return %NULL if @cgrp doesn't have @ss enabled.
  */
 static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
 					      struct cgroup_subsys *ss)
 {
 	if (!ss)
 		return &cgrp->dummy_css;
 	return rcu_dereference_check(cgrp->subsys[ss->id],
 				     lockdep_is_held(&cgroup_tree_mutex) ||
 				     lockdep_is_held(&cgroup_mutex));
 }
 /* convenience test: is the CGRP_DEAD bit set in @cgrp->flags? */
 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 {
 	return test_bit(CGRP_DEAD, &cgrp->flags);
 }
 /*
  * seq_css - find the css a cgroup seq_file operation is running against.
  * The cgroup is taken from the priv pointer of the file's parent node and
  * the subsystem from the file's cftype; a cftype without a subsystem
  * yields the cgroup's dummy_css.
  */
 struct cgroup_subsys_state *seq_css(struct seq_file *seq)
 {
 	struct kernfs_open_file *open_file = seq->private;
 	struct cgroup *cgrp = open_file->kn->parent->priv;
 	struct cftype *cft = seq_cft(seq);
 	if (!cft->ss)
 		return &cgrp->dummy_css;
 	/*
 	 * Open-coded, unprotected variant of cgroup_css().  seq_css() is
 	 * only reached from a kernfs file operation, which holds an active
 	 * reference on the file.  All subsystem files are drained before a
 	 * css is disassociated from its cgroup, so the entry found in the
 	 * cgroup's subsys table is valid and stays valid until the
 	 * enclosing operation has finished.
 	 */
 	return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
 }
 EXPORT_SYMBOL_GPL(seq_css);
 /**
  * cgroup_is_descendant - test ancestry
  * @cgrp: the cgroup to be tested
  * @ancestor: possible ancestor of @cgrp
  *
  * Walk the ->parent chain starting at @cgrp and return %true as soon as
  * @ancestor is met; @cgrp == @ancestor therefore also counts.  Returns
  * %false once the chain ends without a match.  Safe to call as long as
  * @cgrp and @ancestor are accessible.
  */
 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
 {
 	struct cgroup *pos;
 	for (pos = cgrp; pos; pos = pos->parent)
 		if (pos == ancestor)
 			return true;
 	return false;
 }
 /*
  * Non-zero only when both CGRP_RELEASABLE and CGRP_NOTIFY_ON_RELEASE are
  * set in @cgrp->flags.
  */
 static int cgroup_is_releasable(const struct cgroup *cgrp)
 {
 	const int required = (1 << CGRP_RELEASABLE) |
 			     (1 << CGRP_NOTIFY_ON_RELEASE);
 	return (cgrp->flags & required) == required;
 }
 /* Test whether the CGRP_NOTIFY_ON_RELEASE bit is set in @cgrp->flags. */
 static int notify_on_release(const struct cgroup *cgrp)
 {
 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 }
 /**
  * for_each_css - iterate all css's of a cgroup
  * @css: the iteration cursor
  * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
  * @cgrp: the target cgroup to iterate css's of
  *
  * Should be called under cgroup_mutex.
  */
 #define for_each_css(css, ssid, cgrp)					\
 	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
 		if (!((css) = rcu_dereference_check(			\
 				(cgrp)->subsys[(ssid)],			\
 				lockdep_is_held(&cgroup_tree_mutex) ||	\
 				lockdep_is_held(&cgroup_mutex)))) { }	\
 		else
 /**
  * for_each_subsys - iterate all enabled cgroup subsystems
  * @ss: the iteration cursor
  * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
  *
  * Every index in [0, CGROUP_SUBSYS_COUNT) is visited.  The "|| true" lets
  * the assignment to @ss live in the loop condition without a NULL
  * cgroup_subsys[] entry terminating the walk, so the body would see
  * @ss == NULL for such an entry.  No per-subsystem filtering (e.g. on
  * ->disabled) is done here; callers check that themselves.
  */
 #define for_each_subsys(ss, ssid)					\
 	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&		\
 	     (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
 /*
  * Iterate across the hierarchies: walks every cgroup_root linked on
  * cgroup_roots through its ->root_list member, using @root as the cursor.
  */
 #define for_each_root(root)						\
 	list_for_each_entry((root), &cgroup_roots, root_list)
 /**
  * cgroup_lock_live_group - grab cgroup_mutex if @cgrp is still alive
  * @cgrp: the cgroup whose liveness is checked
  *
  * Takes cgroup_mutex and tests @cgrp with cgroup_is_dead().  Returns %true
  * with the mutex held when @cgrp is alive; the caller must unlock it
  * later.  Returns %false with the mutex already released otherwise.
  */
 static bool cgroup_lock_live_group(struct cgroup *cgrp)
 {
 	bool alive;

 	mutex_lock(&cgroup_mutex);

 	alive = !cgroup_is_dead(cgrp);
 	if (!alive)
 		mutex_unlock(&cgroup_mutex);

 	return alive;
 }
 /*
  * The list of cgroups eligible for automatic release.  Protected by
  * release_list_lock.
  */
 static LIST_HEAD(release_list);
 static DEFINE_RAW_SPINLOCK(release_list_lock);
 /* work item whose handler is cgroup_release_agent() */
 static void cgroup_release_agent(struct work_struct *work);
 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
 /* declared here so that put_css_set_locked() below can call it */
 static void check_for_release(struct cgroup *cgrp);
 /*
  * A cgroup can be associated with multiple css_sets as different tasks may
  * belong to different cgroups on different hierarchies.  In the other
  * direction, a css_set is naturally associated with multiple cgroups.
  * This M:N relationship is represented by the following link structure
  * which exists for each association and allows traversing the associations
  * from both sides.
  *
  * Each link sits on two lists at once: ->cset_link chains it on the
  * cgroup side, ->cgrp_link on the css_set side.
  */
 struct cgrp_cset_link {
 	/* the cgroup and css_set this link associates */
 	struct cgroup		*cgrp;
 	struct css_set		*cset;
 	/* list of cgrp_cset_links anchored at cgrp->cset_links */
 	struct list_head	cset_link;
 	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
 	struct list_head	cgrp_link;
 };
 /*
  * The default css_set - used by init and its children prior to any
  * hierarchies being mounted. It contains a pointer to the root state
  * for each subsystem. Also used to anchor the list of css_sets. Not
  * reference-counted, to improve performance when child cgroups
  * haven't been created.
  */
 struct css_set init_css_set = {
 	.refcount		= ATOMIC_INIT(1),
 	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
 	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
 	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
 	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
 	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
 };
 /*
  * Number of css_sets in existence.  Incremented in find_css_set() and
  * decremented in put_css_set_locked(), both with css_set_rwsem held for
  * writing.
  */
 static int css_set_count	= 1;	/* 1 for init_css_set */
 /*
  * Hash table of css_sets, keyed by css_set_hash() over a css_set's subsys
  * pointer array.  This improves the performance to find an existing
  * css_set.  This hash doesn't (currently) take into account cgroups in
  * empty hierarchies.
  */
 #define CSS_SET_HASH_BITS	7
 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
 static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 {
 	unsigned long key = 0UL;
 	struct cgroup_subsys *ss;
 	int i;
 	for_each_subsys(ss, i)
 		key += (unsigned long)css[i];
 	key = (key >> 16) ^ key;
 	return key;
 }
 /*
  * Drop one reference on @cset with css_set_rwsem already held.  When the
  * last reference goes away the css_set is unhashed, every cgrp_cset_link
  * hanging off it is unlinked and freed, and the css_set itself is freed
  * after an RCU grace period.
  *
  * For each cgroup left without any css_set that has notify_on_release
  * enabled, check_for_release() is invoked; @taskexit additionally marks
  * such a cgroup CGRP_RELEASABLE first.
  */
 static void put_css_set_locked(struct css_set *cset, bool taskexit)
 {
 	struct cgrp_cset_link *link, *next;

 	lockdep_assert_held(&css_set_rwsem);

 	if (!atomic_dec_and_test(&cset->refcount))
 		return;

 	/* Last reference gone: unhash and release the cgroup links */
 	hash_del(&cset->hlist);
 	css_set_count--;

 	list_for_each_entry_safe(link, next, &cset->cgrp_links, cgrp_link) {
 		struct cgroup *cgrp = link->cgrp;

 		list_del(&link->cset_link);
 		list_del(&link->cgrp_link);

 		/* @cgrp can't go away while we're holding css_set_rwsem */
 		if (list_empty(&cgrp->cset_links)) {
 			if (notify_on_release(cgrp)) {
 				if (taskexit)
 					set_bit(CGRP_RELEASABLE, &cgrp->flags);
 				check_for_release(cgrp);
 			}
 		}

 		kfree(link);
 	}

 	kfree_rcu(cset, rcu_head);
 }
 /*
  * Drop one reference on @cset.  The refcount must not reach zero while
  * readers can still see the css_set, so the final put is performed by
  * put_css_set_locked() under css_set_rwsem held for writing.  This is
  * similar in spirit to atomic_dec_and_lock(), but for an rwsem.
  */
 static void put_css_set(struct css_set *cset, bool taskexit)
 {
 	/* Fast path: decrement unless this would be the last reference */
 	if (!atomic_add_unless(&cset->refcount, -1, 1)) {
 		down_write(&css_set_rwsem);
 		put_css_set_locked(cset, taskexit);
 		up_write(&css_set_rwsem);
 	}
 }
 /*
  * refcounted get/put for css_set objects
  *
  * get_css_set() unconditionally bumps @cset->refcount; it is paired with
  * put_css_set() / put_css_set_locked().
  */
 static inline void get_css_set(struct css_set *cset)
 {
 	atomic_inc(&cset->refcount);
 }
 /**
  * compare_css_sets - helper function for find_existing_css_set().
  * @cset: candidate css_set being tested
  * @old_cset: existing css_set for a task
  * @new_cgrp: cgroup that's being entered by the task
  * @template: desired set of css pointers in css_set (pre-calculated)
  *
  * Returns true if "cset" matches "old_cset" except for the hierarchy
  * which "new_cgrp" belongs to, for which it should match "new_cgrp".
  */
 static bool compare_css_sets(struct css_set *cset,
 			     struct css_set *old_cset,
 			     struct cgroup *new_cgrp,
 			     struct cgroup_subsys_state *template[])
 {
 	struct list_head *const head = &cset->cgrp_links;
 	struct list_head *const old_head = &old_cset->cgrp_links;
 	struct list_head *pos, *old_pos;

 	/* Cheap filter first: every subsystem pointer has to match */
 	if (memcmp(template, cset->subsys, sizeof(cset->subsys)) != 0)
 		return false;

 	/*
 	 * The subsystem pointers alone cannot tell apart different cgroups
 	 * in hierarchies with no subsystems, so the cgroup pointers are
 	 * compared as well.  This check alone would be sufficient, but on
 	 * most setups the memcmp above rejects almost all candidates before
 	 * this more expensive walk is needed.
 	 *
 	 * Both link lists are walked in lock-step; they are expected to be
 	 * of equal length.
 	 */
 	for (pos = head->next, old_pos = old_head->next;
 	     pos != head;
 	     pos = pos->next, old_pos = old_pos->next) {
 		struct cgrp_cset_link *link, *old_link;
 		struct cgroup *cand, *old, *expected;

 		BUG_ON(old_pos == old_head);

 		/* Locate the cgroups associated with these links. */
 		link = list_entry(pos, struct cgrp_cset_link, cgrp_link);
 		old_link = list_entry(old_pos, struct cgrp_cset_link,
 				      cgrp_link);
 		cand = link->cgrp;
 		old = old_link->cgrp;

 		/* Hierarchies should be linked in the same order. */
 		BUG_ON(cand->root != old->root);

 		/*
 		 * On the hierarchy that is changing the candidate has to
 		 * point at the new cgroup; on every other hierarchy it has
 		 * to point at the same cgroup as the old css_set.
 		 */
 		expected = (cand->root == new_cgrp->root) ? new_cgrp : old;
 		if (cand != expected)
 			return false;
 	}

 	/* The candidate's list ended, so the old one must have as well */
 	BUG_ON(old_pos != old_head);

 	return true;
 }
 /**
  * find_existing_css_set - init css array and find the matching css_set
  * @old_cset: the css_set that we're using before the cgroup transition
  * @cgrp: the cgroup that we're moving into
  * @template: out param for the new set of csses, should be clear on entry
  *
  * Fills @template with the css pointers the wanted css_set must carry and
  * looks it up in css_set_table.  Returns the matching css_set, or NULL if
  * there is none.  No reference is taken on the returned css_set.
  */
 static struct css_set *find_existing_css_set(struct css_set *old_cset,
 					struct cgroup *cgrp,
 					struct cgroup_subsys_state *template[])
 {
 	struct cgroup_root *dst_root = cgrp->root;
 	struct cgroup_subsys *ss;
 	struct css_set *cand;
 	unsigned long key;
 	int ssid;

 	/*
 	 * Build the set of subsystem state objects that we want to see in the
 	 * new css_set. while subsystems can change globally, the entries here
 	 * won't change, so no need for locking.
 	 */
 	for_each_subsys(ss, ssid) {
 		/*
 		 * A subsystem bound to this hierarchy takes its state from
 		 * the new cgroup; any other subsystem keeps the state the
 		 * old css_set had.
 		 */
 		if (dst_root->cgrp.subsys_mask & (1UL << ssid))
 			template[ssid] = cgroup_css(cgrp, ss);
 		else
 			template[ssid] = old_cset->subsys[ssid];
 	}

 	key = css_set_hash(template);
 	hash_for_each_possible(css_set_table, cand, hlist, key) {
 		/* This css_set matches what we need */
 		if (compare_css_sets(cand, old_cset, cgrp, template))
 			return cand;
 	}

 	/* No existing cgroup group matched */
 	return NULL;
 }
 static void free_cgrp_cset_links(struct list_head *links_to_free)
 {
 	struct cgrp_cset_link *link, *tmp_link;
 	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
 		list_del(&link->cset_link);
 		kfree(link);
 	}
 }
 /**
  * allocate_cgrp_cset_links - allocate cgrp_cset_links
  * @count: the number of links to allocate
  * @tmp_links: list_head the allocated links are put on
  *
  * Initializes @tmp_links and chains @count zeroed cgrp_cset_link
  * structures on it through ->cset_link.  Returns 0 on success.  On
  * allocation failure everything allocated so far is freed again and
  * -ENOMEM is returned.
  */
 static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
 {
 	int nr;

 	INIT_LIST_HEAD(tmp_links);

 	for (nr = 0; nr < count; nr++) {
 		struct cgrp_cset_link *link;

 		link = kzalloc(sizeof(*link), GFP_KERNEL);
 		if (!link)
 			goto err_free;

 		list_add(&link->cset_link, tmp_links);
 	}

 	return 0;

 err_free:
 	free_cgrp_cset_links(tmp_links);
 	return -ENOMEM;
 }
 /**
  * link_css_set - a helper function to link a css_set to a cgroup
  * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
  * @cset: the css_set to be linked
  * @cgrp: the destination cgroup
  *
  * Consumes the first link on @tmp_links; it is a bug for @tmp_links to be
  * empty.
  */
 static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
 			 struct cgroup *cgrp)
 {
 	struct cgrp_cset_link *new_link;

 	BUG_ON(list_empty(tmp_links));

 	new_link = list_first_entry(tmp_links, struct cgrp_cset_link,
 				    cset_link);
 	new_link->cset = cset;
 	new_link->cgrp = cgrp;

 	/* Hand the link over from the temporary list to the cgroup */
 	list_move(&new_link->cset_link, &cgrp->cset_links);

 	/*
 	 * On the css_set side links are always appended, which keeps that
 	 * list sorted by order of hierarchy creation.
 	 */
 	list_add_tail(&new_link->cgrp_link, &cset->cgrp_links);
 }
 /**
  * find_css_set - return a new css_set with one cgroup updated
  * @old_cset: the baseline css_set
  * @cgrp: the cgroup to be updated
  *
  * Return a new css_set that's equivalent to @old_cset, but with @cgrp
  * substituted into the appropriate hierarchy.
  *
  * An existing matching css_set is reused with its refcount bumped.
  * Otherwise a new one is allocated with a refcount of 1, linked to one
  * cgroup for every entry on @old_cset->cgrp_links and added to
  * css_set_table.  Returns NULL if an allocation fails.  Must be called
  * with cgroup_mutex held.
  */
 static struct css_set *find_css_set(struct css_set *old_cset,
 				    struct cgroup *cgrp)
 {
 	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
 	struct css_set *cset;
 	struct list_head tmp_links;
 	struct cgrp_cset_link *link;
 	unsigned long key;
 	lockdep_assert_held(&cgroup_mutex);
 	/* First see if we already have a cgroup group that matches
 	 * the desired set */
 	down_read(&css_set_rwsem);
 	cset = find_existing_css_set(old_cset, cgrp, template);
 	if (cset)
 		get_css_set(cset);
 	up_read(&css_set_rwsem);
 	if (cset)
 		return cset;
 	/* No match: build a new css_set.  Allocations happen unlocked. */
 	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
 	if (!cset)
 		return NULL;
 	/* Allocate all the cgrp_cset_link objects that we'll need */
 	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
 		kfree(cset);
 		return NULL;
 	}
 	atomic_set(&cset->refcount, 1);
 	INIT_LIST_HEAD(&cset->cgrp_links);
 	INIT_LIST_HEAD(&cset->tasks);
 	INIT_LIST_HEAD(&cset->mg_tasks);
 	INIT_LIST_HEAD(&cset->mg_preload_node);
 	INIT_LIST_HEAD(&cset->mg_node);
 	INIT_HLIST_NODE(&cset->hlist);
 	/* Copy the set of subsystem state objects generated in
 	 * find_existing_css_set() */
 	memcpy(cset->subsys, template, sizeof(cset->subsys));
 	down_write(&css_set_rwsem);
 	/* Add reference counts and links from the new css_set. */
 	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
 		struct cgroup *c = link->cgrp;
 		/* on @cgrp's own hierarchy, link to @cgrp instead */
 		if (c->root == cgrp->root)
 			c = cgrp;
 		link_css_set(&tmp_links, cset, c);
 	}
 	/* every preallocated link is expected to have been consumed */
 	BUG_ON(!list_empty(&tmp_links));
 	css_set_count++;
 	/* Add this cgroup group to the hash table */
 	key = css_set_hash(cset->subsys);
 	hash_add(css_set_table, &cset->hlist, key);
 	up_write(&css_set_rwsem);
 	return cset;
 }
 /*
  * Map a kernfs root back to its cgroup_root.  The kernfs root node's
  * ->priv holds the hierarchy's root cgroup, whose ->root is returned.
  */
 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 {
 	struct cgroup *top_cgrp;

 	top_cgrp = kf_root->kn->priv;

 	return top_cgrp->root;
 }
 /*
  * Allocate a hierarchy ID for @root from cgroup_hierarchy_idr and store it
  * in @root->hierarchy_id.  Returns 0 on success or the negative value
  * returned by idr_alloc_cyclic().  Must be called with cgroup_mutex held.
  */
 static int cgroup_init_root_id(struct cgroup_root *root)
 {
 	int id;

 	lockdep_assert_held(&cgroup_mutex);

 	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
 	if (id >= 0) {
 		root->hierarchy_id = id;
 		return 0;
 	}

 	return id;
 }
 /*
  * Release @root's hierarchy ID, if it has a non-zero one, and reset it to
  * 0.  Must be called with cgroup_mutex held.
  */
 static void cgroup_exit_root_id(struct cgroup_root *root)
 {
 	lockdep_assert_held(&cgroup_mutex);

 	if (!root->hierarchy_id)
 		return;

 	idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
 	root->hierarchy_id = 0;
 }
 /*
  * Free @root and its cgroup_idr.  A NULL @root is a no-op.  Warns once if
  * the hierarchy ID has not been released beforehand.
  */
 static void cgroup_free_root(struct cgroup_root *root)
 {
 	if (!root)
 		return;

 	/* hierarchy ID should already have been released */
 	WARN_ON_ONCE(root->hierarchy_id);

 	idr_destroy(&root->cgroup_idr);
 	kfree(root);
 }
 /*
  * Tear down the hierarchy rooted at @root.  Under cgroup_tree_mutex and
  * cgroup_mutex: hands the root's subsystems back to cgrp_dfl_root, frees
  * every cgrp_cset_link attached to the root cgroup, takes the root off
  * cgroup_roots if it was listed, and releases its hierarchy ID.  After
  * dropping both mutexes the kernfs root is destroyed and @root is freed.
  *
  * It is a bug to call this while the root still accounts for cgroups in
  * ->nr_cgrps or has child cgroups.
  */
 static void cgroup_destroy_root(struct cgroup_root *root)
 {
 	struct cgroup *cgrp = &root->cgrp;
 	struct cgrp_cset_link *link, *tmp_link;
 	mutex_lock(&cgroup_tree_mutex);
 	mutex_lock(&cgroup_mutex);
 	BUG_ON(atomic_read(&root->nr_cgrps));
 	BUG_ON(!list_empty(&cgrp->children));
 	/* Rebind all subsystems back to the default hierarchy */
 	rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask);
 	/*
 	 * Release all the links from cset_links to this hierarchy's
 	 * root cgroup
 	 */
 	down_write(&css_set_rwsem);
 	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
 		list_del(&link->cset_link);
 		list_del(&link->cgrp_link);
 		kfree(link);
 	}
 	up_write(&css_set_rwsem);
 	/* the root is only counted in cgroup_root_count while it is listed */
 	if (!list_empty(&root->root_list)) {
 		list_del(&root->root_list);
 		cgroup_root_count--;
 	}
 	cgroup_exit_root_id(root);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgroup_tree_mutex);
 	kernfs_destroy_root(root->kf_root);
 	cgroup_free_root(root);
 }
 /*
  * Look up the cgroup associated with the given css_set on the specified
  * hierarchy.  init_css_set always maps to the hierarchy's root cgroup; for
  * any other css_set its cgroup links are searched for one whose cgroup
  * belongs to @root.  Not finding one is a bug.
  *
  * Must be called with cgroup_mutex and css_set_rwsem held.
  */
 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 					    struct cgroup_root *root)
 {
 	struct cgrp_cset_link *link;
 	struct cgroup *found = NULL;

 	lockdep_assert_held(&cgroup_mutex);
 	lockdep_assert_held(&css_set_rwsem);

 	if (cset != &init_css_set) {
 		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 			if (link->cgrp->root != root)
 				continue;

 			found = link->cgrp;
 			break;
 		}
 	} else {
 		found = &root->cgrp;
 	}

 	BUG_ON(!found);
 	return found;
 }
 /*
  * Return the cgroup for "task" from the given hierarchy. Must be
  * called with cgroup_mutex and css_set_rwsem held.
  */
 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 					    struct cgroup_root *root)
 {
 	struct css_set *cset;

 	/*
 	 * The task itself needs no locking: with cgroup_mutex held it
 	 * cannot change groups, so the only thing that can happen is that
 	 * it exits and its css is set back to init_css_set.
 	 */
 	cset = task_css_set(task);

 	return cset_cgroup_from_root(cset, root);
 }
 /*
  * A task must hold cgroup_mutex to modify cgroups.
  *
  * Any task can increment and decrement the count field without lock.
  * So in general, code holding cgroup_mutex can't rely on the count
  * field not changing.  However, if the count goes to zero, then only
  * cgroup_attach_task() can increment it again.  Because a count of zero
  * means that no tasks are currently attached, therefore there is no
  * way a task attached to that cgroup can fork (the other way to
  * increment the count).  So code holding cgroup_mutex can safely
  * assume that if the count is zero, it will stay zero. Similarly, if
  * a task holds cgroup_mutex on a cgroup with zero count, it
  * knows that the cgroup won't be removed, as cgroup_rmdir()
  * needs that mutex.
  *
  * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
  * (usually) take cgroup_mutex.  These are the two most performance
  * critical pieces of code here.  The exception occurs on cgroup_exit(),
  * when a task in a notify_on_release cgroup exits.  Then cgroup_mutex
  * is taken, and if the cgroup count is zero, a usermode call made
  * to the release agent with the name of the cgroup (path relative to
  * the root of cgroup file system) as the argument.
  *
  * A cgroup can only be deleted if both its 'count' of using tasks
  * is zero, and its list of 'children' cgroups is empty.  Since all
  * tasks in the system use _some_ cgroup, and since there is always at
  * least one task in the system (init, pid == 1), therefore, root cgroup
  * always has either children cgroups and/or using tasks.  So we don't
  * need a special hack to ensure that root cgroup cannot be deleted.
  *
  * P.S.  One more locking exception.  RCU is used to guard the
  * update of a task's cgroup pointer by cgroup_attach_task().
  */
 /* declared ahead of use; rebind_subsystems() below calls it */
 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
 /* passed to kernfs_create_root() in cgroup_setup_root() */
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
 static const struct file_operations proc_cgroupstats_operations;
 /**
  * cgroup_file_name - build the file name of a control file
  * @cgrp: the cgroup the file belongs to
  * @cft: the control file
  * @buf: output buffer, at least CGROUP_FILE_NAME_MAX bytes long
  *
  * The name is "<subsys name>.<cft name>" when @cft belongs to a subsystem
  * and neither CFTYPE_NO_PREFIX on @cft nor CGRP_ROOT_NOPREFIX on the
  * hierarchy root is set; otherwise it is the bare @cft->name.  In both
  * cases the result is truncated to fit CGROUP_FILE_NAME_MAX and is always
  * NUL-terminated.
  *
  * Returns @buf.
  */
 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
 			      char *buf)
 {
 	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
 	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
 		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
 			 cft->ss->name, cft->name);
 	} else {
 		/*
 		 * snprintf() rather than strncpy(): the latter leaves @buf
 		 * unterminated when the source fills the whole buffer.
 		 */
 		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s", cft->name);
 	}

 	return buf;
 }
 /**
  * cgroup_file_mode - deduce file mode of a control file
  * @cft: the control file in question
  *
  * An explicit, non-zero @cft->mode always wins.  Otherwise the mode is
  * derived from the handlers that are present:
  *
  * S_IRUGO is included if there is a read handler (read_u64, read_s64 or
  * seq_show); S_IWUSR is included if there is a write handler (write_u64,
  * write_s64, write_string or trigger).  With no handler at all the result
  * is 0.
  */
 static umode_t cgroup_file_mode(const struct cftype *cft)
 {
 	umode_t mode;

 	if (cft->mode)
 		return cft->mode;

 	mode = 0;

 	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
 		mode |= S_IRUGO;

 	if (cft->write_u64 || cft->write_s64 ||
 	    cft->write_string || cft->trigger)
 		mode |= S_IWUSR;

 	return mode;
 }
 /*
  * Work function that performs the final teardown of a cgroup.  Drops the
  * cgroup from its root's ->nr_cgrps count and destroys its pidlists.  A
  * root cgroup then takes the whole hierarchy down with it; any other
  * cgroup releases its parent and kernfs node and is freed.
  */
 static void cgroup_free_fn(struct work_struct *work)
 {
 	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);

 	atomic_dec(&cgrp->root->nr_cgrps);
 	cgroup_pidlist_destroy_all(cgrp);

 	if (!cgrp->parent) {
 		/*
 		 * This is root cgroup's refcnt reaching zero, which
 		 * indicates that the root should be released.
 		 */
 		cgroup_destroy_root(cgrp->root);
 		return;
 	}

 	/*
 	 * Each cgroup holds a reference on its parent which is only put
 	 * here, when the cgroup is being freed, so a parent is guaranteed
 	 * not to be destroyed before its children.
 	 */
 	cgroup_put(cgrp->parent);
 	kernfs_put(cgrp->kn);
 	kfree(cgrp);
 }
 /*
  * RCU callback scheduled by cgroup_put().  It does not free anything
  * itself; it queues cgroup_free_fn() on cgroup_destroy_wq.  That function
  * can end up taking mutexes (via cgroup_destroy_root()), so it is run
  * from a work item.
  */
 static void cgroup_free_rcu(struct rcu_head *head)
 {
 	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
 	INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
 	queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
 }
 /*
  * Take a reference on @cgrp.  Warns once if @cgrp is already dead or if
  * its refcount is not positive, but increments the count regardless.
  */
 static void cgroup_get(struct cgroup *cgrp)
 {
 	WARN_ON_ONCE(cgroup_is_dead(cgrp));
 	WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
 	atomic_inc(&cgrp->refcnt);
 }
 /*
  * Drop a reference on @cgrp.  When the last reference goes away the
  * cgroup's ID is removed from its root's cgroup_idr and the actual
  * freeing is deferred through call_rcu() to cgroup_free_rcu().
  */
 static void cgroup_put(struct cgroup *cgrp)
 {
 	if (!atomic_dec_and_test(&cgrp->refcnt))
 		return;
 	/*
 	 * A non-root cgroup whose refcount hits zero is expected to be dead
 	 * already; if it is not, warn and bail out without freeing it.
 	 */
 	if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
 		return;
 	/*
 	 * XXX: cgrp->id is only used to look up css's.  As cgroup and
 	 * css's lifetimes will be decoupled, it should be made
 	 * per-subsystem and moved to css->id so that lookups are
 	 * successful until the target css is released.
 	 */
 	mutex_lock(&cgroup_mutex);
 	idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
 	mutex_unlock(&cgroup_mutex);
 	cgrp->id = -1;
 	call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
 }
 /*
  * Remove the kernfs file backing control file @cft from @cgrp's
  * directory.  Must be called with cgroup_tree_mutex held.
  */
 static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 {
 	char namebuf[CGROUP_FILE_NAME_MAX];
 	const char *fname;

 	lockdep_assert_held(&cgroup_tree_mutex);

 	fname = cgroup_file_name(cgrp, cft, namebuf);
 	kernfs_remove_by_name(cgrp->kn, fname);
 }
 /**
  * cgroup_clear_dir - remove subsys files in a cgroup directory
  * @cgrp: target cgroup
  * @subsys_mask: mask of the subsystem ids whose files should be removed
  *
  * For every subsystem whose bit is set in @subsys_mask, each cftype set
  * registered on the subsystem's ->cfts list is removed from @cgrp.
  */
 static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 {
 	struct cgroup_subsys *ss;
 	int ssid;

 	for_each_subsys(ss, ssid) {
 		struct cftype *cfts;

 		if (test_bit(ssid, &subsys_mask)) {
 			list_for_each_entry(cfts, &ss->cfts, node)
 				cgroup_addrm_files(cgrp, cfts, false);
 		}
 	}
 }
 /*
  * rebind_subsystems - move the subsystems in @ss_mask over to @dst_root
  * @dst_root: the hierarchy root the subsystems get bound to
  * @ss_mask: bitmask of the ids of the subsystems to move
  *
  * Must be called with cgroup_tree_mutex and cgroup_mutex held.  Note that
  * cgroup_mutex is dropped and retaken while the files of the moved
  * subsystems are removed from their source roots.
  *
  * Returns 0 on success.  Returns -EBUSY if a subsystem that currently
  * sits on a non-default root either has child cgroups under that root or
  * is asked to move to another non-default root.  A failure of
  * cgroup_populate_dir() is returned as is unless @dst_root is the default
  * root, in which case it is only warned about.
  */
 static int rebind_subsystems(struct cgroup_root *dst_root,
 			     unsigned long ss_mask)
 {
 	struct cgroup_subsys *ss;
 	int ssid, ret;
 	lockdep_assert_held(&cgroup_tree_mutex);
 	lockdep_assert_held(&cgroup_mutex);
 	/* first pass: only verify that every requested move is possible */
 	for_each_subsys(ss, ssid) {
 		if (!(ss_mask & (1 << ssid)))
 			continue;
 		/* if @ss is on the default root, we can always move it */
 		if (ss->root == &cgrp_dfl_root)
 			continue;
 		/* if @ss has non-root cgroups attached to it, can't move */
 		if (!list_empty(&ss->root->cgrp.children))
 			return -EBUSY;
 		/* can't move between two non-default roots either */
 		if (dst_root != &cgrp_dfl_root)
 			return -EBUSY;
 	}
 	ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask);
 	if (ret) {
 		if (dst_root != &cgrp_dfl_root)
 			return ret;
 		/*
 		 * Rebinding back to the default root is not allowed to
 		 * fail.  Using both default and non-default roots should
 		 * be rare.  Moving subsystems back and forth even more so.
 		 * Just warn about it and continue.
 		 */
 		if (cgrp_dfl_root_visible) {
 			pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n",
 				   ret, ss_mask);
 			pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n");
 		}
 	}
 	/*
 	 * Nothing can fail from this point on.  Remove files for the
 	 * removed subsystems and rebind each subsystem.
 	 */
 	mutex_unlock(&cgroup_mutex);
 	for_each_subsys(ss, ssid)
 		if (ss_mask & (1 << ssid))
 			cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
 	mutex_lock(&cgroup_mutex);
 	for_each_subsys(ss, ssid) {
 		struct cgroup_root *src_root;
 		struct cgroup_subsys_state *css;
 		if (!(ss_mask & (1 << ssid)))
 			continue;
 		src_root = ss->root;
 		css = cgroup_css(&src_root->cgrp, ss);
 		/* the source must have a css for @ss, the destination none */
 		WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
 		/* hand the root css over from the source to the destination */
 		RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
 		rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
 		ss->root = dst_root;
 		css->cgroup = &dst_root->cgrp;
 		src_root->cgrp.subsys_mask &= ~(1 << ssid);
 		dst_root->cgrp.subsys_mask |= 1 << ssid;
 		if (ss->bind)
 			ss->bind(css);
 	}
 	kernfs_activate(dst_root->cgrp.kn);
 	return 0;
 }
 static int cgroup_show_options(struct seq_file *seq,
 			       struct kernfs_root *kf_root)
 {
 	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
 	struct cgroup_subsys *ss;
 	int ssid;
 	for_each_subsys(ss, ssid)
 		if (root->cgrp.subsys_mask & (1 << ssid))
 			seq_printf(seq, ",%s", ss->name);
 	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
 		seq_puts(seq, ",sane_behavior");
 	if (root->flags & CGRP_ROOT_NOPREFIX)
 		seq_puts(seq, ",noprefix");
 	if (root->flags & CGRP_ROOT_XATTR)
 		seq_puts(seq, ",xattr");
 	spin_lock(&release_agent_path_lock);
 	if (strlen(root->release_agent_path))
 		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
 	spin_unlock(&release_agent_path_lock);
 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
 		seq_puts(seq, ",clone_children");
 	if (strlen(root->name))
 		seq_printf(seq, ",name=%s", root->name);
 	return 0;
 }
 /* Mount options as parsed by parse_cgroupfs_options(). */
 struct cgroup_sb_opts {
 	/* bitmask of the ids of the requested subsystems */
 	unsigned long subsys_mask;
 	/* CGRP_ROOT_* flags requested through options */
 	unsigned long flags;
 	/* "release_agent=" value; kstrndup()'ed, NULL if not given */
 	char *release_agent;
 	/* "clone_children" was given */
 	bool cpuset_clone_children;
 	/* "name=" value; kstrndup()'ed, NULL if not given */
 	char *name;
 	/* User explicitly requested empty subsystem */
 	bool none;
 };
 /*
  * parse_cgroupfs_options - convert a hierarchy specifier into @opts
  * @data: comma separated option string; it is split in place by strsep()
  * @opts: output; it is zeroed before parsing starts
  *
  * Converts the option string into a bitmask of subsystems and flags.
  * Call with cgroup_mutex held to protect the cgroup_subsys[] array.
  *
  * Returns 0 on success, -EINVAL on an empty, duplicate or conflicting
  * option, -ENOENT on a token that names no enabled subsystem, and
  * -ENOMEM when duplicating a string fails.
  *
  * @opts->release_agent and @opts->name are allocated here and may be set
  * even when an error is returned, so the caller has to kfree() both in
  * either case.
  */
 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 {
 	char *token, *o = data;
 	bool all_ss = false, one_ss = false;
 	unsigned long mask = (unsigned long)-1;
 	struct cgroup_subsys *ss;
 	int i;

 	BUG_ON(!mutex_is_locked(&cgroup_mutex));

 	/* @mask covers every subsystem except cpuset, see noprefix below */
 #ifdef CONFIG_CPUSETS
 	mask = ~(1UL << cpuset_cgrp_id);
 #endif

 	memset(opts, 0, sizeof(*opts));

 	while ((token = strsep(&o, ",")) != NULL) {
 		if (!*token)
 			return -EINVAL;
 		if (!strcmp(token, "none")) {
 			/* Explicitly have no subsystems */
 			opts->none = true;
 			continue;
 		}
 		if (!strcmp(token, "all")) {
 			/* Mutually exclusive option 'all' + subsystem name */
 			if (one_ss)
 				return -EINVAL;
 			all_ss = true;
 			continue;
 		}
 		if (!strcmp(token, "__DEVEL__sane_behavior")) {
 			opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
 			continue;
 		}
 		if (!strcmp(token, "noprefix")) {
 			opts->flags |= CGRP_ROOT_NOPREFIX;
 			continue;
 		}
 		if (!strcmp(token, "clone_children")) {
 			opts->cpuset_clone_children = true;
 			continue;
 		}
 		if (!strcmp(token, "xattr")) {
 			opts->flags |= CGRP_ROOT_XATTR;
 			continue;
 		}
 		if (!strncmp(token, "release_agent=", 14)) {
 			/* Specifying two release agents is forbidden */
 			if (opts->release_agent)
 				return -EINVAL;
 			opts->release_agent =
 				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
 			if (!opts->release_agent)
 				return -ENOMEM;
 			continue;
 		}
 		if (!strncmp(token, "name=", 5)) {
 			const char *name = token + 5;
 			const char *p;

 			/* Can't specify an empty name */
 			if (!*name)
 				return -EINVAL;
 			/*
 			 * Must match [\w.-]+.  Walk the string once rather
 			 * than calling strlen() on every iteration.
 			 */
 			for (p = name; *p; p++) {
 				char c = *p;

 				if (isalnum(c))
 					continue;
 				if ((c == '.') || (c == '-') || (c == '_'))
 					continue;
 				return -EINVAL;
 			}
 			/* Specifying two names is forbidden */
 			if (opts->name)
 				return -EINVAL;
 			opts->name = kstrndup(name,
 					      MAX_CGROUP_ROOT_NAMELEN - 1,
 					      GFP_KERNEL);
 			if (!opts->name)
 				return -ENOMEM;
 			continue;
 		}

 		/* Anything else has to be the name of an enabled subsystem */
 		for_each_subsys(ss, i) {
 			if (strcmp(token, ss->name))
 				continue;
 			if (ss->disabled)
 				continue;

 			/* Mutually exclusive option 'all' + subsystem name */
 			if (all_ss)
 				return -EINVAL;
 			set_bit(i, &opts->subsys_mask);
 			one_ss = true;

 			break;
 		}
 		/* the loop ran to completion: no subsystem matched */
 		if (i == CGROUP_SUBSYS_COUNT)
 			return -ENOENT;
 	}

 	/* Consistency checks */

 	if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
 		pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");

 		if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
 		    opts->cpuset_clone_children || opts->release_agent ||
 		    opts->name) {
 			pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
 			return -EINVAL;
 		}
 	} else {
 		/*
 		 * If the 'all' option was specified select all the
 		 * subsystems, otherwise if 'none', 'name=' and a subsystem
 		 * name options were not specified, let's default to 'all'
 		 */
 		if (all_ss || (!one_ss && !opts->none && !opts->name))
 			for_each_subsys(ss, i)
 				if (!ss->disabled)
 					set_bit(i, &opts->subsys_mask);

 		/*
 		 * We either have to specify by name or by subsystems. (So
 		 * all empty hierarchies must have a name).
 		 */
 		if (!opts->subsys_mask && !opts->name)
 			return -EINVAL;
 	}

 	/*
 	 * Option noprefix was introduced just for backward compatibility
 	 * with the old cpuset, so we allow noprefix only if mounting just
 	 * the cpuset subsystem.
 	 */
 	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
 		return -EINVAL;

 	/* Can't specify "none" and some subsystems */
 	if (opts->subsys_mask && opts->none)
 		return -EINVAL;

 	return 0;
 }
 /*
  * Remount handler: re-parse @data and apply what is allowed to change on
  * an existing hierarchy.
  *
  * Remounting is refused outright (-EINVAL) for sane_behavior hierarchies.
  * Flags covered by CGRP_ROOT_OPTION_MASK and the name must not change
  * (-EINVAL), and a hierarchy with child cgroups cannot be remounted
  * (-EBUSY).  Otherwise newly requested subsystems are bound to the root,
  * dropped ones are handed back to cgrp_dfl_root, and a new release agent
  * path, if given, is installed.
  *
  * Returns 0 on success or a negative errno.
  */
 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
 {
 	int ret = 0;
 	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
 	struct cgroup_sb_opts opts;
 	unsigned long added_mask, removed_mask;
 	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
 		pr_err("cgroup: sane_behavior: remount is not allowed\n");
 		return -EINVAL;
 	}
 	mutex_lock(&cgroup_tree_mutex);
 	mutex_lock(&cgroup_mutex);
 	/* See what subsystems are wanted */
 	ret = parse_cgroupfs_options(data, &opts);
 	if (ret)
 		goto out_unlock;
 	if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent)
 		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
 			   task_tgid_nr(current), current->comm);
 	/* subsystems requested but not bound yet / bound but not requested */
 	added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask;
 	removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask;
 	/* Don't allow flags or name to change at remount */
 	if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
 	    (opts.name && strcmp(opts.name, root->name))) {
 		pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
 		       opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
 		       root->flags & CGRP_ROOT_OPTION_MASK, root->name);
 		ret = -EINVAL;
 		goto out_unlock;
 	}
 	/* remounting is not allowed for populated hierarchies */
 	if (!list_empty(&root->cgrp.children)) {
 		ret = -EBUSY;
 		goto out_unlock;
 	}
 	ret = rebind_subsystems(root, added_mask);
 	if (ret)
 		goto out_unlock;
 	/* the return value is not checked for the move back to the default root */
 	rebind_subsystems(&cgrp_dfl_root, removed_mask);
 	if (opts.release_agent) {
 		spin_lock(&release_agent_path_lock);
 		strcpy(root->release_agent_path, opts.release_agent);
 		spin_unlock(&release_agent_path_lock);
 	}
  out_unlock:
 	/* the parsed strings are freed on success and on failure alike */
 	kfree(opts.release_agent);
 	kfree(opts.name);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgroup_tree_mutex);
 	return ret;
 }
 /*
  * To reduce the fork() overhead for systems that are not actually using
  * their cgroups capability, we don't maintain the lists running through
  * each css_set to its tasks until we see the list actually used - in other
  * words after the first mount.
  *
  * The flag is set by cgroup_enable_task_cg_lists() under css_set_rwsem.
  */
 static bool use_task_css_set_links __read_mostly;
 /*
  * Start maintaining the per-css_set task lists.  On the first call, sets
  * use_task_css_set_links and walks every thread in the system, putting
  * each one that is not exiting on its css_set's ->tasks list and taking a
  * css_set reference for it.  Later calls find the flag already set and
  * return without doing anything.
  */
 static void cgroup_enable_task_cg_lists(void)
 {
 	struct task_struct *p, *g;
 	down_write(&css_set_rwsem);
 	if (use_task_css_set_links)
 		goto out_unlock;
 	use_task_css_set_links = true;
 	/*
 	 * We need tasklist_lock because RCU is not safe against
 	 * while_each_thread(). Besides, a forking task that has passed
 	 * cgroup_post_fork() without seeing use_task_css_set_links = 1
 	 * is not guaranteed to have its child immediately visible in the
 	 * tasklist if we walk through it with RCU.
 	 */
 	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
 		/* no task is expected to be linked or off init_css_set yet */
 		WARN_ON_ONCE(!list_empty(&p->cg_list) ||
 			     task_css_set(p) != &init_css_set);
 		/*
 		 * We should check if the process is exiting, otherwise
 		 * it will race with cgroup_exit() in that the list
 		 * entry won't be deleted though the process has exited.
 		 * Do it while holding siglock so that we don't end up
 		 * racing against cgroup_exit().
 		 */
 		spin_lock_irq(&p->sighand->siglock);
 		if (!(p->flags & PF_EXITING)) {
 			struct css_set *cset = task_css_set(p);
 			list_add(&p->cg_list, &cset->tasks);
 			get_css_set(cset);
 		}
 		spin_unlock_irq(&p->sighand->siglock);
 	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
 out_unlock:
 	up_write(&css_set_rwsem);
 }
 /*
  * Initialize the bookkeeping members of a freshly allocated cgroup: the
  * reference count (starting at 1), all list heads, the pidlist mutex and
  * the back pointer of the embedded dummy css.
  */
 static void init_cgroup_housekeeping(struct cgroup *cgrp)
 {
 	atomic_set(&cgrp->refcnt, 1);

 	/* position in the tree */
 	INIT_LIST_HEAD(&cgrp->sibling);
 	INIT_LIST_HEAD(&cgrp->children);

 	/* css_set links and release handling */
 	INIT_LIST_HEAD(&cgrp->cset_links);
 	INIT_LIST_HEAD(&cgrp->release_list);

 	/* pidlists and the mutex guarding them */
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);

 	cgrp->dummy_css.cgroup = cgrp;
 }
 /*
  * Initialize @root and its embedded root cgroup from the parsed mount
  * options in @opts.  The release agent path and the name are copied only
  * when they were given; the clone_children option is recorded as a flag
  * on the root cgroup.
  */
 static void init_cgroup_root(struct cgroup_root *root,
 			     struct cgroup_sb_opts *opts)
 {
 	struct cgroup *top = &root->cgrp;

 	INIT_LIST_HEAD(&root->root_list);
 	/* the root cgroup itself accounts for one */
 	atomic_set(&root->nr_cgrps, 1);

 	top->root = root;
 	init_cgroup_housekeeping(top);
 	idr_init(&root->cgroup_idr);

 	root->flags = opts->flags;

 	if (opts->release_agent)
 		strcpy(root->release_agent_path, opts->release_agent);

 	if (opts->name)
 		strcpy(root->name, opts->name);

 	if (opts->cpuset_clone_children)
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
 }
 /*
  * cgroup_setup_root - bring up a new hierarchy
  * @root: the initialized cgroup_root to set up
  * @ss_mask: bitmask of the ids of the subsystems to bind to @root
  *
  * Allocates the root cgroup's ID and a hierarchy ID, creates the kernfs
  * root, adds the base files, binds the subsystems in @ss_mask, puts @root
  * on cgroup_roots and links its root cgroup into every css_set in
  * css_set_table.
  *
  * Must be called with cgroup_tree_mutex and cgroup_mutex held.  Returns 0
  * on success or a negative errno; on failure the kernfs root and the
  * hierarchy ID are released again as far as they had been set up.
  */
 static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 {
 	LIST_HEAD(tmp_links);
 	struct cgroup *root_cgrp = &root->cgrp;
 	struct css_set *cset;
 	int i, ret;
 	lockdep_assert_held(&cgroup_tree_mutex);
 	lockdep_assert_held(&cgroup_mutex);
 	/* the [0, 1) range means the root cgroup can only get ID 0 */
 	ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
 	if (ret < 0)
 		goto out;
 	root_cgrp->id = ret;
 	/*
 	 * We're accessing css_set_count without locking css_set_rwsem here,
 	 * but that's OK - it can only be increased by someone holding
 	 * cgroup_lock, and that's us. The worst that can happen is that we
 	 * have some link structures left over
 	 */
 	ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
 	if (ret)
 		goto out;
 	ret = cgroup_init_root_id(root);
 	if (ret)
 		goto out;
 	root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
 					   KERNFS_ROOT_CREATE_DEACTIVATED,
 					   root_cgrp);
 	if (IS_ERR(root->kf_root)) {
 		ret = PTR_ERR(root->kf_root);
 		goto exit_root_id;
 	}
 	root_cgrp->kn = root->kf_root->kn;
 	ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
 	if (ret)
 		goto destroy_root;
 	ret = rebind_subsystems(root, ss_mask);
 	if (ret)
 		goto destroy_root;
 	/*
 	 * There must be no failure case after here, since rebinding takes
 	 * care of subsystems' refcounts, which are explicitly dropped in
 	 * the failure exit path.
 	 */
 	list_add(&root->root_list, &cgroup_roots);
 	cgroup_root_count++;
 	/*
 	 * Link the root cgroup in this hierarchy into all the css_set
 	 * objects.
 	 */
 	down_write(&css_set_rwsem);
 	hash_for_each(css_set_table, i, cset, hlist)
 		link_css_set(&tmp_links, cset, root_cgrp);
 	up_write(&css_set_rwsem);
 	BUG_ON(!list_empty(&root_cgrp->children));
 	BUG_ON(atomic_read(&root->nr_cgrps) != 1);
 	kernfs_activate(root_cgrp->kn);
 	ret = 0;
 	/* success also passes through "out" to free any leftover links */
 	goto out;
 destroy_root:
 	kernfs_destroy_root(root->kf_root);
 	root->kf_root = NULL;
 exit_root_id:
 	cgroup_exit_root_id(root);
 out:
 	free_cgrp_cset_links(&tmp_links);
 	return ret;
 }
 /*
  * Mount callback of cgroup_fs_type.  Parses the option string in @data,
  * then either selects the default hierarchy, reuses a matching existing
  * root or creates a new one, and finally hands the root's kernfs root to
  * kernfs_mount().  Returns the dentry from kernfs_mount() or an ERR_PTR().
  */
 static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 			 int flags, const char *unused_dev_name,
 			 void *data)
 {
 	struct cgroup_root *root;
 	struct cgroup_sb_opts opts;
 	struct dentry *dentry;
 	int ret;
 	bool new_sb;
 	/*
 	 * The first time anyone tries to mount a cgroup, enable the list
 	 * linking each css_set to its tasks and fix up all existing tasks.
 	 */
 	if (!use_task_css_set_links)
 		cgroup_enable_task_cg_lists();
 	mutex_lock(&cgroup_tree_mutex);
 	mutex_lock(&cgroup_mutex);
 	/* First find the desired set of subsystems */
 	ret = parse_cgroupfs_options(data, &opts);
 	if (ret)
 		goto out_unlock;
 retry:
 	/* look for a matching existing root */
 	/* no subsystems, no "none" and no name selects the default hierarchy */
 	if (!opts.subsys_mask && !opts.none && !opts.name) {
 		cgrp_dfl_root_visible = true;
 		root = &cgrp_dfl_root;
 		cgroup_get(&root->cgrp);
 		ret = 0;
 		goto out_unlock;
 	}
 	for_each_root(root) {
 		bool name_match = false;
 		if (root == &cgrp_dfl_root)
 			continue;
 		/*
 		 * If we asked for a name then it must match.  Also, if
 		 * name matches but subsys_mask doesn't, we should fail.
 		 * Remember whether name matched.
 		 */
 		if (opts.name) {
 			if (strcmp(opts.name, root->name))
 				continue;
 			name_match = true;
 		}
 		/*
 		 * If we asked for subsystems (or explicitly for no
 		 * subsystems) then they must match.
 		 */
 		if ((opts.subsys_mask || opts.none) &&
 		    (opts.subsys_mask != root->cgrp.subsys_mask)) {
 			if (!name_match)
 				continue;
 			ret = -EBUSY;
 			goto out_unlock;
 		}
 		/* option flags differ from those of the existing root */
 		if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
 			if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
 				pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
 				ret = -EINVAL;
 				goto out_unlock;
 			} else {
 				pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
 			}
 		}
 		/*
 		 * A root's lifetime is governed by its root cgroup.  Zero
 		 * ref indicate that the root is being destroyed.  Wait for
 		 * destruction to complete so that the subsystems are free.
 		 * We can use wait_queue for the wait but this path is
 		 * super cold.  Let's just sleep for a bit and retry.
 		 */
 		if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
 			mutex_unlock(&cgroup_mutex);
 			mutex_unlock(&cgroup_tree_mutex);
 			msleep(10);
 			mutex_lock(&cgroup_tree_mutex);
 			mutex_lock(&cgroup_mutex);
 			goto retry;
 		}
 		ret = 0;
 		goto out_unlock;
 	}
 	/*
 	 * No such thing, create a new one.  name= matching without subsys
 	 * specification is allowed for already existing hierarchies but we
 	 * can't create new one without subsys specification.
 	 */
 	if (!opts.subsys_mask && !opts.none) {
 		ret = -EINVAL;
 		goto out_unlock;
 	}
 	root = kzalloc(sizeof(*root), GFP_KERNEL);
 	if (!root) {
 		ret = -ENOMEM;
 		goto out_unlock;
 	}
 	init_cgroup_root(root, &opts);
 	ret = cgroup_setup_root(root, opts.subsys_mask);
 	if (ret)
 		cgroup_free_root(root);
 out_unlock:
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgroup_tree_mutex);
 	/* the option strings are freed on every path, success included */
 	kfree(opts.release_agent);
 	kfree(opts.name);
 	if (ret)
 		return ERR_PTR(ret);
-	dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb);
+	dentry = kernfs_mount(fs_type, flags, root->kf_root,
+				CGROUP_SUPER_MAGIC, &new_sb);
 	/* drop the root reference if the mount failed or no new sb was made */
 	if (IS_ERR(dentry) || !new_sb)
 		cgroup_put(&root->cgrp);
 	return dentry;
 }
 /*
  * kill_sb callback: drop a reference on the root cgroup of the hierarchy
  * behind @sb, then let kernfs tear the superblock down.
  */
 static void cgroup_kill_sb(struct super_block *sb)
 {
 	struct cgroup_root *root;
 	/* resolve sb -> kernfs root -> cgroup root before the sb goes away */
 	root = cgroup_root_from_kf(kernfs_root_from_sb(sb));
 	cgroup_put(&root->cgrp);
 	kernfs_kill_sb(sb);
 }
 /* the "cgroup" filesystem type; mounts and superblock kills go through the callbacks below */
 static struct file_system_type cgroup_fs_type = {
 	.name = "cgroup",
 	.mount = cgroup_mount,
 	.kill_sb = cgroup_kill_sb,
 };
 /* NOTE(review): not used in this part of the file; presumably the kobject the cgroup mount point hangs off - confirm */
 static struct kobject *cgroup_kobj;
 /**
  * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
  * @task: target task
  * @buf: the buffer to write the path into
  * @buflen: the length of the buffer
  *
  * Determine @task's cgroup on the first (the one with the lowest non-zero
  * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
  * function grabs cgroup_mutex and shouldn't be used inside locks used by
  * cgroup controller callbacks.
  *
  * Return value is the same as kernfs_path().
  */
 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 {
 	struct cgroup_root *first_root;
 	char *result = NULL;
 	/* ids start at 1 here, so the lookup begins at hierarchy_id 1 */
 	int id = 1;
 	mutex_lock(&cgroup_mutex);
 	down_read(&css_set_rwsem);
 	first_root = idr_get_next(&cgroup_hierarchy_idr, &id);
 	if (!first_root) {
 		/* if no hierarchy exists, everyone is in "/" */
 		if (strlcpy(buf, "/", buflen) < buflen)
 			result = buf;
 	} else {
 		/* resolve @task's cgroup on that hierarchy and format its path */
 		result = cgroup_path(task_cgroup_from_root(task, first_root),
 				     buf, buflen);
 	}
 	up_read(&css_set_rwsem);
 	mutex_unlock(&cgroup_mutex);
 	return result;
 }
 EXPORT_SYMBOL_GPL(task_cgroup_path);
 /* used to track tasks and other necessary states during migration */
 struct cgroup_taskset {
 	/* the src and dst cset list running through cset->mg_node */
 	struct list_head	src_csets;
 	struct list_head	dst_csets;
 	/*
 	 * Fields for cgroup_taskset_*() iteration.
 	 *
 	 * Before migration is committed, the target migration tasks are on
 	 * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
 	 * the csets on ->dst_csets.  ->csets point to either ->src_csets
 	 * or ->dst_csets depending on whether migration is committed.
 	 *
 	 * ->cur_cset and ->cur_task point to the current task position
 	 * during iteration.
 	 */
 	struct list_head	*csets;
 	struct css_set		*cur_cset;
 	struct task_struct	*cur_task;
 };
 /**
  * cgroup_taskset_first - reset taskset and return the first task
  * @tset: taskset of interest
  *
  * @tset iteration is initialized and the first task is returned.
  */
 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
 {
 	/* rewind: no current task, positioned on the first cset of ->csets */
 	tset->cur_task = NULL;
 	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
 	/* the regular iterator now yields the first task, if there is one */
 	return cgroup_taskset_next(tset);
 }
 /**
  * cgroup_taskset_next - iterate to the next task in taskset
  * @tset: taskset of interest
  *
  * Return the next task in @tset.  Iteration must have been initialized
  * with cgroup_taskset_first().
  */
 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
 {
 	struct css_set *cset = tset->cur_cset;
 	struct task_struct *pos = tset->cur_task;
 	/*
 	 * Walk the csets starting from the saved position until the list
 	 * head is reached.  Moving to the next cset restarts the task
 	 * walk from the beginning of its ->mg_tasks.
 	 */
 	for (; &cset->mg_node != tset->csets;
 	     cset = list_next_entry(cset, mg_node), pos = NULL) {
 		if (pos)
 			pos = list_next_entry(pos, cg_list);
 		else
 			pos = list_first_entry(&cset->mg_tasks,
 					       struct task_struct, cg_list);
 		/* ran off the end of this cset's tasks, try the next cset */
 		if (&pos->cg_list == &cset->mg_tasks)
 			continue;
 		/* remember where we are for the next call */
 		tset->cur_cset = cset;
 		tset->cur_task = pos;
 		return pos;
 	}
 	return NULL;
 }
 /**
  * cgroup_task_migrate - move a task from one cgroup to another.
  * @old_cgrp: the cgroup @tsk is being migrated from
  * @tsk: the task being migrated
  * @new_cset: the new css_set @tsk is being attached to
  *
  * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
  */
 static void cgroup_task_migrate(struct cgroup *old_cgrp,
 				struct task_struct *tsk,
 				struct css_set *new_cset)
 {
 	struct css_set *old_cset;
 	lockdep_assert_held(&cgroup_mutex);
 	lockdep_assert_held(&css_set_rwsem);
 	/*
 	 * We are synchronized through threadgroup_lock() against PF_EXITING
 	 * setting such that we can't race against cgroup_exit() changing the
 	 * css_set to init_css_set and dropping the old one.
 	 */
 	WARN_ON_ONCE(tsk->flags & PF_EXITING);
 	/* remember the old css_set before tsk->cgroups is switched over */
 	old_cset = task_css_set(tsk);
 	/* take the reference for @tsk before publishing the new pointer */
 	get_css_set(new_cset);
 	rcu_assign_pointer(tsk->cgroups, new_cset);
 	/*
 	 * Use move_tail so that cgroup_taskset_first() still returns the
 	 * leader after migration.  This works because cgroup_migrate()
 	 * ensures that the dst_cset of the leader is the first on the
 	 * tset's dst_csets list.
 	 */
 	list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
 	/*
 	 * We just gained a reference on old_cset by taking it from the
 	 * task. As trading it for new_cset is protected by cgroup_mutex,
 	 * we're safe to drop it here; it will be freed under RCU.
 	 */
 	set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
 	put_css_set_locked(old_cset, false);
 }
 /**
  * cgroup_migrate_finish - cleanup after attach
  * @preloaded_csets: list of preloaded css_sets
  *
  * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
  * those functions for details.
  */
 static void cgroup_migrate_finish(struct list_head *preloaded_csets)
 {
 	struct css_set *pos, *next;
 	lockdep_assert_held(&cgroup_mutex);
 	down_write(&css_set_rwsem);
 	/* the _safe walk is needed because every entry is unlinked */
 	list_for_each_entry_safe(pos, next, preloaded_csets, mg_preload_node) {
 		list_del_init(&pos->mg_preload_node);
 		/* wipe the per-migration state */
 		pos->mg_dst_cset = NULL;
 		pos->mg_src_cgrp = NULL;
 		/* drop the reference that kept the css_set preloaded */
 		put_css_set_locked(pos, false);
 	}
 	up_write(&css_set_rwsem);
 }
 /**
  * cgroup_migrate_add_src - add a migration source css_set
  * @src_cset: the source css_set to add
  * @dst_cgrp: the destination cgroup
  * @preloaded_csets: list of preloaded css_sets
  *
  * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
  * @src_cset and add it to @preloaded_csets, which should later be cleaned
  * up by cgroup_migrate_finish().
  *
  * This function may be called without holding threadgroup_lock even if the
  * target is a process.  Threads may be created and destroyed but as long
  * as cgroup_mutex is not dropped, no new css_set can be put into play and
  * the preloaded css_sets are guaranteed to cover all migrations.
  */
 static void cgroup_migrate_add_src(struct css_set *src_cset,
 				   struct cgroup *dst_cgrp,
 				   struct list_head *preloaded_csets)
 {
 	struct cgroup *cur_cgrp;
 	lockdep_assert_held(&cgroup_mutex);
 	lockdep_assert_held(&css_set_rwsem);
 	/* the cgroup @src_cset currently has on the destination hierarchy */
 	cur_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
 	/*
 	 * Nothing to do if the cset already belongs to the destination
 	 * cgroup or has already been put on a preload list.
 	 */
 	if (cur_cgrp == dst_cgrp || !list_empty(&src_cset->mg_preload_node))
 		return;
 	/* a cset that isn't preloaded must not carry migration state */
 	WARN_ON(src_cset->mg_src_cgrp);
 	WARN_ON(!list_empty(&src_cset->mg_tasks));
 	WARN_ON(!list_empty(&src_cset->mg_node));
 	/* record the source cgroup, pin the cset and queue it */
 	src_cset->mg_src_cgrp = cur_cgrp;
 	get_css_set(src_cset);
 	list_add(&src_cset->mg_preload_node, preloaded_csets);
 }
 /**
  * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
  * @dst_cgrp: the destination cgroup
  * @preloaded_csets: list of preloaded source css_sets
  *
  * Tasks are about to be moved to @dst_cgrp and all the source css_sets
  * have been preloaded to @preloaded_csets.  This function looks up and
  * pins all destination css_sets, links each to its source, and put them on
  * @preloaded_csets.
  *
  * This function must be called after cgroup_migrate_add_src() has been
  * called on each migration source css_set.  After migration is performed
  * using cgroup_migrate(), cgroup_migrate_finish() must be called on
  * @preloaded_csets.
  */
 static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
 				      struct list_head *preloaded_csets)
 {
 	LIST_HEAD(csets);
 	struct css_set *src_cset;
 	lockdep_assert_held(&cgroup_mutex);
 	/* look up the dst cset for each src cset and link it to src */
 	list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) {
 		struct css_set *dst_cset;
 		dst_cset = find_css_set(src_cset, dst_cgrp);
 		if (!dst_cset)
 			goto err;
 		WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
 		src_cset->mg_dst_cset = dst_cset;
 		if (list_empty(&dst_cset->mg_preload_node))
 			list_add(&dst_cset->mg_preload_node, &csets);
 		else
 			put_css_set(dst_cset, false);
 	}
 	list_splice(&csets, preloaded_csets);
 	return 0;
 err:
 	cgroup_migrate_finish(&csets);
 	return -ENOMEM;
 }
 /**
  * cgroup_migrate - migrate a process or task to a cgroup
  * @cgrp: the destination cgroup
  * @leader: the leader of the process or the task to migrate
  * @threadgroup: whether @leader points to the whole process or a single task
  *
  * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
  * process, the caller must be holding threadgroup_lock of @leader.  The
  * caller is also responsible for invoking cgroup_migrate_add_src() and
  * cgroup_migrate_prepare_dst() on the targets before invoking this
  * function and following up with cgroup_migrate_finish().
  *
  * As long as a controller's ->can_attach() doesn't fail, this function is
  * guaranteed to succeed.  This means that, excluding ->can_attach()
  * failure, when migrating multiple targets, the success or failure can be
  * decided for all targets by invoking cgroup_migrate_prepare_dst() before
  * actually starting migrating.
  */
 static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
 			  bool threadgroup)
 {
 	/* iteration starts out over the source csets; switched after commit */
 	struct cgroup_taskset tset = {
 		.src_csets	= LIST_HEAD_INIT(tset.src_csets),
 		.dst_csets	= LIST_HEAD_INIT(tset.dst_csets),
 		.csets		= &tset.src_csets,
 	};
 	struct cgroup_subsys_state *css, *failed_css = NULL;
 	struct css_set *cset, *tmp_cset;
 	struct task_struct *task, *tmp_task;
 	int i, ret;
 	/*
 	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
 	 * already PF_EXITING could be freed from underneath us unless we
 	 * take an rcu_read_lock.
 	 */
 	down_write(&css_set_rwsem);
 	rcu_read_lock();
 	task = leader;
 	do {
 		/* @task either already exited or can't exit until the end */
 		if (task->flags & PF_EXITING)
 			goto next;
 		/* leave @task alone if post_fork() hasn't linked it yet */
 		if (list_empty(&task->cg_list))
 			goto next;
 		cset = task_css_set(task);
 		/* csets without ->mg_src_cgrp were not set up as sources */
 		if (!cset->mg_src_cgrp)
 			goto next;
 		/*
 		 * cgroup_taskset_first() must always return the leader.
 		 * Take care to avoid disturbing the ordering.
 		 */
 		list_move_tail(&task->cg_list, &cset->mg_tasks);
 		if (list_empty(&cset->mg_node))
 			list_add_tail(&cset->mg_node, &tset.src_csets);
 		if (list_empty(&cset->mg_dst_cset->mg_node))
 			list_move_tail(&cset->mg_dst_cset->mg_node,
 				       &tset.dst_csets);
 	next:
 		/* a single-task migration only ever looks at @leader */
 		if (!threadgroup)
 			break;
 	} while_each_thread(leader, task);
 	rcu_read_unlock();
 	up_write(&css_set_rwsem);
 	/* methods shouldn't be called if no task is actually migrating */
 	if (list_empty(&tset.src_csets))
 		return 0;
 	/* check that we can legitimately attach to the cgroup */
 	for_each_css(css, i, cgrp) {
 		if (css->ss->can_attach) {
 			ret = css->ss->can_attach(css, &tset);
 			if (ret) {
 				failed_css = css;
 				goto out_cancel_attach;
 			}
 		}
 	}
 	/*
 	 * Now that we're guaranteed success, proceed to move all tasks to
 	 * the new cgroup.  There are no failure cases after here, so this
 	 * is the commit point.
 	 */
 	down_write(&css_set_rwsem);
 	list_for_each_entry(cset, &tset.src_csets, mg_node) {
 		/* _safe: cgroup_task_migrate() moves @task off this list */
 		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
 			cgroup_task_migrate(cset->mg_src_cgrp, task,
 					    cset->mg_dst_cset);
 	}
 	up_write(&css_set_rwsem);
 	/*
 	 * Migration is committed, all target tasks are now on dst_csets.
 	 * Nothing is sensitive to fork() after this point.  Notify
 	 * controllers that migration is complete.
 	 */
 	tset.csets = &tset.dst_csets;
 	for_each_css(css, i, cgrp)
 		if (css->ss->attach)
 			css->ss->attach(css, &tset);
 	ret = 0;
 	goto out_release_tset;
 out_cancel_attach:
 	/* only controllers visited before the failing one are cancelled */
 	for_each_css(css, i, cgrp) {
 		if (css == failed_css)
 			break;
 		if (css->ss->cancel_attach)
 			css->ss->cancel_attach(css, &tset);
 	}
 out_release_tset:
 	/*
 	 * Dissolve the taskset: merge both cset lists, put every task on
 	 * ->mg_tasks back onto ->tasks and unlink the csets.
 	 */
 	down_write(&css_set_rwsem);
 	list_splice_init(&tset.dst_csets, &tset.src_csets);
 	list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
 		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
 		list_del_init(&cset->mg_node);
 	}
 	up_write(&css_set_rwsem);
 	return ret;
 }
 /**
  * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
  * @dst_cgrp: the cgroup to attach to
  * @leader: the task or the leader of the threadgroup to be attached
  * @threadgroup: attach the whole threadgroup?
  *
  * Call holding cgroup_mutex and threadgroup_lock of @leader.
  */
 static int cgroup_attach_task(struct cgroup *dst_cgrp,
 			      struct task_struct *leader, bool threadgroup)
 {
 	LIST_HEAD(preloaded);
 	struct task_struct *t = leader;
 	int err;
 	/* look up all src csets */
 	down_read(&css_set_rwsem);
 	rcu_read_lock();
 	do {
 		cgroup_migrate_add_src(task_css_set(t), dst_cgrp, &preloaded);
 		/* without @threadgroup only @leader itself is considered */
 		if (!threadgroup)
 			break;
 	} while_each_thread(leader, t);
 	rcu_read_unlock();
 	up_read(&css_set_rwsem);
 	/* prepare dst csets and commit */
 	err = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded);
 	if (err == 0)
 		err = cgroup_migrate(dst_cgrp, leader, threadgroup);
 	/* the preloaded csets are released whether or not migration ran */
 	cgroup_migrate_finish(&preloaded);
 	return err;
 }
 /*
  * Find the task_struct of the task to attach by vpid and pass it along to the
  * function to attach either it or all tasks in its threadgroup. Will lock
  * cgroup_mutex and threadgroup.
  */
 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 {
 	struct task_struct *tsk;
 	const struct cred *cred = current_cred(), *tcred;
 	int ret;
 	/* on success cgroup_mutex is held; it is released at out_unlock_cgroup */
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
 retry_find_task:
 	rcu_read_lock();
 	/* a @pid of zero means the calling task */
 	if (pid) {
 		tsk = find_task_by_vpid(pid);
 		if (!tsk) {
 			rcu_read_unlock();
 			ret = -ESRCH;
 			goto out_unlock_cgroup;
 		}
 		/*
 		 * even if we're attaching all tasks in the thread group, we
 		 * only need to check permissions on one of them.
 		 */
 		tcred = __task_cred(tsk);
 		if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
 		    !uid_eq(cred->euid, tcred->uid) &&
 		    !uid_eq(cred->euid, tcred->suid)) {
 			rcu_read_unlock();
 			ret = -EACCES;
 			goto out_unlock_cgroup;
 		}
 	} else
 		tsk = current;
 	/* whole-process attaches operate on the group leader */
 	if (threadgroup)
 		tsk = tsk->group_leader;
 	/*
 	 * Workqueue threads may acquire PF_NO_SETAFFINITY and become
 	 * trapped in a cpuset, or RT worker may be born in a cgroup
 	 * with no rt_runtime allocated.  Just say no.
 	 */
 	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
 		ret = -EINVAL;
 		rcu_read_unlock();
 		goto out_unlock_cgroup;
 	}
 	/* pin @tsk before leaving the RCU read-side section */
 	get_task_struct(tsk);
 	rcu_read_unlock();
 	threadgroup_lock(tsk);
 	if (threadgroup) {
 		if (!thread_group_leader(tsk)) {
 			/*
 			 * a race with de_thread from another thread's exec()
 			 * may strip us of our leadership, if this happens,
 			 * there is no choice but to throw this task away and
 			 * try again; this is
 			 * "double-double-toil-and-trouble-check locking".
 			 */
 			threadgroup_unlock(tsk);
 			put_task_struct(tsk);
 			goto retry_find_task;
 		}
 	}
 	ret = cgroup_attach_task(cgrp, tsk, threadgroup);
 	threadgroup_unlock(tsk);
 	put_task_struct(tsk);
 out_unlock_cgroup:
 	mutex_unlock(&cgroup_mutex);
 	return ret;
 }
 /**
  * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
  * @from: attach to all cgroups of a given task
  * @tsk: the task to be attached
  */
 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 {
 	struct cgroup_root *root;
 	int err = 0;
 	mutex_lock(&cgroup_mutex);
 	for_each_root(root) {
 		struct cgroup *dst;
 		/* every hierarchy except the default one is processed */
 		if (root != &cgrp_dfl_root) {
 			/* find @from's cgroup on this hierarchy */
 			down_read(&css_set_rwsem);
 			dst = task_cgroup_from_root(from, root);
 			up_read(&css_set_rwsem);
 			/* move only @tsk itself; stop at the first failure */
 			err = cgroup_attach_task(dst, tsk, false);
 			if (err)
 				break;
 		}
 	}
 	mutex_unlock(&cgroup_mutex);
 	return err;
 }
 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
 /* write_u64 style handler: attach the single task @pid to @css's cgroup */
 static int cgroup_tasks_write(struct cgroup_subsys_state *css,
 			      struct cftype *cft, u64 pid)
 {
 	struct cgroup *cgrp = css->cgroup;
 	return attach_task_by_pid(cgrp, pid, false);
 }
 /* write_u64 style handler: attach the whole thread group @tgid to @css's cgroup */
 static int cgroup_procs_write(struct cgroup_subsys_state *css,
 			      struct cftype *cft, u64 tgid)
 {
 	struct cgroup *cgrp = css->cgroup;
 	return attach_task_by_pid(cgrp, tgid, true);
 }
 static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
 				      struct cftype *cft, char *buffer)
 {
 	struct cgroup_root *root = css->cgroup->root;
 	BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX);
 	if (!cgroup_lock_live_group(css->cgroup))
 		return -ENODEV;
 	spin_lock(&release_agent_path_lock);
 	strlcpy(root->release_agent_path, buffer,
 		sizeof(root->release_agent_path));
 	spin_unlock(&release_agent_path_lock);
 	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
 /*
  * Print the hierarchy's release agent path followed by a newline.
  * Returns -ENODEV if the cgroup could not be locked live, else 0.
  */
 static int cgroup_release_agent_show(struct seq_file *seq, void *v)
 {
 	struct cgroup *cgrp = seq_css(seq)->cgroup;
 	const char *agent_path;
 	/* on success cgroup_mutex is held; it is released below */
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
 	agent_path = cgrp->root->release_agent_path;
 	seq_puts(seq, agent_path);
 	seq_putc(seq, '\n');
 	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
 /* print the result of cgroup_sane_behavior() for this file's cgroup as a decimal */
 static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
 {
 	seq_printf(seq, "%d\n", cgroup_sane_behavior(seq_css(seq)->cgroup));
 	return 0;
 }
 /*
  * kernfs write handler shared by all cgroup files.  Dispatches @buf to
  * whichever write method the cftype provides, tried in the order
  * write_string, write_u64, write_s64, trigger.  Returns @nbytes on
  * success, the method's error otherwise, or -EINVAL if the cftype has no
  * write method at all.
  */
 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
 				 size_t nbytes, loff_t off)
 {
 	struct cftype *cft = of->kn->priv;
 	struct cgroup *cgrp = of->kn->parent->priv;
 	struct cgroup_subsys_state *css;
 	int err;
 	/*
 	 * kernfs guarantees that a file isn't deleted with operations in
 	 * flight, which means that the matching css is and stays alive and
 	 * doesn't need to be pinned.  The RCU locking is not necessary
 	 * either.  It's just for the convenience of using cgroup_css().
 	 */
 	rcu_read_lock();
 	css = cgroup_css(cgrp, cft->ss);
 	rcu_read_unlock();
 	if (cft->write_string) {
 		/* string handlers get the stripped buffer */
 		err = cft->write_string(css, cft, strstrip(buf));
 	} else if (cft->write_u64) {
 		unsigned long long val;
 		err = kstrtoull(buf, 0, &val);
 		if (err == 0)
 			err = cft->write_u64(css, cft, val);
 	} else if (cft->write_s64) {
 		long long val;
 		err = kstrtoll(buf, 0, &val);
 		if (err == 0)
 			err = cft->write_s64(css, cft, val);
 	} else if (cft->trigger) {
 		/* triggers ignore the written data */
 		err = cft->trigger(css, (unsigned int)cft->private);
 	} else {
 		err = -EINVAL;
 	}
 	if (err)
 		return err;
 	return nbytes;
 }
 /* seq_file ->start: forward to the cftype's own seq_start() */
 static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
 {
 	struct cftype *cft = seq_cft(seq);
 	return cft->seq_start(seq, ppos);
 }
 /* seq_file ->next: forward to the cftype's own seq_next() */
 static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
 {
 	struct cftype *cft = seq_cft(seq);
 	return cft->seq_next(seq, v, ppos);
 }
 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
 {
 	seq_cft(seq)->seq_stop(seq, v);
 }
 /*
  * seq_file ->show shared by all cgroup files.  A cftype's own seq_show()
  * takes precedence; otherwise the value from read_u64() or read_s64() is
  * printed on a line of its own.  Returns -EINVAL if the cftype provides
  * none of the three.
  */
 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
 {
 	struct cftype *cft = seq_cft(m);
 	struct cgroup_subsys_state *css = seq_css(m);
 	if (cft->seq_show)
 		return cft->seq_show(m, arg);
 	if (cft->read_u64) {
 		seq_printf(m, "%llu\n", cft->read_u64(css, cft));
 		return 0;
 	}
 	if (cft->read_s64) {
 		seq_printf(m, "%lld\n", cft->read_s64(css, cft));
 		return 0;
 	}
 	return -EINVAL;
 }
 /* kernfs ops for cftypes without seq_start: only ->seq_show is wired up for reads */
 static struct kernfs_ops cgroup_kf_single_ops = {
 	.atomic_write_len	= PAGE_SIZE,
 	.write			= cgroup_file_write,
 	.seq_show		= cgroup_seqfile_show,
 };
 /* kernfs ops for cftypes that supply seq_start: the full seq_file iterator is forwarded */
 static struct kernfs_ops cgroup_kf_ops = {
 	.atomic_write_len	= PAGE_SIZE,
 	.write			= cgroup_file_write,
 	.seq_start		= cgroup_seqfile_start,
 	.seq_next		= cgroup_seqfile_next,
 	.seq_stop		= cgroup_seqfile_stop,
 	.seq_show		= cgroup_seqfile_show,
 };
 /*
  * cgroup_rename - Only allow simple rename of directories in place.
  */
 static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
 			 const char *new_name_str)
 {
 	struct cgroup *cgrp = kn->priv;
 	int ret;
 	/* only directories can be renamed */
 	if (kernfs_type(kn) != KERNFS_DIR)
 		return -ENOTDIR;
 	/* moving to a different parent is refused */
 	if (kn->parent != new_parent)
 		return -EIO;
 	/*
 	 * This isn't a proper migration and its usefulness is very
 	 * limited.  Disallow if sane_behavior.
 	 */
 	if (cgroup_sane_behavior(cgrp))
 		return -EPERM;
 	/*
 	 * We're gonna grab cgroup_tree_mutex which nests outside kernfs
 	 * active_ref.  kernfs_rename() doesn't require active_ref
 	 * protection.  Break them before grabbing cgroup_tree_mutex.
 	 */
 	kernfs_break_active_protection(new_parent);
 	kernfs_break_active_protection(kn);
 	mutex_lock(&cgroup_tree_mutex);
 	mutex_lock(&cgroup_mutex);
 	ret = kernfs_rename(kn, new_parent, new_name_str);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgroup_tree_mutex);
 	/* restore active protection in the reverse order it was broken */
 	kernfs_unbreak_active_protection(kn);
 	kernfs_unbreak_active_protection(new_parent);
 	return ret;
 }
 /* set uid and gid of cgroup dirs and files to that of the creator */
 static int cgroup_kn_set_ugid(struct kernfs_node *kn)
 {
 	struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
 			       .ia_uid = current_fsuid(),
 			       .ia_gid = current_fsgid(), };
 	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
 	    gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
 		return 0;
 	return kernfs_setattr(kn, &iattr);
 }
 /*
  * Create the kernfs file described by @cft in @cgrp's directory and give
  * it the creator's uid/gid.  The file is removed again if setting the
  * ownership fails.  Returns 0 or a negative errno.
  */
 static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
 {
 	char name_buf[CGROUP_FILE_NAME_MAX];
 	struct lock_class_key *lockdep_key = NULL;
 	struct kernfs_node *kn;
 	int err;
 	/* a per-cftype lockdep key only exists with lock debugging enabled */
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	lockdep_key = &cft->lockdep_key;
 #endif
 	kn = __kernfs_create_file(cgrp->kn,
 				  cgroup_file_name(cgrp, cft, name_buf),
 				  cgroup_file_mode(cft), 0, cft->kf_ops, cft,
 				  NULL, false, lockdep_key);
 	if (IS_ERR(kn))
 		return PTR_ERR(kn);
 	err = cgroup_kn_set_ugid(kn);
 	if (err)
 		kernfs_remove(kn);
 	return err;
 }
 /**
  * cgroup_addrm_files - add or remove files to a cgroup directory
  * @cgrp: the target cgroup
  * @cfts: array of cftypes to be added
  * @is_add: whether to add or remove
  *
  * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
  * For removals, this function never fails.  If addition fails, this
  * function doesn't remove files already added.  The caller is responsible
  * for cleaning up.
  */
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add)
 {
 	struct cftype *cft;
 	lockdep_assert_held(&cgroup_tree_mutex);
 	/* the array is terminated by an entry with an empty name */
 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
 		int ret;
 		/* does cft->flags tell us to skip this file on @cgrp? */
 		if (((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) ||
 		    ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) ||
 		    ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) ||
 		    ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent))
 			continue;
 		if (!is_add) {
 			cgroup_rm_file(cgrp, cft);
 			continue;
 		}
 		ret = cgroup_add_file(cgrp, cft);
 		if (!ret)
 			continue;
 		/* files added so far are left in place for the caller */
 		pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
 			cft->name, ret);
 		return ret;
 	}
 	return 0;
 }
 /*
  * Add (@is_add) or remove the files described by @cfts on every live
  * cgroup of the hierarchy the owning subsystem is attached to.  Nothing
  * is done while the subsystem sits on the default root.  Must be called
  * with cgroup_tree_mutex held.
  *
  * Returns 0 on success or the error from the first failed addition; files
  * already created are not removed here.
  */
 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
 {
 	struct cgroup_subsys *ss = cfts[0].ss;
 	struct cgroup *root = &ss->root->cgrp;
 	struct cgroup_subsys_state *css;
 	int ret = 0;
 	lockdep_assert_held(&cgroup_tree_mutex);
 	/* don't bother if @ss isn't attached */
 	if (ss->root == &cgrp_dfl_root)
 		return 0;
 	/* add/rm files for all cgroups created before */
 	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
 		struct cgroup *cgrp = css->cgroup;
 		if (cgroup_is_dead(cgrp))
 			continue;
 		ret = cgroup_addrm_files(cgrp, cfts, is_add);
 		if (ret)
 			break;
 	}
 	/* newly added files only become visible once activated */
 	if (is_add && !ret)
 		kernfs_activate(root->kn);
 	return ret;
 }
 static void cgroup_exit_cftypes(struct cftype *cfts)
 {
 	struct cftype *cft;
 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
 		/* free copy for custom atomic_write_len, see init_cftypes() */
 		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
 			kfree(cft->kf_ops);
 		cft->kf_ops = NULL;
 		cft->ss = NULL;
 	}
 }
 /*
  * Prepare every entry of @cfts for registration with @ss: pick the
  * matching kernfs_ops and record the owning subsystem.  Returns 0, or
  * -ENOMEM after undoing the whole array when a private kernfs_ops copy
  * cannot be allocated.
  */
 static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 {
 	struct cftype *cft;
 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
 		struct kernfs_ops *ops;
 		/* an entry must not be initialized twice */
 		WARN_ON(cft->ss || cft->kf_ops);
 		/* iterator-style files need the full seq_file callbacks */
 		ops = cft->seq_start ? &cgroup_kf_ops : &cgroup_kf_single_ops;
 		/*
 		 * Ugh... if @cft wants a custom max_write_len, we need to
 		 * make a copy of kf_ops to set its atomic_write_len.
 		 */
 		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
 			ops = kmemdup(ops, sizeof(*ops), GFP_KERNEL);
 			if (!ops) {
 				cgroup_exit_cftypes(cfts);
 				return -ENOMEM;
 			}
 			ops->atomic_write_len = cft->max_write_len;
 		}
 		cft->ss = ss;
 		cft->kf_ops = ops;
 	}
 	return 0;
 }
 /*
  * Unregister @cfts with cgroup_tree_mutex already held: unlink the array,
  * remove its files and reset its entries.  Returns -ENOENT if @cfts is
  * NULL or its first entry has no subsystem set, else 0.
  */
 static int cgroup_rm_cftypes_locked(struct cftype *cfts)
 {
 	lockdep_assert_held(&cgroup_tree_mutex);
 	if (cfts && cfts[0].ss) {
 		list_del(&cfts->node);
 		cgroup_apply_cftypes(cfts, false);
 		cgroup_exit_cftypes(cfts);
 		return 0;
 	}
 	return -ENOENT;
 }
 /**
  * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
  * @cfts: zero-length name terminated array of cftypes
  *
  * Unregister @cfts.  Files described by @cfts are removed from all
  * existing cgroups and all future cgroups won't have them either.  This
  * function can be called anytime whether @cfts' subsys is attached or not.
  *
  * Returns 0 on successful unregistration, -ENOENT if @cfts is not
  * registered.
  */
 int cgroup_rm_cftypes(struct cftype *cfts)
 {
 	int err;
 	/* the locked variant does the work; this wrapper only takes the mutex */
 	mutex_lock(&cgroup_tree_mutex);
 	err = cgroup_rm_cftypes_locked(cfts);
 	mutex_unlock(&cgroup_tree_mutex);
 	return err;
 }
 /**
  * cgroup_add_cftypes - add an array of cftypes to a subsystem
  * @ss: target cgroup subsystem
  * @cfts: zero-length name terminated array of cftypes
  *
  * Register @cfts to @ss.  Files described by @cfts are created for all
  * existing cgroups to which @ss is attached and all future cgroups will
  * have them too.  This function can be called anytime whether @ss is
  * attached or not.
  *
  * Returns 0 on successful registration, -errno on failure.  If creating
  * files on existing cgroups fails, @cfts is unregistered again and the
  * error is returned.
  */
 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 {
 	int err;
 	/* a NULL or empty array is accepted and means nothing to do */
 	if (!cfts || !cfts[0].name[0])
 		return 0;
 	err = cgroup_init_cftypes(ss, cfts);
 	if (!err) {
 		mutex_lock(&cgroup_tree_mutex);
 		list_add_tail(&cfts->node, &ss->cfts);
 		err = cgroup_apply_cftypes(cfts, true);
 		/* roll the registration back if file creation failed */
 		if (err)
 			cgroup_rm_cftypes_locked(cfts);
 		mutex_unlock(&cgroup_tree_mutex);
 	}
 	return err;
 }
 /**
  * cgroup_task_count - count the number of tasks in a cgroup.
  * @cgrp: the cgroup in question
  *
  * Return the number of tasks in the cgroup.
  */
 static int cgroup_task_count(const struct cgroup *cgrp)
 {
 	int count = 0;
 	struct cgrp_cset_link *link;
 	down_read(&css_set_rwsem);
 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
 		count += atomic_read(&link->cset->refcount);
 	up_read(&css_set_rwsem);
 	return count;
 }
 /**
  * css_next_child - find the next child of a given css
  * @pos_css: the current position (%NULL to initiate traversal)
  * @parent_css: css whose children to walk
  *
  * This function returns the next child of @parent_css and should be called
  * under either cgroup_mutex or RCU read lock.  The only requirement is
  * that @parent_css and @pos_css are accessible.  The next sibling is
  * guaranteed to be returned regardless of their states.
  *
  * Returns the css that cgroup_css() yields for the next child cgroup and
  * @parent_css->ss, or %NULL once the walk has come back around to the head
  * of the parent's children list.
  */
 struct cgroup_subsys_state *
 css_next_child(struct cgroup_subsys_state *pos_css,
 	       struct cgroup_subsys_state *parent_css)
 {
 	/* the walk is done on cgroups; csses are translated at the edges */
 	struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
 	struct cgroup *cgrp = parent_css->cgroup;
 	struct cgroup *next;
 	cgroup_assert_mutexes_or_rcu_locked();
 	/*
 	 * @pos could already have been removed.  Once a cgroup is removed,
 	 * its ->sibling.next is no longer updated when its next sibling
 	 * changes.  As CGRP_DEAD assertion is serialized and happens
 	 * before the cgroup is taken off the ->sibling list, if we see it
 	 * unasserted, it's guaranteed that the next sibling hasn't
 	 * finished its grace period even if it's already removed, and thus
 	 * safe to dereference from this RCU critical section.  If
 	 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
 	 * to be visible as %true here.
 	 *
 	 * If @pos is dead, its next pointer can't be dereferenced;
 	 * however, as each cgroup is given a monotonically increasing
 	 * unique serial number and always appended to the sibling list,
 	 * the next one can be found by walking the parent's children until
 	 * we see a cgroup with higher serial number than @pos's.  While
 	 * this path can be slower, it's taken only when either the current
 	 * cgroup is removed or iteration and removal race.
 	 */
 	if (!pos) {
 		/* fresh traversal: start from the first entry on the list */
 		next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
 	} else if (likely(!cgroup_is_dead(pos))) {
 		/* live @pos: its ->sibling.next can be followed directly */
 		next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
 	} else {
 		/* dead @pos: rescan for the first child with a higher serial */
 		list_for_each_entry_rcu(next, &cgrp->children, sibling)
 			if (next->serial_nr > pos->serial_nr)
 				break;
 	}
 	/* @next resolving to the list head itself means no child is left */
 	if (&next->sibling == &cgrp->children)
 		return NULL;
 	return cgroup_css(next, parent_css->ss);
 }
 /**
  * css_next_descendant_pre - find the next descendant for pre-order walk
  * @pos: the current position (%NULL to initiate traversal)
  * @root: css whose descendants to walk
  *
  * To be used by css_for_each_descendant_pre().  Find the next descendant
  * to visit for pre-order traversal of @root's descendants.  @root is
  * included in the iteration and the first node to be visited.
  *
  * While this function requires cgroup_mutex or RCU read locking, it
  * doesn't require the whole traversal to be contained in a single critical
  * section.  This function will return the correct next descendant as long
  * as both @pos and @root are accessible and @pos is a descendant of @root.
  *
  * Returns %NULL when the traversal is complete.
  */
 struct cgroup_subsys_state *
 css_next_descendant_pre(struct cgroup_subsys_state *pos,
 			struct cgroup_subsys_state *root)
 {
 	struct cgroup_subsys_state *cand;

 	cgroup_assert_mutexes_or_rcu_locked();

 	/* the very first step of the walk yields @root itself */
 	if (!pos)
 		return root;

 	/* descend first: a node's first child follows the node */
 	cand = css_next_child(NULL, pos);
 	if (cand)
 		return cand;

 	/*
 	 * Leaf reached.  Climb towards @root and take the first next
 	 * sibling found on the way up.
 	 */
 	for (; pos != root; pos = css_parent(pos)) {
 		cand = css_next_child(pos, css_parent(pos));
 		if (cand)
 			return cand;
 	}

 	return NULL;
 }
 /**
  * css_rightmost_descendant - return the rightmost descendant of a css
  * @pos: css of interest
  *
  * Return the rightmost descendant of @pos.  If there's no descendant, @pos
  * is returned.  This can be used during pre-order traversal to skip
  * subtree of @pos.
  *
  * While this function requires cgroup_mutex or RCU read locking, it
  * doesn't require the whole traversal to be contained in a single critical
  * section.  This function will return the correct rightmost descendant as
  * long as @pos is accessible.
  */
 struct cgroup_subsys_state *
 css_rightmost_descendant(struct cgroup_subsys_state *pos)
 {
 	struct cgroup_subsys_state *last, *tmp;
 	cgroup_assert_mutexes_or_rcu_locked();
 	do {
 		last = pos;
 		/* ->prev isn't RCU safe, walk ->next till the end */
 		pos = NULL;
 		css_for_each_child(tmp, last)
 			pos = tmp;
 	} while (pos);
 	return last;
 }
 /*
  * Follow first-child links down from @pos and return the css at which no
  * further child exists.  @pos itself is returned if it has no children.
  */
 static struct cgroup_subsys_state *
 css_leftmost_descendant(struct cgroup_subsys_state *pos)
 {
 	struct cgroup_subsys_state *first_child;

 	while ((first_child = css_next_child(NULL, pos)))
 		pos = first_child;

 	return pos;
 }
 /**
  * css_next_descendant_post - find the next descendant for post-order walk
  * @pos: the current position (%NULL to initiate traversal)
  * @root: css whose descendants to walk
  *
  * To be used by css_for_each_descendant_post().  Find the next descendant
  * to visit for post-order traversal of @root's descendants.  @root is
  * included in the iteration and the last node to be visited.
  *
  * While this function requires cgroup_mutex or RCU read locking, it
  * doesn't require the whole traversal to be contained in a single critical
  * section.  This function will return the correct next descendant as long
  * as both @pos and @root are accessible and @pos is a descendant of
  * @root.
  *
  * Returns %NULL once @root has been visited.
  */
 struct cgroup_subsys_state *
 css_next_descendant_post(struct cgroup_subsys_state *pos,
 			 struct cgroup_subsys_state *root)
 {
 	struct cgroup_subsys_state *sibling;

 	cgroup_assert_mutexes_or_rcu_locked();

 	/* first step: the leftmost descendant, which may be @root itself */
 	if (!pos)
 		return css_leftmost_descendant(root);

 	/* @root comes last, nothing follows it */
 	if (pos == root)
 		return NULL;

 	/*
 	 * An unvisited sibling's subtree is walked before the parent, so
 	 * dive to that sibling's leftmost descendant.  With no sibling
 	 * left, the parent is next.
 	 */
 	sibling = css_next_child(pos, css_parent(pos));

 	return sibling ? css_leftmost_descendant(sibling) : css_parent(pos);
 }
 /**
  * css_advance_task_iter - advance a task iterator to the next css_set
  * @it: the iterator to advance
  *
  * Advance @it to the next css_set to walk, skipping css_sets whose ->tasks
  * and ->mg_tasks lists are both empty.  When the cset_links list of the
  * origin cgroup is exhausted, @it->cset_link is set to %NULL.
  */
 static void css_advance_task_iter(struct css_task_iter *it)
 {
 	struct list_head *head = &it->origin_css->cgroup->cset_links;
 	struct list_head *pos = it->cset_link;
 	struct css_set *cset;

 	for (;;) {
 		pos = pos->next;

 		/* wrapped around to the list head: iteration is over */
 		if (pos == head) {
 			it->cset_link = NULL;
 			return;
 		}

 		cset = list_entry(pos, struct cgrp_cset_link, cset_link)->cset;

 		/* stop at the first css_set that has something to offer */
 		if (!list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks))
 			break;
 	}

 	it->cset_link = pos;

 	/* ->tasks is walked first, ->mg_tasks only if ->tasks is empty */
 	it->task = list_empty(&cset->tasks) ? cset->mg_tasks.next
 					    : cset->tasks.next;
 }
 /**
  * css_task_iter_start - initiate task iteration
  * @css: the css to walk tasks of
  * @it: the task iterator to use
  *
  * Initiate iteration through the tasks of @css.  The caller can call
  * css_task_iter_next() to walk through the tasks until the function
  * returns NULL.  On completion of iteration, css_task_iter_end() must be
  * called.
  *
  * Note that this function acquires the read side of css_set_rwsem, which
  * is released by css_task_iter_end() when the iteration finishes.  The
  * caller can't sleep while iteration is in progress.
  */
 void css_task_iter_start(struct cgroup_subsys_state *css,
 			 struct css_task_iter *it)
 	__acquires(css_set_rwsem)
 {
 	/* no one should try to iterate before mounting cgroups */
 	WARN_ON_ONCE(!use_task_css_set_links);

 	down_read(&css_set_rwsem);

 	/*
 	 * Park the cursor on the list head itself; the advance below then
 	 * lands on the first css_set that has tasks, if any.
 	 */
 	it->cset_link = &css->cgroup->cset_links;
 	it->origin_css = css;

 	css_advance_task_iter(it);
 }
 /**
  * css_task_iter_next - return the next task for the iterator
  * @it: the task iterator being iterated
  *
  * The "next" function for task iteration.  @it should have been
  * initialized via css_task_iter_start().  Returns the task the iterator
  * currently points at and moves the iterator one step forward, or returns
  * NULL when the iteration reaches the end.
  */
 struct task_struct *css_task_iter_next(struct css_task_iter *it)
 {
 	struct task_struct *res;
 	struct list_head *l;
 	struct cgrp_cset_link *link;

 	/* If the iterator's cset link is NULL, we have no tasks */
 	if (!it->cset_link)
 		return NULL;

 	/*
 	 * Only derive the containing link once ->cset_link is known to be
 	 * non-NULL so that list_entry() is never applied to a NULL pointer.
 	 */
 	link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
 	l = it->task;
 	res = list_entry(l, struct task_struct, cg_list);

 	/*
 	 * Advance iterator to find next entry.  cset->tasks is consumed
 	 * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
 	 * next cset.
 	 */
 	l = l->next;

 	/* end of ->tasks: continue with the first entry of ->mg_tasks */
 	if (l == &link->cset->tasks)
 		l = link->cset->mg_tasks.next;

 	/* end of ->mg_tasks (possibly empty): this css_set is exhausted */
 	if (l == &link->cset->mg_tasks)
 		css_advance_task_iter(it);
 	else
 		it->task = l;

 	return res;
 }
 /**
  * css_task_iter_end - finish task iteration
  * @it: the task iterator to finish
  *
  * Finish task iteration started by css_task_iter_start().  This drops the
  * read side of css_set_rwsem which css_task_iter_start() acquired; @it
  * itself is not touched.
  */
 void css_task_iter_end(struct css_task_iter *it)
 	__releases(css_set_rwsem)
 {
 	up_read(&css_set_rwsem);
 }
 /**
  * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
  * @to: cgroup to which the tasks will be moved
  * @from: cgroup in which the tasks currently reside
  *
  * Locking rules between cgroup_post_fork() and the migration path
  * guarantee that, if a task is forking while being migrated, the new child
  * is guaranteed to be either visible in the source cgroup after the
  * parent's migration is complete or put into the target cgroup.  No task
  * can slip out of migration through forking.
  */
 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 {
 	LIST_HEAD(preloaded_csets);
 	struct cgrp_cset_link *link;
 	struct css_task_iter it;
 	struct task_struct *task;
 	int ret;
 	mutex_lock(&cgroup_mutex);
 	/* all tasks in @from are being moved, all csets are source */
 	down_read(&css_set_rwsem);
 	list_for_each_entry(link, &from->cset_links, cset_link)
 		cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
 	up_read(&css_set_rwsem);
 	ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
 	if (ret)
 		goto out_err;
 	/*
 	 * Migrate tasks one-by-one until @form is empty.  This fails iff
 	 * ->can_attach() fails.
 	 */
 	do {
 		css_task_iter_start(&from->dummy_css, &it);
 		task = css_task_iter_next(&it);
 		if (task)
 			get_task_struct(task);
 		css_task_iter_end(&it);
 		if (task) {
 			ret = cgroup_migrate(to, task, false);
 			put_task_struct(task);
 		}
 	} while (task && !ret);
 out_err:
 	cgroup_migrate_finish(&preloaded_csets);
 	mutex_unlock(&cgroup_mutex);
 	return ret;
 }
 /*
  * Stuff for reading the 'tasks'/'procs' files.
  *
  * Reading this file can return large amounts of data if a cgroup has
  * *lots* of attached tasks. So it may need several calls to read(),
  * but we cannot guarantee that the information we produce is correct
  * unless we produce it entirely atomically.
  *
  */
 /*
  * Which pidlist file are we talking about?  The value is stored in the
  * ->private field of the "cgroup.procs" / "tasks" cftypes and is part of
  * the key that identifies a pidlist.
  */
 enum cgroup_filetype {
 	CGROUP_FILE_PROCS,	/* list of tgids */
 	CGROUP_FILE_TASKS,	/* list of pids */
 };
 /*
  * A pidlist is a list of pids that virtually represents the contents of one
  * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
  * a pair (one each for procs, tasks) for each pid namespace that's relevant
  * to the cgroup.
  */
 struct cgroup_pidlist {
 	/*
 	 * used to find which pidlist is wanted. doesn't change as long as
 	 * this particular list stays in the list.  @ns holds a reference
 	 * taken with get_pid_ns() when the pidlist is created and dropped
 	 * with put_pid_ns() when it is destroyed.
 	*/
 	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
 	/* array of pids or tgids, depending on key.type */
 	pid_t *list;
 	/* how many elements the above list has */
 	int length;
 	/* each of these stored in a list by its cgroup (cgroup->pidlists) */
 	struct list_head links;
 	/* pointer to the cgroup we belong to, for list removal purposes */
 	struct cgroup *owner;
 	/* for delayed destruction, runs cgroup_pidlist_destroy_work_fn() */
 	struct delayed_work destroy_dwork;
 };
 /*
  * The following two functions "fix" the issue where there are more pids
  * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
  * TODO: replace with a kernel-wide solution to this problem
  */
 #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
 static void *pidlist_allocate(int count)
 {
 	if (PIDLIST_TOO_LARGE(count))
 		return vmalloc(count * sizeof(pid_t));
 	else
 		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
 }
 /*
  * Release an array obtained from pidlist_allocate(), picking vfree() or
  * kfree() according to where the address lives.
  */
 static void pidlist_free(void *p)
 {
 	if (!is_vmalloc_addr(p)) {
 		kfree(p);
 		return;
 	}

 	vfree(p);
 }
 /*
  * Used to destroy all pidlists lingering waiting for destroy timer.  None
  * should be left afterwards.
  */
 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
 {
 	struct cgroup_pidlist *l, *tmp_l;
 	mutex_lock(&cgrp->pidlist_mutex);
 	list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
 		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
 	mutex_unlock(&cgrp->pidlist_mutex);
 	flush_workqueue(cgroup_pidlist_destroy_wq);
 	BUG_ON(!list_empty(&cgrp->pidlists));
 }
 static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
 {
 	struct delayed_work *dwork = to_delayed_work(work);
 	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
 						destroy_dwork);
 	struct cgroup_pidlist *tofree = NULL;
 	mutex_lock(&l->owner->pidlist_mutex);
 	/*
 	 * Destroy iff we didn't get queued again.  The state won't change
 	 * as destroy_dwork can only be queued while locked.
 	 */
 	if (!delayed_work_pending(dwork)) {
 		list_del(&l->links);
 		pidlist_free(l->list);
 		put_pid_ns(l->key.ns);
 		tofree = l;
 	}
 	mutex_unlock(&l->owner->pidlist_mutex);
 	kfree(tofree);
 }
 /*
  * pidlist_uniq - squeeze runs of equal neighbours out of a pid list
  *
  * Compacts @list in place so that of every run of identical adjacent
  * entries only one survives; on a sorted list that strips all duplicates.
  * Returns the number of entries kept.
  */
 static int pidlist_uniq(pid_t *list, int length)
 {
 	int rd, wr = 1;

 	/* zero or one element: nothing can be a duplicate */
 	if (length == 0 || length == 1)
 		return length;

 	/*
 	 * Element 0 always stays.  @rd scans the rest of the list, @wr is
 	 * both the slot for the next kept element and the running count.
 	 */
 	for (rd = 1; rd < length; rd++) {
 		if (list[rd] != list[rd - 1])
 			list[wr++] = list[rd];
 	}

 	return wr;
 }
 /*
  * The two pid files - task and cgroup.procs - guaranteed that the result
  * is sorted, which forced this whole pidlist fiasco.  As pid order is
  * different per namespace, each namespace needs differently sorted list,
  * making it impossible to use, for example, single rbtree of member tasks
  * sorted by task pointer.  As pidlists can be fairly large, allocating one
  * per open file is dangerous, so cgroup had to implement shared pool of
  * pidlists keyed by cgroup and namespace.
  *
  * All this extra complexity was caused by the original implementation
  * committing to an entirely unnecessary property.  In the long term, we
  * want to do away with it.  Explicitly scramble sort order if
  * sane_behavior so that no such expectation exists in the new interface.
  *
  * Scrambling is done by swapping every two consecutive bits, which is
  * non-identity one-to-one mapping which disturbs sort order sufficiently.
  * Applying the swap twice gives the original value back.
  */
 static pid_t pid_fry(pid_t pid)
 {
 	/* bits at even positions (0, 2, 4, ...) move one position up */
 	unsigned even_bits = pid & 0x55555555;
 	/* bits at odd positions (1, 3, 5, ...) move one position down */
 	unsigned odd_bits = pid & 0xAAAAAAAA;

 	return (odd_bits >> 1) | (even_bits << 1);
 }
 /*
  * Scramble @pid with pid_fry() if @cgrp is in sane_behavior mode, pass it
  * through untouched otherwise.
  */
 static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
 {
 	return cgroup_sane_behavior(cgrp) ? pid_fry(pid) : pid;
 }
 /* sort() comparison callback: orders pid_t entries by numeric value */
 static int cmppid(const void *a, const void *b)
 {
 	pid_t lhs = *(pid_t *)a;
 	pid_t rhs = *(pid_t *)b;

 	return lhs - rhs;
 }
 /* sort() comparison callback: orders pid_t entries by their fried value */
 static int fried_cmppid(const void *a, const void *b)
 {
 	pid_t lhs = pid_fry(*(pid_t *)a);
 	pid_t rhs = pid_fry(*(pid_t *)b);

 	return lhs - rhs;
 }
 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
 						  enum cgroup_filetype type)
 {
 	struct cgroup_pidlist *l;
 	/* don't need task_nsproxy() if we're looking at ourself */
 	struct pid_namespace *ns = task_active_pid_ns(current);
 	lockdep_assert_held(&cgrp->pidlist_mutex);
 	list_for_each_entry(l, &cgrp->pidlists, links)
 		if (l->key.type == type && l->key.ns == ns)
 			return l;
 	return NULL;
 }
 /*
  * find the appropriate pidlist for our purpose (given procs vs tasks)
  * returns with the lock on that pidlist already held, and takes care
  * of the use count, or returns NULL with no locks held if we're out of
  * memory.
  */
 static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
 						enum cgroup_filetype type)
 {
 	struct cgroup_pidlist *l;
 	lockdep_assert_held(&cgrp->pidlist_mutex);
 	l = cgroup_pidlist_find(cgrp, type);
 	if (l)
 		return l;
 	/* entry not found; create a new one */
 	l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
 	if (!l)
 		return l;
 	INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
 	l->key.type = type;
 	/* don't need task_nsproxy() if we're looking at ourself */
 	l->key.ns = get_pid_ns(task_active_pid_ns(current));
 	l->owner = cgrp;
 	list_add(&l->links, &cgrp->pidlists);
 	return l;
 }
 /*
  * Load a cgroup's pidarray with either procs' tgids or tasks' pids
  *
  * @cgrp: cgroup whose member tasks are listed
  * @type: CGROUP_FILE_PROCS to collect tgids, anything else collects pids
  * @lp: on success, set to the pidlist that now owns the freshly built array
  *
  * Must be called with @cgrp->pidlist_mutex held.  The mutex is left held
  * on return on success and on every failure path alike; releasing it is
  * the caller's business (cgroup_pidlist_stop() does so unconditionally,
  * so dropping it here on error would unlock it twice).
  *
  * Returns 0 on success, -ENOMEM if either the pid array or the pidlist
  * descriptor can't be allocated.
  */
 static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
 			      struct cgroup_pidlist **lp)
 {
 	pid_t *array;
 	int length;
 	int pid, n = 0; /* used for populating the array */
 	struct css_task_iter it;
 	struct task_struct *tsk;
 	struct cgroup_pidlist *l;

 	lockdep_assert_held(&cgrp->pidlist_mutex);

 	/*
 	 * If cgroup gets more users after we read count, we won't have
 	 * enough space - tough.  This race is indistinguishable to the
 	 * caller from the case that the additional cgroup users didn't
 	 * show up until sometime later on.
 	 */
 	length = cgroup_task_count(cgrp);
 	array = pidlist_allocate(length);
 	if (!array)
 		return -ENOMEM;

 	/* now, populate the array */
 	css_task_iter_start(&cgrp->dummy_css, &it);
 	while ((tsk = css_task_iter_next(&it))) {
 		/* never write past the space sized from the earlier count */
 		if (unlikely(n == length))
 			break;
 		/* get tgid or pid for procs or tasks file respectively */
 		if (type == CGROUP_FILE_PROCS)
 			pid = task_tgid_vnr(tsk);
 		else
 			pid = task_pid_vnr(tsk);
 		if (pid > 0) /* make sure to only use valid results */
 			array[n++] = pid;
 	}
 	css_task_iter_end(&it);
 	length = n;

 	/* now sort & (if procs) strip out duplicates */
 	if (cgroup_sane_behavior(cgrp))
 		sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
 	else
 		sort(array, length, sizeof(pid_t), cmppid, NULL);
 	if (type == CGROUP_FILE_PROCS)
 		length = pidlist_uniq(array, length);

 	l = cgroup_pidlist_find_create(cgrp, type);
 	if (!l) {
 		/* pidlist_mutex stays held, exactly as on the path above */
 		pidlist_free(array);
 		return -ENOMEM;
 	}

 	/* store array, freeing old if necessary */
 	pidlist_free(l->list);
 	l->list = array;
 	l->length = length;
 	*lp = l;
 	return 0;
 }
 /**
  * cgroupstats_build - build and fill cgroupstats
  * @stats: cgroupstats to fill information into
  * @dentry: A dentry entry belonging to the cgroup for which stats have
  * been requested.
  *
  * Build and fill cgroupstats so that taskstats can export it to user
  * space.
  */
 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
 {
 	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
 	struct cgroup *cgrp;
 	struct css_task_iter it;
 	struct task_struct *tsk;
 	/* it should be kernfs_node belonging to cgroupfs and is a directory */
 	if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
 	    kernfs_type(kn) != KERNFS_DIR)
 		return -EINVAL;
 	mutex_lock(&cgroup_mutex);
 	/*
 	 * We aren't being called from kernfs and there's no guarantee on
 	 * @kn->priv's validity.  For this and css_tryget_from_dir(),
 	 * @kn->priv is RCU safe.  Let's do the RCU dancing.
 	 */
 	rcu_read_lock();
 	cgrp = rcu_dereference(kn->priv);
 	if (!cgrp || cgroup_is_dead(cgrp)) {
 		rcu_read_unlock();
 		mutex_unlock(&cgroup_mutex);
 		return -ENOENT;
 	}
 	rcu_read_unlock();
 	css_task_iter_start(&cgrp->dummy_css, &it);
 	while ((tsk = css_task_iter_next(&it))) {
 		switch (tsk->state) {
 		case TASK_RUNNING:
 			stats->nr_running++;
 			break;
 		case TASK_INTERRUPTIBLE:
 			stats->nr_sleeping++;
 			break;
 		case TASK_UNINTERRUPTIBLE:
 			stats->nr_uninterruptible++;
 			break;
 		case TASK_STOPPED:
 			stats->nr_stopped++;
 			break;
 		default:
 			if (delayacct_is_task_waiting_on_io(tsk))
 				stats->nr_io_wait++;
 			break;
 		}
 	}
 	css_task_iter_end(&it);
 	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
 /*
  * seq_file methods for the tasks/procs files. The seq_file position is the
  * next pid to display; the seq_file iterator is a pointer to the pid
  * in the cgroup->l->list array.
  *
  * cgroup_pidlist_start() takes cgrp->pidlist_mutex and returns with it
  * held, also when it returns NULL or an ERR_PTR; cgroup_pidlist_stop()
  * releases it.
  */
 static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
 {
 	struct kernfs_open_file *of = s->private;
 	struct cgroup *cgrp = seq_css(s)->cgroup;
 	enum cgroup_filetype type = seq_cft(s)->private;
 	struct cgroup_pidlist *l;
 	int pid = *pos;
 	int lo = 0;
 	int *iter;

 	mutex_lock(&cgrp->pidlist_mutex);

 	/*
 	 * !NULL @of->priv indicates that this isn't the first start()
 	 * after open.  If the matching pidlist is around, we can use that.
 	 * Look for it.  Note that @of->priv can't be used directly.  It
 	 * could already have been destroyed.
 	 */
 	if (of->priv)
 		of->priv = cgroup_pidlist_find(cgrp, type);

 	/*
 	 * Either this is the first start() after open or the matching
 	 * pidlist has been destroyed inbetween.  Create a new one.
 	 */
 	if (!of->priv) {
 		int ret = pidlist_array_load(cgrp, type,
 					     (struct cgroup_pidlist **)&of->priv);

 		if (ret)
 			return ERR_PTR(ret);
 	}
 	l = of->priv;

 	/*
 	 * The position we receive corresponds to one more than the last
 	 * pid shown (or 0 on the first call or after a seek to the
 	 * start).  Binary-search for the next pid to display, if any.
 	 */
 	if (pid) {
 		int hi = l->length;

 		while (lo < hi) {
 			int mid = (lo + hi) / 2;
 			pid_t cur = cgroup_pid_fry(cgrp, l->list[mid]);

 			if (cur == pid) {
 				lo = mid;
 				break;
 			}
 			if (cur < pid)
 				lo = mid + 1;
 			else
 				hi = mid;
 		}
 	}

 	/* If we're off the end of the array, we're done */
 	if (lo >= l->length)
 		return NULL;

 	/* Update the abstract position to be the actual pid that we found */
 	iter = l->list + lo;
 	*pos = cgroup_pid_fry(cgrp, *iter);
 	return iter;
 }
 static void cgroup_pidlist_stop(struct seq_file *s, void *v)
 {
 	struct kernfs_open_file *of = s->private;
 	struct cgroup_pidlist *l = of->priv;
 	if (l)
 		mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
 				 CGROUP_PIDLIST_DESTROY_DELAY);
 	mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
 }
 /*
  * seq_file next method: steps the iterator @v to the following entry of
  * the pidlist array and updates *@pos to that entry's (possibly fried)
  * pid.  Returns NULL once the end of the array is reached.
  */
 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
 {
 	struct kernfs_open_file *of = s->private;
 	struct cgroup_pidlist *l = of->priv;
 	pid_t *next = (pid_t *)v + 1;

 	/* If advancing goes off the end of the array, we're done */
 	if (next >= l->list + l->length)
 		return NULL;

 	*pos = cgroup_pid_fry(seq_css(s)->cgroup, *next);
 	return next;
 }
 /*
  * seq_file show method: prints the pid the iterator @v points at as a
  * decimal number followed by a newline and returns seq_printf()'s result.
  */
 static int cgroup_pidlist_show(struct seq_file *s, void *v)
 {
 	return seq_printf(s, "%d\n", *(int *)v);
 }
 /*
  * seq_operations functions for iterating on pidlists through seq_file -
  * independent of whether it's tasks or procs.  start() takes the cgroup's
  * pidlist_mutex and stop() releases it.
  */
 static const struct seq_operations cgroup_pidlist_seq_operations = {
 	.start = cgroup_pidlist_start,
 	.stop = cgroup_pidlist_stop,
 	.next = cgroup_pidlist_next,
 	.show = cgroup_pidlist_show,
 };
 /*
  * read_u64 handler of the "notify_on_release" file: reports what
  * notify_on_release() returns for the cgroup of @css.  @cft is unused.
  */
 static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
 					 struct cftype *cft)
 {
 	return notify_on_release(css->cgroup);
 }
 /*
  * write_u64 handler of the "notify_on_release" file.  Clears
  * CGRP_RELEASABLE on the cgroup of @css, then sets CGRP_NOTIFY_ON_RELEASE
  * for a non-zero @val and clears it for zero.  Always returns 0.
  */
 static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
 					  struct cftype *cft, u64 val)
 {
 	struct cgroup *cgrp = css->cgroup;

 	clear_bit(CGRP_RELEASABLE, &cgrp->flags);

 	if (!val)
 		clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 	else
 		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

 	return 0;
 }
 /*
  * read_u64 handler of the "cgroup.clone_children" file: reports whether
  * CGRP_CPUSET_CLONE_CHILDREN is set on the cgroup of @css.  @cft is unused.
  */
 static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
 				      struct cftype *cft)
 {
 	return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
 }
 /*
  * write_u64 handler of the "cgroup.clone_children" file: sets
  * CGRP_CPUSET_CLONE_CHILDREN on the cgroup of @css for a non-zero @val and
  * clears it for zero.  Always returns 0.
  */
 static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
 				       struct cftype *cft, u64 val)
 {
 	struct cgroup *cgrp = css->cgroup;

 	if (!val)
 		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 	else
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);

 	return 0;
 }
 /*
  * Base interface files of a cgroup, i.e. the ones that aren't provided by
  * a subsystem.  The array is terminated by an empty entry.
  */
 static struct cftype cgroup_base_files[] = {
 	{
 		/* pidlist-backed listing, ->private selects the tgid flavour */
 		.name = "cgroup.procs",
 		.seq_start = cgroup_pidlist_start,
 		.seq_next = cgroup_pidlist_next,
 		.seq_stop = cgroup_pidlist_stop,
 		.seq_show = cgroup_pidlist_show,
 		.private = CGROUP_FILE_PROCS,
 		.write_u64 = cgroup_procs_write,
 		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
 		/* exposes the CGRP_CPUSET_CLONE_CHILDREN flag */
 		.name = "cgroup.clone_children",
 		.flags = CFTYPE_INSANE,
 		.read_u64 = cgroup_clone_children_read,
 		.write_u64 = cgroup_clone_children_write,
 	},
 	{
 		.name = "cgroup.sane_behavior",
 		.flags = CFTYPE_ONLY_ON_ROOT,
 		.seq_show = cgroup_sane_behavior_show,
 	},
 	/*
 	 * Historical crazy stuff.  These don't have "cgroup."  prefix and
 	 * don't exist if sane_behavior.  If you're depending on these, be
 	 * prepared to be burned.
 	 */
 	{
 		/* pidlist-backed listing, ->private selects the pid flavour */
 		.name = "tasks",
 		.flags = CFTYPE_INSANE,		/* use "procs" instead */
 		.seq_start = cgroup_pidlist_start,
 		.seq_next = cgroup_pidlist_next,
 		.seq_stop = cgroup_pidlist_stop,
 		.seq_show = cgroup_pidlist_show,
 		.private = CGROUP_FILE_TASKS,
 		.write_u64 = cgroup_tasks_write,
 		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
 		/* exposes the CGRP_NOTIFY_ON_RELEASE flag */
 		.name = "notify_on_release",
 		.flags = CFTYPE_INSANE,
 		.read_u64 = cgroup_read_notify_on_release,
 		.write_u64 = cgroup_write_notify_on_release,
 	},
 	{
 		.name = "release_agent",
 		.flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
 		.seq_show = cgroup_release_agent_show,
 		.write_string = cgroup_release_agent_write,
 		.max_write_len = PATH_MAX - 1,
 	},
 	{ }	/* terminate */
 };
 /**
  * cgroup_populate_dir - create subsys files in a cgroup directory
  * @cgrp: target cgroup
  * @subsys_mask: mask of the subsystem ids whose files should be added
  *
  * Adds every registered cftype array of each subsystem selected by
  * @subsys_mask to @cgrp.
  *
  * On failure, no file is added: whatever was created so far is removed
  * again with cgroup_clear_dir() and the error is returned.  Returns 0 on
  * success.
  */
 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 {
 	struct cgroup_subsys *ss;
 	int i, ret;

 	/* process cftsets of each subsystem */
 	for_each_subsys(ss, i) {
 		struct cftype *cfts;

 		if (test_bit(i, &subsys_mask)) {
 			list_for_each_entry(cfts, &ss->cfts, node) {
 				ret = cgroup_addrm_files(cgrp, cfts, true);
 				if (ret < 0) {
 					/* undo the partial population */
 					cgroup_clear_dir(cgrp, subsys_mask);
 					return ret;
 				}
 			}
 		}
 	}

 	return 0;
 }
 /*
  * css destruction is four-stage process.
  *
  * 1. Destruction starts.  Killing of the percpu_ref is initiated.
  *    Implemented in kill_css().
  *
  * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
  *    and thus css_tryget() is guaranteed to fail, the css can be offlined
  *    by invoking offline_css().  After offlining, the base ref is put.
  *    Implemented in css_killed_work_fn().
  *
  * 3. When the percpu_ref reaches zero, the only possible remaining
  *    accessors are inside RCU read sections.  css_release() schedules the
  *    RCU callback.
  *
  * 4. After the grace period, the css can be freed.  Implemented in
  *    css_free_work_fn().
  *
  * It is actually hairier because both step 2 and 4 require process context
  * and thus involve punting to css->destroy_work adding two additional
  * steps to the already complex sequence.
  */
 /*
  * Final stage of css destruction, run from css->destroy_work.  Puts the
  * reference on the parent css (if there is a parent), hands the css to
  * the subsystem's ->css_free() and puts the reference on the cgroup.
  */
 static void css_free_work_fn(struct work_struct *work)
 {
 	struct cgroup_subsys_state *css =
 		container_of(work, struct cgroup_subsys_state, destroy_work);
 	struct cgroup_subsys_state *parent = css->parent;
 	/* remember the cgroup, @css is handed to ->css_free() before the put */
 	struct cgroup *cgrp = css->cgroup;

 	if (parent)
 		css_put(parent);

 	css->ss->css_free(css);
 	cgroup_put(cgrp);
 }
 /*
  * RCU callback scheduled by css_release().  It only sets up
  * css->destroy_work to run css_free_work_fn() and queues it on
  * cgroup_destroy_wq; the actual freeing happens in that work item.
  */
 static void css_free_rcu_fn(struct rcu_head *rcu_head)
 {
 	struct cgroup_subsys_state *css =
 		container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
 	INIT_WORK(&css->destroy_work, css_free_work_fn);
 	queue_work(cgroup_destroy_wq, &css->destroy_work);
 }
 /*
  * Release callback of css->refcnt (installed with percpu_ref_init() in
  * create_css()).  Clears the css's slot in its cgroup's ->subsys[] array
  * and schedules css_free_rcu_fn() through call_rcu().
  */
 static void css_release(struct percpu_ref *ref)
 {
 	struct cgroup_subsys_state *css =
 		container_of(ref, struct cgroup_subsys_state, refcnt);
 	RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL);
 	call_rcu(&css->rcu_head, css_free_rcu_fn);
 }
 /*
  * Initialize the cgroup, subsystem, flags and parent linkage of a freshly
  * allocated @css.  A css whose cgroup has no parent is flagged CSS_ROOT
  * instead of getting a ->parent.  It is a bug if @cgrp already has a css
  * for @ss.
  */
 static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
 		     struct cgroup *cgrp)
 {
 	struct cgroup *parent = cgrp->parent;

 	css->cgroup = cgrp;
 	css->ss = ss;
 	css->flags = 0;

 	if (!parent)
 		css->flags |= CSS_ROOT;
 	else
 		css->parent = cgroup_css(parent, ss);

 	BUG_ON(cgroup_css(cgrp, ss));
 }
 /*
  * Invoke ->css_online() on a new CSS, if the subsystem has one, and mark
  * the CSS online if successful: CSS_ONLINE is set, the cgroup's nr_css is
  * bumped and the css is published in the cgroup's ->subsys[] slot.
  * Requires both cgroup_tree_mutex and cgroup_mutex.  Returns 0 or the
  * error from ->css_online().
  */
 static int online_css(struct cgroup_subsys_state *css)
 {
 	struct cgroup_subsys *ss = css->ss;

 	lockdep_assert_held(&cgroup_tree_mutex);
 	lockdep_assert_held(&cgroup_mutex);

 	if (ss->css_online) {
 		int err = ss->css_online(css);

 		if (err)
 			return err;
 	}

 	css->flags |= CSS_ONLINE;
 	css->cgroup->nr_css++;
 	rcu_assign_pointer(css->cgroup->subsys[ss->id], css);

 	return 0;
 }
 /*
  * If the CSS is online, invoke ->css_offline() on it (when the subsystem
  * provides one) and mark it offline: CSS_ONLINE is cleared and the
  * cgroup's nr_css is decremented.  Does nothing for a CSS that isn't
  * online.  Requires both cgroup_tree_mutex and cgroup_mutex.
  */
 static void offline_css(struct cgroup_subsys_state *css)
 {
 	struct cgroup_subsys *ss = css->ss;
 	lockdep_assert_held(&cgroup_tree_mutex);
 	lockdep_assert_held(&cgroup_mutex);
 	if (!(css->flags & CSS_ONLINE))
 		return;
 	if (ss->css_offline)
 		ss->css_offline(css);
 	css->flags &= ~CSS_ONLINE;
 	css->cgroup->nr_css--;
 	/*
 	 * NOTE(review): the value stored here is @css itself, the same
 	 * pointer online_css() published, so the slot is not cleared at
 	 * this point; css_release() is what stores NULL.  Confirm that
 	 * keeping the pointer until release is intended.
 	 */
 	RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css);
 }
 /**
  * create_css - create a cgroup_subsys_state
  * @cgrp: the cgroup new css will be associated with
  * @ss: the subsys of new css
  *
  * Create a new css associated with @cgrp - @ss pair.  On success, the new
  * css is online and installed in @cgrp with all interface files created.
  * A reference is taken on @cgrp and on the parent css, and @ss's bit is
  * set in @cgrp->subsys_mask.  Must be called with cgroup_mutex held.
  *
  * Returns 0 on success, -errno on failure.  On failure everything done so
  * far is unwound and the css is handed back to ->css_free().
  */
 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
 {
 	struct cgroup *parent = cgrp->parent;
 	struct cgroup_subsys_state *css;
 	int err;
 	lockdep_assert_held(&cgroup_mutex);
 	/* the subsystem allocates the css, given the parent cgroup's css */
 	css = ss->css_alloc(cgroup_css(parent, ss));
 	if (IS_ERR(css))
 		return PTR_ERR(css);
 	/* css_release() runs once the last reference is dropped */
 	err = percpu_ref_init(&css->refcnt, css_release);
 	if (err)
 		goto err_free_css;
 	init_css(css, ss, cgrp);
 	/* interface files are created before the css goes online */
 	err = cgroup_populate_dir(cgrp, 1 << ss->id);
 	if (err)
 		goto err_free_percpu_ref;
 	err = online_css(css);
 	if (err)
 		goto err_clear_dir;
 	/* these two references are put again in css_free_work_fn() */
 	cgroup_get(cgrp);
 	css_get(css->parent);
 	cgrp->subsys_mask |= 1 << ss->id;
 	/* warn once per subsystem about nesting below a non-root cgroup */
 	if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
 	    parent->parent) {
 		pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
 			   current->comm, current->pid, ss->name);
 		if (!strcmp(ss->name, "memory"))
 			pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
 		ss->warned_broken_hierarchy = true;
 	}
 	return 0;
 	/* error unwinding, in reverse order of the setup steps above */
 err_clear_dir:
 	cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
 err_free_percpu_ref:
 	percpu_ref_cancel_init(&css->refcnt);
 err_free_css:
 	ss->css_free(css);
 	return err;
 }
 /**
  * cgroup_create - create a cgroup
  * @parent: cgroup that will be parent of the new cgroup
  * @name: name of the new cgroup
  * @mode: mode to set on new cgroup
  *
  * Allocates a new cgroup under @parent, creates its kernfs directory and
  * base files, and creates and onlines a css for every subsystem enabled
  * on the root.  Takes cgroup_tree_mutex and, through
  * cgroup_lock_live_group(), cgroup_mutex; both are released on every
  * exit path.
  *
  * Returns 0 on success, -errno on failure.
  */
 static long cgroup_create(struct cgroup *parent, const char *name,
 			  umode_t mode)
 {
 	struct cgroup *cgrp;
 	struct cgroup_root *root = parent->root;
 	int ssid, err;
 	struct cgroup_subsys *ss;
 	struct kernfs_node *kn;
 	/*
 	 * XXX: The default hierarchy isn't fully implemented yet.  Block
 	 * !root cgroup creation on it for now.
 	 */
 	if (root == &cgrp_dfl_root)
 		return -EINVAL;
 	/* allocate the cgroup and its ID, 0 is reserved for the root */
 	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
 	if (!cgrp)
 		return -ENOMEM;
 	mutex_lock(&cgroup_tree_mutex);
 	/*
 	 * Only live parents can have children.  Note that the liveliness
 	 * check isn't strictly necessary because cgroup_mkdir() and
 	 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
 	 * anyway so that locking is contained inside cgroup proper and we
 	 * don't get nasty surprises if we ever grow another caller.
 	 */
 	if (!cgroup_lock_live_group(parent)) {
 		err = -ENODEV;
 		goto err_unlock_tree;
 	}
 	/*
 	 * Temporarily set the pointer to NULL, so idr_find() won't return
 	 * a half-baked cgroup.
 	 *
 	 * NOTE(review): any negative idr_alloc() result is reported to the
 	 * caller as -ENOMEM, whatever the actual error code was.
 	 */
 	cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
 	if (cgrp->id < 0) {
 		err = -ENOMEM;
 		goto err_unlock;
 	}
 	init_cgroup_housekeeping(cgrp);
 	cgrp->parent = parent;
 	cgrp->dummy_css.parent = &parent->dummy_css;
 	cgrp->root = parent->root;
 	/* inherit the notify_on_release and clone_children flags from @parent */
 	if (notify_on_release(parent))
 		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
 		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
 	/* create the directory */
 	kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
 	if (IS_ERR(kn)) {
 		err = PTR_ERR(kn);
 		goto err_free_id;
 	}
 	cgrp->kn = kn;
 	/*
 	 * This extra ref will be put in cgroup_free_fn() and guarantees
 	 * that @cgrp->kn is always accessible.
 	 */
 	kernfs_get(kn);
 	cgrp->serial_nr = cgroup_serial_nr_next++;
 	/* allocation complete, commit to creation */
 	list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
 	atomic_inc(&root->nr_cgrps);
 	cgroup_get(parent);
 	/*
 	 * @cgrp is now fully operational.  If something fails after this
 	 * point, it'll be released via the normal destruction path.
 	 */
 	idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
 	err = cgroup_kn_set_ugid(kn);
 	if (err)
 		goto err_destroy;
 	err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
 	if (err)
 		goto err_destroy;
 	/* let's create and online css's */
 	for_each_subsys(ss, ssid) {
 		if (root->cgrp.subsys_mask & (1 << ssid)) {
 			err = create_css(cgrp, ss);
 			if (err)
 				goto err_destroy;
 		}
 	}
 	/* everything is in place; activate the new directory in kernfs */
 	kernfs_activate(kn);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgroup_tree_mutex);
 	return 0;
 	/* failures before the commit point: undo by hand and free @cgrp */
 err_free_id:
 	idr_remove(&root->cgroup_idr, cgrp->id);
 err_unlock:
 	mutex_unlock(&cgroup_mutex);
 err_unlock_tree:
 	mutex_unlock(&cgroup_tree_mutex);
 	kfree(cgrp);
 	return err;
 	/* failures after the commit point: go through regular destruction */
 err_destroy:
 	cgroup_destroy_locked(cgrp);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgroup_tree_mutex);
 	return err;
 }
 /*
  * kernfs ->mkdir() handler: create a child cgroup named @name, with mode
  * @mode, under the cgroup that owns @parent_kn.  Returns whatever
  * cgroup_create() returns.
  */
 static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 			umode_t mode)
 {
 	struct cgroup *parent_cgrp = parent_kn->priv;
 	int rc;
 	/*
 	 * cgroup_tree_mutex, which cgroup_create() takes, nests outside
 	 * the kernfs active_ref, so active protection has to be broken
 	 * around the call.  Removal is still handled correctly because
 	 * cgroup_create() synchronizes against it through
 	 * cgroup_lock_live_group().  Hold a reference on the parent for
 	 * the duration of the unprotected window.
 	 */
 	cgroup_get(parent_cgrp);
 	kernfs_break_active_protection(parent_kn);
 	rc = cgroup_create(parent_cgrp, name, mode);
 	kernfs_unbreak_active_protection(parent_kn);
 	cgroup_put(parent_cgrp);
 	return rc;
 }
 /*
  * This is called when the refcnt of a css is confirmed to be killed.
  * css_tryget() is now guaranteed to fail.
  *
  * Runs as the work item queued by css_killed_ref_fn().  Takes
  * cgroup_tree_mutex and cgroup_mutex for the offlining step and drops
  * both before putting the css reference.
  */
 static void css_killed_work_fn(struct work_struct *work)
 {
 	struct cgroup_subsys_state *css =
 		container_of(work, struct cgroup_subsys_state, destroy_work);
 	struct cgroup *cgrp = css->cgroup;
 	mutex_lock(&cgroup_tree_mutex);
 	mutex_lock(&cgroup_mutex);
 	/*
 	 * css_tryget() is guaranteed to fail now.  Tell subsystems to
 	 * initiate destruction.
 	 */
 	offline_css(css);
 	/*
 	 * If @cgrp is marked dead, it's waiting for refs of all css's to
 	 * be disabled before proceeding to the second phase of cgroup
 	 * destruction.  If we are the last one, kick it off.
 	 */
 	if (!cgrp->nr_css && cgroup_is_dead(cgrp))
 		cgroup_destroy_css_killed(cgrp);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgroup_tree_mutex);
 	/*
 	 * Put the css refs from kill_css().  Each css holds an extra
 	 * reference to the cgroup's dentry and cgroup removal proceeds
 	 * regardless of css refs.  On the last put of each css, whenever
 	 * that may be, the extra dentry ref is put so that dentry
 	 * destruction happens only after all css's are released.
 	 *
 	 * NOTE(review): the "dentry" wording looks like it predates the
 	 * kernfs conversion; the reference visibly taken per css in
 	 * create_css() is cgroup_get() on the cgroup - confirm and reword.
 	 */
 	css_put(css);
 }
 /*
  * Kill-confirmation callback passed to percpu_ref_kill_and_confirm() for
  * a css refcnt.  The actual processing requires process context, so all
  * this does is bounce to css_killed_work_fn() via cgroup_destroy_wq.
  */
 static void css_killed_ref_fn(struct percpu_ref *ref)
 {
 	struct cgroup_subsys_state *killed;
 	killed = container_of(ref, struct cgroup_subsys_state, refcnt);
 	INIT_WORK(&killed->destroy_work, css_killed_work_fn);
 	queue_work(cgroup_destroy_wq, &killed->destroy_work);
 }
 /*
  * Unconditional part of css destruction: remove @css's interface files,
  * take an extra css reference and start killing its percpu refcnt.
  * The caller must hold cgroup_tree_mutex.  kill_css() is the wrapper
  * that first checks and clears the subsystem's bit in ->subsys_mask.
  */
 static void __kill_css(struct cgroup_subsys_state *css)
 {
 	lockdep_assert_held(&cgroup_tree_mutex);
 	/*
 	 * This must happen before css is disassociated with its cgroup.
 	 * See seq_css() for details.
 	 */
 	cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
 	/*
 	 * Killing would put the base ref, but we need to keep it alive
 	 * until after ->css_offline().
 	 */
 	css_get(css);
 	/*
 	 * cgroup core guarantees that, by the time ->css_offline() is
 	 * invoked, no new css reference will be given out via
 	 * css_tryget().  We can't simply call percpu_ref_kill() and
 	 * proceed to offlining css's because percpu_ref_kill() doesn't
 	 * guarantee that the ref is seen as killed on all CPUs on return.
 	 *
 	 * Use percpu_ref_kill_and_confirm() to get notifications as each
 	 * css is confirmed to be seen as killed on all CPUs.
 	 */
 	percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
 }
 /**
  * kill_css - destroy a css
  * @css: css to destroy
  *
  * Starts the destruction of @css: its cgroup interface files are removed
  * and its base reference is put.  ->css_offline() is invoked
  * asynchronously once css_tryget() is guaranteed to fail, and @css is
  * released when its reference count drops to zero.  Calling this on a css
  * whose subsystem bit is already clear in the cgroup's ->subsys_mask does
  * nothing.  The caller must hold cgroup_tree_mutex.
  */
 static void kill_css(struct cgroup_subsys_state *css)
 {
 	struct cgroup *cgrp = css->cgroup;
 	const int ss_bit = 1 << css->ss->id;
 	lockdep_assert_held(&cgroup_tree_mutex);
 	/* bit already cleared means the css was killed before - nothing to do */
 	if (!(cgrp->subsys_mask & ss_bit))
 		return;
 	cgrp->subsys_mask &= ~ss_bit;
 	__kill_css(css);
 }
 /**
  * cgroup_destroy_locked - the first stage of cgroup destruction
  * @cgrp: cgroup to be destroyed
  *
  * css's make use of percpu refcnts whose killing latency shouldn't be
  * exposed to userland and are RCU protected.  Also, cgroup core needs to
  * guarantee that css_tryget() won't succeed by the time ->css_offline() is
  * invoked.  To satisfy all the requirements, destruction is implemented in
  * the following two steps.
  *
  * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
  *     userland visible parts and start killing the percpu refcnts of
  *     css's.  Set up so that the next stage will be kicked off once all
  *     the percpu refcnts are confirmed to be killed.
  *
  * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
  *     rest of destruction.  Once all cgroup references are gone, the
  *     cgroup is RCU-freed.
  *
  * This function implements s1.  After this step, @cgrp is gone as far as
  * the userland is concerned and a new cgroup with the same name may be
  * created.  As cgroup doesn't care about the names internally, this
  * doesn't cause any problem.
  *
  * Must be called with both cgroup_tree_mutex and cgroup_mutex held.
  * cgroup_mutex is temporarily dropped and reacquired inside; it is held
  * again on return.
  *
  * Returns 0 on success, -EBUSY if @cgrp still has css_set links or a
  * child which isn't dead yet.
  */
 static int cgroup_destroy_locked(struct cgroup *cgrp)
 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
 	struct cgroup *child;
 	struct cgroup_subsys_state *css;
 	bool empty;
 	int ssid;
 	lockdep_assert_held(&cgroup_tree_mutex);
 	lockdep_assert_held(&cgroup_mutex);
 	/*
 	 * css_set_rwsem synchronizes access to ->cset_links and prevents
 	 * @cgrp from being removed while put_css_set() is in progress.
 	 */
 	down_read(&css_set_rwsem);
 	empty = list_empty(&cgrp->cset_links);
 	up_read(&css_set_rwsem);
 	if (!empty)
 		return -EBUSY;
 	/*
 	 * Make sure there's no live children.  We can't test ->children
 	 * emptiness as dead children linger on it while being destroyed;
 	 * otherwise, "rmdir parent/child parent" may fail with -EBUSY.
 	 */
 	empty = true;
 	rcu_read_lock();
 	list_for_each_entry_rcu(child, &cgrp->children, sibling) {
 		empty = cgroup_is_dead(child);
 		if (!empty)
 			break;
 	}
 	rcu_read_unlock();
 	if (!empty)
 		return -EBUSY;
 	/*
 	 * Mark @cgrp dead.  This prevents further task migration and child
 	 * creation by disabling cgroup_lock_live_group().  Note that
 	 * CGRP_DEAD assertion is depended upon by css_next_child() to
 	 * resume iteration after dropping RCU read lock.  See
 	 * css_next_child() for details.
 	 */
 	set_bit(CGRP_DEAD, &cgrp->flags);
 	/*
 	 * Initiate massacre of all css's.  cgroup_destroy_css_killed()
 	 * will be invoked to perform the rest of destruction once the
 	 * percpu refs of all css's are confirmed to be killed.  This
 	 * involves removing the subsystem's files, drop cgroup_mutex.
 	 */
 	mutex_unlock(&cgroup_mutex);
 	for_each_css(css, ssid, cgrp)
 		kill_css(css);
 	mutex_lock(&cgroup_mutex);
 	/* CGRP_DEAD is set, remove from ->release_list for the last time */
 	raw_spin_lock(&release_list_lock);
 	if (!list_empty(&cgrp->release_list))
 		list_del_init(&cgrp->release_list);
 	raw_spin_unlock(&release_list_lock);
 	/*
 	 * If @cgrp has css's attached, the second stage of cgroup
 	 * destruction is kicked off from css_killed_work_fn() after the
 	 * refs of all attached css's are killed.  If @cgrp doesn't have
 	 * any css, we kick it off here.
 	 */
 	if (!cgrp->nr_css)
 		cgroup_destroy_css_killed(cgrp);
 	/* remove @cgrp directory along with the base files */
 	mutex_unlock(&cgroup_mutex);
 	/*
 	 * There are two control paths which try to determine cgroup from
 	 * dentry without going through kernfs - cgroupstats_build() and
 	 * css_tryget_from_dir().  Those are supported by RCU protecting
 	 * clearing of cgrp->kn->priv backpointer, which should happen
 	 * after all files under it have been removed.
 	 */
 	kernfs_remove(cgrp->kn);	/* @cgrp has an extra ref on its kn */
 	RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
 	mutex_lock(&cgroup_mutex);
 	return 0;
 }
 /**
  * cgroup_destroy_css_killed - the second step of cgroup destruction
  * @cgrp: the cgroup whose destruction is to be completed
  *
  * This function is invoked for a cgroup which is being destroyed once it
  * has no css left to wait for: either from css_killed_work_fn() after the
  * last css has been offlined, or directly from cgroup_destroy_locked()
  * when the cgroup has no css.  It performs the rest of destruction.  This
  * is the second step of destruction described in the comment above
  * cgroup_destroy_locked().
  *
  * Must be called with both cgroup_tree_mutex and cgroup_mutex held.
  */
 static void cgroup_destroy_css_killed(struct cgroup *cgrp)
 {
 	struct cgroup *parent = cgrp->parent;
 	lockdep_assert_held(&cgroup_tree_mutex);
 	lockdep_assert_held(&cgroup_mutex);
 	/* delete this cgroup from parent->children */
 	list_del_rcu(&cgrp->sibling);
 	cgroup_put(cgrp);
 	/* the parent lost a child; flag it and re-evaluate release notification */
 	set_bit(CGRP_RELEASABLE, &parent->flags);
 	check_for_release(parent);
 }
 /*
  * kernfs ->rmdir() handler: run the first stage of destruction on the
  * cgroup that owns @kn.  Returns 0 if the cgroup was already dead,
  * otherwise the result of cgroup_destroy_locked().
  */
 static int cgroup_rmdir(struct kernfs_node *kn)
 {
 	struct cgroup *cgrp = kn->priv;
 	int rc;
 	/*
 	 * The cgroup is removing its own directory, yet @kn can't go away
 	 * while this callback is active, so active protection has to be
 	 * broken.  From that point on @cgrp may be destroyed at any time;
 	 * hold a reference so that it remains accessible.
 	 */
 	cgroup_get(cgrp);
 	kernfs_break_active_protection(kn);
 	mutex_lock(&cgroup_tree_mutex);
 	mutex_lock(&cgroup_mutex);
 	/*
 	 * Someone else may have destroyed @cgrp while we were waiting for
 	 * the mutexes; that counts as success.
 	 */
 	rc = cgroup_is_dead(cgrp) ? 0 : cgroup_destroy_locked(cgrp);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgroup_tree_mutex);
 	kernfs_unbreak_active_protection(kn);
 	cgroup_put(cgrp);
 	return rc;
 }
 /*
  * kernfs syscall callbacks for cgroup hierarchies: remount and mount
  * option display, plus directory creation, removal and rename.
  */
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
 	.remount_fs		= cgroup_remount,
 	.show_options		= cgroup_show_options,
 	.mkdir			= cgroup_mkdir,
 	.rmdir			= cgroup_rmdir,
 	.rename			= cgroup_rename,
 };
 /*
  * Boot-time initialization of one subsystem: attach @ss to the default
  * root, allocate and online its root css, and publish that css in
  * init_css_set.  Any failure here is fatal (BUG_ON).
  */
 static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 {
 	struct cgroup_subsys_state *css;
 	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
 	mutex_lock(&cgroup_tree_mutex);
 	mutex_lock(&cgroup_mutex);
 	INIT_LIST_HEAD(&ss->cfts);
 	/* Create the root cgroup state for this subsystem */
 	ss->root = &cgrp_dfl_root;
 	css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
 	/* We don't handle early failures gracefully */
 	BUG_ON(IS_ERR(css));
 	init_css(css, ss, &cgrp_dfl_root.cgrp);
 	/*
 	 * Update the init_css_set to contain a subsys pointer to this
 	 * state - since the subsystem is newly registered, all tasks and
 	 * hence the init_css_set is in the subsystem's root cgroup.
 	 */
 	init_css_set.subsys[ss->id] = css;
 	/* remember whether any subsystem has a fork or exit callback */
 	need_forkexit_callback |= ss->fork || ss->exit;
 	/*
 	 * At system boot, before all subsystems have been registered, no
 	 * tasks have been forked, so we don't need to invoke fork
 	 * callbacks here.
 	 */
 	BUG_ON(!list_empty(&init_task.tasks));
 	BUG_ON(online_css(css));
 	/* mark @ss as enabled on the default root's cgroup */
 	cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id;
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgroup_tree_mutex);
 }
 /**
  * cgroup_init_early - cgroup initialization at system boot
  *
  * Initialize cgroups at system boot, and initialize any
  * subsystems that request early init.
  *
  * Also assigns each subsystem its id and name from its slot in the
  * subsystem table.  Always returns 0.
  */
 int __init cgroup_init_early(void)
 {
 	static struct cgroup_sb_opts __initdata opts =
 		{ .flags = CGRP_ROOT_SANE_BEHAVIOR };
 	struct cgroup_subsys *ss;
 	int i;
 	init_cgroup_root(&cgrp_dfl_root, &opts);
 	/* the init task starts out in init_css_set */
 	RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
 	for_each_subsys(ss, i) {
 		/*
 		 * A subsystem must provide css_alloc and css_free and must
 		 * not have ->name or ->id set yet; both are assigned below.
 		 *
 		 * NOTE(review): the format labels the last two values
 		 * "name:id" but they are printed as id (%d) then name (%s).
 		 */
 		WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
 		     "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
 		     i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
 		     ss->id, ss->name);
 		WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
 		     "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
 		ss->id = i;
 		ss->name = cgroup_subsys_name[i];
 		if (ss->early_init)
 			cgroup_init_subsys(ss);
 	}
 	return 0;
 }
 /**
  * cgroup_init - cgroup initialization
  *
  * Register cgroup filesystem and /proc file, and initialize
  * any subsystems that didn't request early init.
  *
  * Returns 0 on success, -ENOMEM if the "cgroup" kobject can't be created,
  * or the error from register_filesystem().
  */
 int __init cgroup_init(void)
 {
 	struct cgroup_subsys *ss;
 	unsigned long key;
 	int ssid, err;
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
 	mutex_lock(&cgroup_tree_mutex);
 	mutex_lock(&cgroup_mutex);
 	/* Add init_css_set to the hash table */
 	key = css_set_hash(init_css_set.subsys);
 	hash_add(css_set_table, &init_css_set.hlist, key);
 	/* failing to set up the default root is fatal */
 	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgroup_tree_mutex);
 	for_each_subsys(ss, ssid) {
 		/* early_init subsystems were handled in cgroup_init_early() */
 		if (!ss->early_init)
 			cgroup_init_subsys(ss);
 		/*
 		 * cftype registration needs kmalloc and can't be done
 		 * during early_init.  Register base cftypes separately.
 		 */
 		if (ss->base_cftypes)
 			WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
 	}
 	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
 	if (!cgroup_kobj)
 		return -ENOMEM;
 	err = register_filesystem(&cgroup_fs_type);
 	if (err < 0) {
 		kobject_put(cgroup_kobj);
 		return err;
 	}
 	/* the result of proc_create() is not checked */
 	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
 	return 0;
 }
 /*
  * Allocate the two cgroup workqueues, cgroup_destroy_wq and
  * cgroup_pidlist_destroy_wq.  Allocation failure is fatal (BUG_ON).
  * Registered with core_initcall(); always returns 0.
  */
 static int __init cgroup_wq_init(void)
 {
 	/*
 	 * There isn't much point in executing destruction path in
 	 * parallel.  Good chunk is serialized with cgroup_mutex anyway.
 	 * Use 1 for @max_active.
 	 *
 	 * We would prefer to do this in cgroup_init() above, but that
 	 * is called before init_workqueues(): so leave this until after.
 	 */
 	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
 	BUG_ON(!cgroup_destroy_wq);
 	/*
 	 * Used to destroy pidlists and separate to serve as flush domain.
 	 * Cap @max_active to 1 too.
 	 */
 	cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
 						    0, 1);
 	BUG_ON(!cgroup_pidlist_destroy_wq);
 	return 0;
 }
 core_initcall(cgroup_wq_init);
 /*
  * proc_cgroup_show()
  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
  *  - Used for /proc/<pid>/cgroup.
  *
  * Each line has the form "<hierarchy_id>:<subsystems>[,name=<name>]:<path>".
  * Returns 0 on success, -ENOMEM if the path buffer can't be allocated,
  * -ESRCH if the task can't be looked up from the pid, or -ENAMETOOLONG
  * if cgroup_path() fails.
  */
 /* TODO: Use a proper seq_file iterator */
 int proc_cgroup_show(struct seq_file *m, void *v)
 {
 	struct pid *pid;
 	struct task_struct *tsk;
 	char *buf, *path;
 	int retval;
 	struct cgroup_root *root;
 	/* retval is preset to the error reported if the following step fails */
 	retval = -ENOMEM;
 	buf = kmalloc(PATH_MAX, GFP_KERNEL);
 	if (!buf)
 		goto out;
 	retval = -ESRCH;
 	pid = m->private;
 	/* takes a task reference, dropped by put_task_struct() below */
 	tsk = get_pid_task(pid, PIDTYPE_PID);
 	if (!tsk)
 		goto out_free;
 	retval = 0;
 	mutex_lock(&cgroup_mutex);
 	down_read(&css_set_rwsem);
 	for_each_root(root) {
 		struct cgroup_subsys *ss;
 		struct cgroup *cgrp;
 		int ssid, count = 0;
 		/* skip the default hierarchy unless it has been made visible */
 		if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
 			continue;
 		seq_printf(m, "%d:", root->hierarchy_id);
 		/* comma-separated list of the subsystems bound to this root */
 		for_each_subsys(ss, ssid)
 			if (root->cgrp.subsys_mask & (1 << ssid))
 				seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
 		if (strlen(root->name))
 			seq_printf(m, "%sname=%s", count ? "," : "",
 				   root->name);
 		seq_putc(m, ':');
 		cgrp = task_cgroup_from_root(tsk, root);
 		path = cgroup_path(cgrp, buf, PATH_MAX);
 		if (!path) {
 			retval = -ENAMETOOLONG;
 			goto out_unlock;
 		}
 		seq_puts(m, path);
 		seq_putc(m, '\n');
 	}
 out_unlock:
 	up_read(&css_set_rwsem);
 	mutex_unlock(&cgroup_mutex);
 	put_task_struct(tsk);
 out_free:
 	kfree(buf);
 out:
 	return retval;
 }
 /* Display information about each subsystem and each hierarchy */
 static int proc_cgroupstats_show(struct seq_file *m, void *v)
 {
 	struct cgroup_subsys *ss;
 	int i;
 	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
 	/*
 	 * ideally we don't want subsystems moving around while we do this.
 	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
 	 * subsys/hierarchy state.
 	 */
 	mutex_lock(&cgroup_mutex);
 	for_each_subsys(ss, i)
 		seq_printf(m, "%s\t%d\t%d\t%d\n",
 			   ss->name, ss->root->hierarchy_id,
 			   atomic_read(&ss->root->nr_cgrps), !ss->disabled);
 	mutex_unlock(&cgroup_mutex);
 	return 0;
 }
 /*
  * ->open() for the "cgroups" proc file: set up a single-record seq_file
  * whose content is produced by proc_cgroupstats_show(); no private data
  * is passed.
  */
 static int cgroupstats_open(struct inode *inode, struct file *file)
 {
 	return single_open(file, proc_cgroupstats_show, NULL);
 }
 /*
  * File operations of the "cgroups" proc file registered in cgroup_init():
  * a single_open()-based seq_file, hence the seq_read/seq_lseek/
  * single_release trio.
  */
 static const struct file_operations proc_cgroupstats_operations = {
 	.open = cgroupstats_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
 	.release = single_release,
 };
 /**
  * cgroup_fork - initialize cgroup related fields during copy_process()
  * @child: pointer to task_struct of the newly forked child
  *
  * A task is associated with the init_css_set until cgroup_post_fork()
  * attaches it to the parent's css_set.  Empty cg_list indicates that
  * @child isn't holding reference to its css_set.
  */
 void cgroup_fork(struct task_struct *child)
 {
 	RCU_INIT_POINTER(child->cgroups, &init_css_set);
 	INIT_LIST_HEAD(&child->cg_list);
 }
 /**
  * cgroup_post_fork - called on a new task after adding it to the task list
  * @child: the task in question
  *
  * Adds the task to the list running through its css_set if necessary and
  * call the subsystem fork() callbacks.  Has to be after the task is
  * visible on the task list in case we race with the first call to
  * cgroup_task_iter_start() - to guarantee that the new task ends up on its
  * list.
  */
 void cgroup_post_fork(struct task_struct *child)
 {
 	struct cgroup_subsys *ss;
 	int i;
 	/*
 	 * This may race against cgroup_enable_task_cg_links().  As that
 	 * function sets use_task_css_set_links before grabbing
 	 * tasklist_lock and we just went through tasklist_lock to add
 	 * @child, it's guaranteed that either we see the set
 	 * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
 	 * @child during its iteration.
 	 *
 	 * If we won the race, @child is associated with %current's
 	 * css_set.  Grabbing css_set_rwsem guarantees both that the
 	 * association is stable, and, on completion of the parent's
 	 * migration, @child is visible in the source of migration or
 	 * already in the destination cgroup.  This guarantee is necessary
 	 * when implementing operations which need to migrate all tasks of
 	 * a cgroup to another.
 	 *
 	 * Note that if we lose to cgroup_enable_task_cg_links(), @child
 	 * will remain in init_css_set.  This is safe because all tasks are
 	 * in the init_css_set before cg_links is enabled and there's no
 	 * operation which transfers all tasks out of init_css_set.
 	 */
 	if (use_task_css_set_links) {
 		struct css_set *cset;
 		down_write(&css_set_rwsem);
 		cset = task_css_set(current);
 		if (list_empty(&child->cg_list)) {
 			rcu_assign_pointer(child->cgroups, cset);
 			list_add(&child->cg_list, &cset->tasks);
 			get_css_set(cset);
 		}
 		up_write(&css_set_rwsem);
 	}
 	/*
 	 * Call ss->fork().  This must happen after @child is linked on
 	 * css_set; otherwise, @child might change state between ->fork()
 	 * and addition to css_set.
 	 */
 	if (need_forkexit_callback) {
 		for_each_subsys(ss, i)
 			if (ss->fork)
 				ss->fork(child);
 	}
 }
 /**
  * cgroup_exit - detach cgroup from exiting task
  * @tsk: pointer to task_struct of exiting process
  *
  * Description: Detach cgroup from @tsk and release it.
  *
  * Note that cgroups marked notify_on_release force every task in
  * them to take the global cgroup_mutex mutex when exiting.
  * This could impact scaling on very large systems.  Be reluctant to
  * use notify_on_release cgroups where very high task exit scaling
  * is required on large systems.
  *
  * We set the exiting task's cgroup to the root cgroup (top_cgroup).  We
  * call cgroup_exit() while the task is still competent to handle
  * notify_on_release(), then leave the task attached to the root cgroup in
  * each hierarchy for the remainder of its exit.  No need to bother with
  * init_css_set refcnting.  init_css_set never goes away and we can't race
  * with migration path - PF_EXITING is visible to migration path.
  */
 void cgroup_exit(struct task_struct *tsk)
 {
 	struct cgroup_subsys *ss;
 	struct css_set *cset;
 	bool put_cset = false;
 	int i;
 	/*
 	 * Unlink @tsk from its css_set.  As migration path can't race
 	 * with us, we can check cg_list without grabbing css_set_rwsem.
 	 */
 	if (!list_empty(&tsk->cg_list)) {
 		down_write(&css_set_rwsem);
 		list_del_init(&tsk->cg_list);
 		up_write(&css_set_rwsem);
 		/* @tsk was linked, so the css_set is put at the end */
 		put_cset = true;
 	}
 	/* Reassign the task to the init_css_set. */
 	cset = task_css_set(tsk);
 	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
 	if (need_forkexit_callback) {
 		/* see cgroup_post_fork() for details */
 		for_each_subsys(ss, i) {
 			if (ss->exit) {
 				/*
 				 * old_css comes from the css_set @tsk just
 				 * left; css is looked up after @tsk was
 				 * switched to init_css_set above.
 				 */
 				struct cgroup_subsys_state *old_css = cset->subsys[i];
 				struct cgroup_subsys_state *css = task_css(tsk, i);
 				ss->exit(css, old_css, tsk);
 			}
 		}
 	}
 	if (put_cset)
 		put_css_set(cset, true);
 }
 static void check_for_release(struct cgroup *cgrp)
 {
 	if (cgroup_is_releasable(cgrp) &&
 	    list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
 		/*
 		 * Control Group is currently removeable. If it's not
 		 * already queued for a userspace notification, queue
 		 * it now
 		 */
 		int need_schedule_work = 0;
 		raw_spin_lock(&release_list_lock);
 		if (!cgroup_is_dead(cgrp) &&
 		    list_empty(&cgrp->release_list)) {
 			list_add(&cgrp->release_list, &release_list);
 			need_schedule_work = 1;
 		}
 		raw_spin_unlock(&release_list_lock);
 		if (need_schedule_work)
 			schedule_work(&release_agent_work);
 	}
 }
 /*
  * Notify userspace when a cgroup is released, by running the
  * configured release agent with the name of the cgroup (path
  * relative to the root of cgroup file system) as the argument.
  *
  * Most likely, this user command will try to rmdir this cgroup.
  *
  * This races with the possibility that some other task will be
  * attached to this cgroup before it is removed, or that some other
  * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
  * The presumed 'rmdir' will fail quietly if this cgroup is no longer
  * unused, and this cgroup will be reprieved from its death sentence,
  * to continue to serve a useful existence.  Next time it's released,
  * we will get notified again, if it still has 'notify_on_release' set.
  *
  * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
  * means only wait until the task is successfully execve()'d.  The
  * separate release agent task is forked by call_usermodehelper(),
  * then control in this thread returns here, without waiting for the
  * release agent task.  We don't bother to wait because the caller of
  * this routine has no use for the exit status of the release agent
  * task, so no sense holding our caller up for that.
  *
  * Work function of release_agent_work; drains release_list, handling
  * one cgroup per loop iteration.
  */
 static void cgroup_release_agent(struct work_struct *work)
 {
 	BUG_ON(work != &release_agent_work);
 	mutex_lock(&cgroup_mutex);
 	raw_spin_lock(&release_list_lock);
 	while (!list_empty(&release_list)) {
 		char *argv[3], *envp[3];
 		int i;
 		char *pathbuf = NULL, *agentbuf = NULL, *path;
 		struct cgroup *cgrp = list_entry(release_list.next,
 						    struct cgroup,
 						    release_list);
 		/* dequeue the entry, then drop the spinlock before allocating */
 		list_del_init(&cgrp->release_list);
 		raw_spin_unlock(&release_list_lock);
 		/* on any failure below, this cgroup is skipped silently */
 		pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
 		if (!pathbuf)
 			goto continue_free;
 		path = cgroup_path(cgrp, pathbuf, PATH_MAX);
 		if (!path)
 			goto continue_free;
 		/* private copy of the agent path for use as argv[0] */
 		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
 		if (!agentbuf)
 			goto continue_free;
 		/* argv: agent path, cgroup path */
 		i = 0;
 		argv[i++] = agentbuf;
 		argv[i++] = path;
 		argv[i] = NULL;
 		i = 0;
 		/* minimal command environment */
 		envp[i++] = "HOME=/";
 		envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
 		envp[i] = NULL;
 		/* Drop the lock while we invoke the usermode helper,
 		 * since the exec could involve hitting disk and hence
 		 * be a slow process */
 		mutex_unlock(&cgroup_mutex);
 		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
 		mutex_lock(&cgroup_mutex);
  continue_free:
 		/* both buffers start out NULL, so kfree() is safe on every path */
 		kfree(pathbuf);
 		kfree(agentbuf);
 		/* retake the spinlock for the next list_empty() check */
 		raw_spin_lock(&release_list_lock);
 	}
 	raw_spin_unlock(&release_list_lock);
 	mutex_unlock(&cgroup_mutex);
 }
 /*
  * Handler for the "cgroup_disable=" boot parameter.  @str is a
  * comma-separated list of subsystem names; every name that matches a
  * known subsystem gets that subsystem's ->disabled flag set.  Empty
  * tokens and unknown names are ignored.  Always returns 1.
  */
 static int __init cgroup_disable(char *str)
 {
 	struct cgroup_subsys *ss;
 	char *tok;
 	int ssid;
 	for (tok = strsep(&str, ","); tok != NULL; tok = strsep(&str, ",")) {
 		if (*tok == '\0')
 			continue;
 		for_each_subsys(ss, ssid) {
 			if (strcmp(tok, ss->name) != 0)
 				continue;
 			ss->disabled = 1;
 			printk(KERN_INFO "Disabling %s control group subsystem\n",
 			       ss->name);
 			break;
 		}
 	}
 	return 1;
 }
 __setup("cgroup_disable=", cgroup_disable);
 /**
  * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir
  * @dentry: directory dentry of interest
  * @ss: subsystem of interest
  *
  * When @dentry is the directory of a cgroup on which @ss is enabled, try
  * to pin the matching css and return it.  Returns ERR_PTR(-EBADF) if
  * @dentry is not a cgroup directory and ERR_PTR(-ENOENT) if there is no
  * such css or it can't be pinned.
  */
 struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
 						struct cgroup_subsys *ss)
 {
 	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
 	struct cgroup_subsys_state *css;
 	struct cgroup *cgrp;
 	/* reject anything that isn't a directory on a cgroup filesystem */
 	if (dentry->d_sb->s_type != &cgroup_fs_type)
 		return ERR_PTR(-EBADF);
 	if (!kn || kernfs_type(kn) != KERNFS_DIR)
 		return ERR_PTR(-EBADF);
 	rcu_read_lock();
 	/*
 	 * We did not get here through kernfs, so @kn may already have
 	 * been removed, or may be removed at any moment.  @kn->priv is
 	 * RCU protected for exactly this access; see
 	 * cgroup_destroy_locked() for details.
 	 */
 	cgrp = rcu_dereference(kn->priv);
 	css = cgrp ? cgroup_css(cgrp, ss) : NULL;
 	if (!css || !css_tryget(css))
 		css = ERR_PTR(-ENOENT);
 	rcu_read_unlock();
 	return css;
 }
 /**
  * css_from_id - lookup css by id
  * @id: the cgroup id
  * @ss: cgroup subsys to be looked into
  *
  * Looks @id up in the cgroup idr of the hierarchy @ss is attached to and
  * returns that cgroup's css for @ss, or NULL if no cgroup has that id.
  * Should be called under rcu_read_lock().
  */
 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
 {
 	struct cgroup *cgrp;
 	cgroup_assert_mutexes_or_rcu_locked();
 	cgrp = idr_find(&ss->root->cgroup_idr, id);
 	return cgrp ? cgroup_css(cgrp, ss) : NULL;
 }
 #ifdef CONFIG_CGROUP_DEBUG
 /*
  * ->css_alloc() of the debug subsystem: hand back a zeroed, bare css, or
  * ERR_PTR(-ENOMEM) if the allocation fails.  @parent_css is not used.
  */
 static struct cgroup_subsys_state *
 debug_css_alloc(struct cgroup_subsys_state *parent_css)
 {
 	struct cgroup_subsys_state *css;
 	css = kzalloc(sizeof(*css), GFP_KERNEL);
 	return css ? css : ERR_PTR(-ENOMEM);
 }
 /* ->css_free() of the debug subsystem: release what debug_css_alloc() allocated */
 static void debug_css_free(struct cgroup_subsys_state *css)
 {
 	kfree(css);
 }
 /* "taskcount" file: report cgroup_task_count() of the css's cgroup */
 static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
 				struct cftype *cft)
 {
 	return cgroup_task_count(css->cgroup);
 }
 /*
  * "current_css_set" file: report the value of the reading task's
  * ->cgroups pointer as an integer.  @css and @cft are not used.
  */
 static u64 current_css_set_read(struct cgroup_subsys_state *css,
 				struct cftype *cft)
 {
 	return (u64)(unsigned long)current->cgroups;
 }
 /*
  * "current_css_set_refcount" file: report the reference count of the
  * reading task's css_set, sampled under rcu_read_lock().  @css and @cft
  * are not used.
  */
 static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
 					 struct cftype *cft)
 {
 	u64 refs;
 	rcu_read_lock();
 	refs = atomic_read(&task_css_set(current)->refcount);
 	rcu_read_unlock();
 	return refs;
 }
 static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
 {
 	struct cgrp_cset_link *link;
 	struct css_set *cset;
 	char *name_buf;
 	name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
 	if (!name_buf)
 		return -ENOMEM;
 	down_read(&css_set_rwsem);
 	rcu_read_lock();
 	cset = rcu_dereference(current->cgroups);
 	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 		struct cgroup *c = link->cgrp;
 		cgroup_name(c, name_buf, NAME_MAX + 1);
 		seq_printf(seq, "Root %d group %s\n",
 			   c->root->hierarchy_id, name_buf);
 	}
 	rcu_read_unlock();
 	up_read(&css_set_rwsem);
 	kfree(name_buf);
 	return 0;
 }
 /* upper bound on the number of tasks listed per css_set */
 #define MAX_TASKS_SHOWN_PER_CSS 25
 /*
  * "cgroup_css_links" file: for every css_set linked to the css's cgroup,
  * print the css_set pointer followed by the pids of its tasks, first
  * those on ->tasks, then those on ->mg_tasks.  At most
  * MAX_TASKS_SHOWN_PER_CSS tasks are listed per css_set (the limit is
  * shared by both lists); if there are more, the listing is cut short
  * with "  ...".  Runs under css_set_rwsem held for reading.
  * Always returns 0.
  */
 static int cgroup_css_links_read(struct seq_file *seq, void *v)
 {
 	struct cgroup_subsys_state *css = seq_css(seq);
 	struct cgrp_cset_link *link;
 	down_read(&css_set_rwsem);
 	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
 		struct css_set *cset = link->cset;
 		struct task_struct *task;
 		int count = 0;
 		seq_printf(seq, "css_set %p\n", cset);
 		list_for_each_entry(task, &cset->tasks, cg_list) {
 			/* '>=' so that exactly the limit, not limit + 1, is shown */
 			if (count++ >= MAX_TASKS_SHOWN_PER_CSS)
 				goto overflow;
 			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
 		}
 		list_for_each_entry(task, &cset->mg_tasks, cg_list) {
 			if (count++ >= MAX_TASKS_SHOWN_PER_CSS)
 				goto overflow;
 			seq_printf(seq, "  task %d\n", task_pid_vnr(task));
 		}
 		continue;
 	overflow:
 		seq_puts(seq, "  ...\n");
 	}
 	up_read(&css_set_rwsem);
 	return 0;
 }
 /* "releasable" file: report whether CGRP_RELEASABLE is set on the css's cgroup */
 static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
 {
 	return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
 }
 /* interface files of the debug subsystem (its ->base_cftypes) */
 static struct cftype debug_files[] =  {
 	/* task count of the cgroup */
 	{
 		.name = "taskcount",
 		.read_u64 = debug_taskcount_read,
 	},
 	/* css_set pointer of the reading task */
 	{
 		.name = "current_css_set",
 		.read_u64 = current_css_set_read,
 	},
 	/* reference count of the reading task's css_set */
 	{
 		.name = "current_css_set_refcount",
 		.read_u64 = current_css_set_refcount_read,
 	},
 	/* cgroups linked to the reading task's css_set */
 	{
 		.name = "current_css_set_cg_links",
 		.seq_show = current_css_set_cg_links_read,
 	},
 	/* css_sets linked to the cgroup, with their tasks */
 	{
 		.name = "cgroup_css_links",
 		.seq_show = cgroup_css_links_read,
 	},
 	/* state of the cgroup's CGRP_RELEASABLE flag */
 	{
 		.name = "releasable",
 		.read_u64 = releasable_read,
 	},
 	{ }	/* terminate */
 };
 /*
  * The debug subsystem, built only with CONFIG_CGROUP_DEBUG: a css with
  * no state of its own plus the introspection files in debug_files.
  */
 struct cgroup_subsys debug_cgrp_subsys = {
 	.css_alloc = debug_css_alloc,
 	.css_free = debug_css_free,
 	.base_cftypes = debug_files,
 };
 #endif /* CONFIG_CGROUP_DEBUG */