Commit 6803c006282768ec850760766a6e4eb1a6ff87df

Authored by Tejun Heo
1 parent bd53d617b3

cgroup: add css_set->dfl_cgrp

To implement the unified hierarchy behavior, we'll need to be able to
determine the associated cgroup on the default hierarchy from css_set.
Let's add css_set->dfl_cgrp so that it can be accessed conveniently
and efficiently.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
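
For orientation, here is a minimal sketch of how the new field can be consulted once it is populated. The helper name task_dfl_cgroup() is illustrative and is not part of this patch; it only relies on task_css_set(), which already exists in include/linux/cgroup.h, and on the dfl_cgrp field added below.

/*
 * Illustrative sketch, not part of this commit: with css_set->dfl_cgrp in
 * place, the cgroup on the default hierarchy can be read straight from a
 * task's css_set instead of being derived from an individual css.
 */
static inline struct cgroup *task_dfl_cgroup(struct task_struct *task)
{
	/* the caller must satisfy task_css_set_check(), e.g. hold
	 * cgroup_mutex, css_set_rwsem or an RCU read lock */
	return task_css_set(task)->dfl_cgrp;
}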

Showing 2 changed files with 7 additions and 0 deletions

include/linux/cgroup.h
1 #ifndef _LINUX_CGROUP_H 1 #ifndef _LINUX_CGROUP_H
2 #define _LINUX_CGROUP_H 2 #define _LINUX_CGROUP_H
3 /* 3 /*
4 * cgroup interface 4 * cgroup interface
5 * 5 *
6 * Copyright (C) 2003 BULL SA 6 * Copyright (C) 2003 BULL SA
7 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 7 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
8 * 8 *
9 */ 9 */
10 10
11 #include <linux/sched.h> 11 #include <linux/sched.h>
12 #include <linux/cpumask.h> 12 #include <linux/cpumask.h>
13 #include <linux/nodemask.h> 13 #include <linux/nodemask.h>
14 #include <linux/rcupdate.h> 14 #include <linux/rcupdate.h>
15 #include <linux/rculist.h> 15 #include <linux/rculist.h>
16 #include <linux/cgroupstats.h> 16 #include <linux/cgroupstats.h>
17 #include <linux/rwsem.h> 17 #include <linux/rwsem.h>
18 #include <linux/idr.h> 18 #include <linux/idr.h>
19 #include <linux/workqueue.h> 19 #include <linux/workqueue.h>
20 #include <linux/fs.h> 20 #include <linux/fs.h>
21 #include <linux/percpu-refcount.h> 21 #include <linux/percpu-refcount.h>
22 #include <linux/seq_file.h> 22 #include <linux/seq_file.h>
23 #include <linux/kernfs.h> 23 #include <linux/kernfs.h>
24 24
25 #ifdef CONFIG_CGROUPS 25 #ifdef CONFIG_CGROUPS
26 26
27 struct cgroup_root; 27 struct cgroup_root;
28 struct cgroup_subsys; 28 struct cgroup_subsys;
29 struct inode; 29 struct inode;
30 struct cgroup; 30 struct cgroup;
31 31
32 extern int cgroup_init_early(void); 32 extern int cgroup_init_early(void);
33 extern int cgroup_init(void); 33 extern int cgroup_init(void);
34 extern void cgroup_fork(struct task_struct *p); 34 extern void cgroup_fork(struct task_struct *p);
35 extern void cgroup_post_fork(struct task_struct *p); 35 extern void cgroup_post_fork(struct task_struct *p);
36 extern void cgroup_exit(struct task_struct *p); 36 extern void cgroup_exit(struct task_struct *p);
37 extern int cgroupstats_build(struct cgroupstats *stats, 37 extern int cgroupstats_build(struct cgroupstats *stats,
38 struct dentry *dentry); 38 struct dentry *dentry);
39 39
40 extern int proc_cgroup_show(struct seq_file *, void *); 40 extern int proc_cgroup_show(struct seq_file *, void *);
41 41
42 /* define the enumeration of all cgroup subsystems */ 42 /* define the enumeration of all cgroup subsystems */
43 #define SUBSYS(_x) _x ## _cgrp_id, 43 #define SUBSYS(_x) _x ## _cgrp_id,
44 enum cgroup_subsys_id { 44 enum cgroup_subsys_id {
45 #include <linux/cgroup_subsys.h> 45 #include <linux/cgroup_subsys.h>
46 CGROUP_SUBSYS_COUNT, 46 CGROUP_SUBSYS_COUNT,
47 }; 47 };
48 #undef SUBSYS 48 #undef SUBSYS
49 49
50 /* Per-subsystem/per-cgroup state maintained by the system. */ 50 /* Per-subsystem/per-cgroup state maintained by the system. */
51 struct cgroup_subsys_state { 51 struct cgroup_subsys_state {
52 /* the cgroup that this css is attached to */ 52 /* the cgroup that this css is attached to */
53 struct cgroup *cgroup; 53 struct cgroup *cgroup;
54 54
55 /* the cgroup subsystem that this css is attached to */ 55 /* the cgroup subsystem that this css is attached to */
56 struct cgroup_subsys *ss; 56 struct cgroup_subsys *ss;
57 57
58 /* reference count - access via css_[try]get() and css_put() */ 58 /* reference count - access via css_[try]get() and css_put() */
59 struct percpu_ref refcnt; 59 struct percpu_ref refcnt;
60 60
61 /* the parent css */ 61 /* the parent css */
62 struct cgroup_subsys_state *parent; 62 struct cgroup_subsys_state *parent;
63 63
64 unsigned long flags; 64 unsigned long flags;
65 65
66 /* percpu_ref killing and RCU release */ 66 /* percpu_ref killing and RCU release */
67 struct rcu_head rcu_head; 67 struct rcu_head rcu_head;
68 struct work_struct destroy_work; 68 struct work_struct destroy_work;
69 }; 69 };
70 70
71 /* bits in struct cgroup_subsys_state flags field */ 71 /* bits in struct cgroup_subsys_state flags field */
72 enum { 72 enum {
73 CSS_ROOT = (1 << 0), /* this CSS is the root of the subsystem */ 73 CSS_ROOT = (1 << 0), /* this CSS is the root of the subsystem */
74 CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ 74 CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */
75 }; 75 };
76 76
77 /** 77 /**
78 * css_get - obtain a reference on the specified css 78 * css_get - obtain a reference on the specified css
79 * @css: target css 79 * @css: target css
80 * 80 *
81 * The caller must already have a reference. 81 * The caller must already have a reference.
82 */ 82 */
83 static inline void css_get(struct cgroup_subsys_state *css) 83 static inline void css_get(struct cgroup_subsys_state *css)
84 { 84 {
85 /* We don't need to reference count the root state */ 85 /* We don't need to reference count the root state */
86 if (!(css->flags & CSS_ROOT)) 86 if (!(css->flags & CSS_ROOT))
87 percpu_ref_get(&css->refcnt); 87 percpu_ref_get(&css->refcnt);
88 } 88 }
89 89
90 /** 90 /**
91 * css_tryget - try to obtain a reference on the specified css 91 * css_tryget - try to obtain a reference on the specified css
92 * @css: target css 92 * @css: target css
93 * 93 *
94 * Obtain a reference on @css if it's alive. The caller naturally needs to 94 * Obtain a reference on @css if it's alive. The caller naturally needs to
95 * ensure that @css is accessible but doesn't have to be holding a 95 * ensure that @css is accessible but doesn't have to be holding a
96 * reference on it - IOW, RCU protected access is good enough for this 96 * reference on it - IOW, RCU protected access is good enough for this
97 * function. Returns %true if a reference count was successfully obtained; 97 * function. Returns %true if a reference count was successfully obtained;
98 * %false otherwise. 98 * %false otherwise.
99 */ 99 */
100 static inline bool css_tryget(struct cgroup_subsys_state *css) 100 static inline bool css_tryget(struct cgroup_subsys_state *css)
101 { 101 {
102 if (css->flags & CSS_ROOT) 102 if (css->flags & CSS_ROOT)
103 return true; 103 return true;
104 return percpu_ref_tryget(&css->refcnt); 104 return percpu_ref_tryget(&css->refcnt);
105 } 105 }
106 106
107 /** 107 /**
108 * css_put - put a css reference 108 * css_put - put a css reference
109 * @css: target css 109 * @css: target css
110 * 110 *
111 * Put a reference obtained via css_get() and css_tryget(). 111 * Put a reference obtained via css_get() and css_tryget().
112 */ 112 */
113 static inline void css_put(struct cgroup_subsys_state *css) 113 static inline void css_put(struct cgroup_subsys_state *css)
114 { 114 {
115 if (!(css->flags & CSS_ROOT)) 115 if (!(css->flags & CSS_ROOT))
116 percpu_ref_put(&css->refcnt); 116 percpu_ref_put(&css->refcnt);
117 } 117 }
118 118
119 /* bits in struct cgroup flags field */ 119 /* bits in struct cgroup flags field */
120 enum { 120 enum {
121 /* Control Group is dead */ 121 /* Control Group is dead */
122 CGRP_DEAD, 122 CGRP_DEAD,
123 /* 123 /*
124 * Control Group has previously had a child cgroup or a task, 124 * Control Group has previously had a child cgroup or a task,
125 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) 125 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set)
126 */ 126 */
127 CGRP_RELEASABLE, 127 CGRP_RELEASABLE,
128 /* Control Group requires release notifications to userspace */ 128 /* Control Group requires release notifications to userspace */
129 CGRP_NOTIFY_ON_RELEASE, 129 CGRP_NOTIFY_ON_RELEASE,
130 /* 130 /*
131 * Clone the parent's configuration when creating a new child 131 * Clone the parent's configuration when creating a new child
132 * cpuset cgroup. For historical reasons, this option can be 132 * cpuset cgroup. For historical reasons, this option can be
133 * specified at mount time and thus is implemented here. 133 * specified at mount time and thus is implemented here.
134 */ 134 */
135 CGRP_CPUSET_CLONE_CHILDREN, 135 CGRP_CPUSET_CLONE_CHILDREN,
136 /* see the comment above CGRP_ROOT_SANE_BEHAVIOR for details */ 136 /* see the comment above CGRP_ROOT_SANE_BEHAVIOR for details */
137 CGRP_SANE_BEHAVIOR, 137 CGRP_SANE_BEHAVIOR,
138 }; 138 };
139 139
140 struct cgroup { 140 struct cgroup {
141 unsigned long flags; /* "unsigned long" so bitops work */ 141 unsigned long flags; /* "unsigned long" so bitops work */
142 142
143 /* 143 /*
144 * idr allocated in-hierarchy ID. 144 * idr allocated in-hierarchy ID.
145 * 145 *
146 * The ID of the root cgroup is always 0, and a new cgroup 146 * The ID of the root cgroup is always 0, and a new cgroup
147 * will be assigned the smallest available ID. 147 * will be assigned the smallest available ID.
148 * 148 *
149 * Allocating/Removing ID must be protected by cgroup_mutex. 149 * Allocating/Removing ID must be protected by cgroup_mutex.
150 */ 150 */
151 int id; 151 int id;
152 152
153 /* the number of attached css's */ 153 /* the number of attached css's */
154 int nr_css; 154 int nr_css;
155 155
156 atomic_t refcnt; 156 atomic_t refcnt;
157 157
158 /* 158 /*
159 * We link our 'sibling' struct into our parent's 'children'. 159 * We link our 'sibling' struct into our parent's 'children'.
160 * Our children link their 'sibling' into our 'children'. 160 * Our children link their 'sibling' into our 'children'.
161 */ 161 */
162 struct list_head sibling; /* my parent's children */ 162 struct list_head sibling; /* my parent's children */
163 struct list_head children; /* my children */ 163 struct list_head children; /* my children */
164 164
165 struct cgroup *parent; /* my parent */ 165 struct cgroup *parent; /* my parent */
166 struct kernfs_node *kn; /* cgroup kernfs entry */ 166 struct kernfs_node *kn; /* cgroup kernfs entry */
167 167
168 /* 168 /*
169 * Monotonically increasing unique serial number which defines a 169 * Monotonically increasing unique serial number which defines a
170 * uniform order among all cgroups. It's guaranteed that all 170 * uniform order among all cgroups. It's guaranteed that all
171 * ->children lists are in the ascending order of ->serial_nr. 171 * ->children lists are in the ascending order of ->serial_nr.
172 * It's used to allow interrupting and resuming iterations. 172 * It's used to allow interrupting and resuming iterations.
173 */ 173 */
174 u64 serial_nr; 174 u64 serial_nr;
175 175
176 /* the bitmask of subsystems enabled on the child cgroups */ 176 /* the bitmask of subsystems enabled on the child cgroups */
177 unsigned long child_subsys_mask; 177 unsigned long child_subsys_mask;
178 178
179 /* Private pointers for each registered subsystem */ 179 /* Private pointers for each registered subsystem */
180 struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; 180 struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
181 181
182 struct cgroup_root *root; 182 struct cgroup_root *root;
183 183
184 /* 184 /*
185 * List of cgrp_cset_links pointing at css_sets with tasks in this 185 * List of cgrp_cset_links pointing at css_sets with tasks in this
186 * cgroup. Protected by css_set_lock. 186 * cgroup. Protected by css_set_lock.
187 */ 187 */
188 struct list_head cset_links; 188 struct list_head cset_links;
189 189
190 /* 190 /*
191 * On the default hierarchy, a css_set for a cgroup with some 191 * On the default hierarchy, a css_set for a cgroup with some
192 * subsys disabled will point to css's which are associated with 192 * subsys disabled will point to css's which are associated with
193 * the closest ancestor which has the subsys enabled. The 193 * the closest ancestor which has the subsys enabled. The
194 * following lists all css_sets which point to this cgroup's css 194 * following lists all css_sets which point to this cgroup's css
195 * for the given subsystem. 195 * for the given subsystem.
196 */ 196 */
197 struct list_head e_csets[CGROUP_SUBSYS_COUNT]; 197 struct list_head e_csets[CGROUP_SUBSYS_COUNT];
198 198
199 /* 199 /*
200 * Linked list running through all cgroups that can 200 * Linked list running through all cgroups that can
201 * potentially be reaped by the release agent. Protected by 201 * potentially be reaped by the release agent. Protected by
202 * release_list_lock 202 * release_list_lock
203 */ 203 */
204 struct list_head release_list; 204 struct list_head release_list;
205 205
206 /* 206 /*
207 * list of pidlists, up to two for each namespace (one for procs, one 207 * list of pidlists, up to two for each namespace (one for procs, one
208 * for tasks); created on demand. 208 * for tasks); created on demand.
209 */ 209 */
210 struct list_head pidlists; 210 struct list_head pidlists;
211 struct mutex pidlist_mutex; 211 struct mutex pidlist_mutex;
212 212
213 /* dummy css with NULL ->ss, points back to this cgroup */ 213 /* dummy css with NULL ->ss, points back to this cgroup */
214 struct cgroup_subsys_state dummy_css; 214 struct cgroup_subsys_state dummy_css;
215 215
216 /* For css percpu_ref killing and RCU-protected deletion */ 216 /* For css percpu_ref killing and RCU-protected deletion */
217 struct rcu_head rcu_head; 217 struct rcu_head rcu_head;
218 struct work_struct destroy_work; 218 struct work_struct destroy_work;
219 }; 219 };
220 220
221 #define MAX_CGROUP_ROOT_NAMELEN 64 221 #define MAX_CGROUP_ROOT_NAMELEN 64
222 222
223 /* cgroup_root->flags */ 223 /* cgroup_root->flags */
224 enum { 224 enum {
225 /* 225 /*
226 * Unfortunately, cgroup core and various controllers are riddled 226 * Unfortunately, cgroup core and various controllers are riddled
227 * with idiosyncrasies and pointless options. The following flag, 227 * with idiosyncrasies and pointless options. The following flag,
228 * when set, will force sane behavior - some options are forced on, 228 * when set, will force sane behavior - some options are forced on,
229 * others are disallowed, and some controllers will change their 229 * others are disallowed, and some controllers will change their
230 * hierarchical or other behaviors. 230 * hierarchical or other behaviors.
231 * 231 *
232 * The set of behaviors affected by this flag are still being 232 * The set of behaviors affected by this flag are still being
233 * determined and developed and the mount option for this flag is 233 * determined and developed and the mount option for this flag is
234 * prefixed with __DEVEL__. The prefix will be dropped once we 234 * prefixed with __DEVEL__. The prefix will be dropped once we
235 * reach the point where all behaviors are compatible with the 235 * reach the point where all behaviors are compatible with the
236 * planned unified hierarchy, which will automatically turn on this 236 * planned unified hierarchy, which will automatically turn on this
237 * flag. 237 * flag.
238 * 238 *
239 * The following are the behaviors currently affected by this flag. 239 * The following are the behaviors currently affected by this flag.
240 * 240 *
241 * - Mount options "noprefix", "xattr", "clone_children", 241 * - Mount options "noprefix", "xattr", "clone_children",
242 * "release_agent" and "name" are disallowed. 242 * "release_agent" and "name" are disallowed.
243 * 243 *
244 * - When mounting an existing superblock, mount options should 244 * - When mounting an existing superblock, mount options should
245 * match. 245 * match.
246 * 246 *
247 * - Remount is disallowed. 247 * - Remount is disallowed.
248 * 248 *
249 * - rename(2) is disallowed. 249 * - rename(2) is disallowed.
250 * 250 *
251 * - "tasks" is removed. Everything should be at process 251 * - "tasks" is removed. Everything should be at process
252 * granularity. Use "cgroup.procs" instead. 252 * granularity. Use "cgroup.procs" instead.
253 * 253 *
254 * - "cgroup.procs" is not sorted. pids will be unique unless they 254 * - "cgroup.procs" is not sorted. pids will be unique unless they
255 * got recycled in between reads. 255 * got recycled in between reads.
256 * 256 *
257 * - "release_agent" and "notify_on_release" are removed. 257 * - "release_agent" and "notify_on_release" are removed.
258 * Replacement notification mechanism will be implemented. 258 * Replacement notification mechanism will be implemented.
259 * 259 *
260 * - "cgroup.clone_children" is removed. 260 * - "cgroup.clone_children" is removed.
261 * 261 *
262 * - If mount is requested with sane_behavior but without any 262 * - If mount is requested with sane_behavior but without any
263 * subsystem, the default unified hierarchy is mounted. 263 * subsystem, the default unified hierarchy is mounted.
264 * 264 *
265 * - cpuset: tasks will be kept in empty cpusets when hotplug happens 265 * - cpuset: tasks will be kept in empty cpusets when hotplug happens
266 * and take masks of ancestors with non-empty cpus/mems, instead of 266 * and take masks of ancestors with non-empty cpus/mems, instead of
267 * being moved to an ancestor. 267 * being moved to an ancestor.
268 * 268 *
269 * - cpuset: a task can be moved into an empty cpuset, and again it 269 * - cpuset: a task can be moved into an empty cpuset, and again it
270 * takes masks of ancestors. 270 * takes masks of ancestors.
271 * 271 *
272 * - memcg: use_hierarchy is on by default and the cgroup file for 272 * - memcg: use_hierarchy is on by default and the cgroup file for
273 * the flag is not created. 273 * the flag is not created.
274 * 274 *
275 * - blkcg: blk-throttle becomes properly hierarchical. 275 * - blkcg: blk-throttle becomes properly hierarchical.
276 */ 276 */
277 CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), 277 CGRP_ROOT_SANE_BEHAVIOR = (1 << 0),
278 278
279 CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ 279 CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */
280 CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ 280 CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */
281 281
282 /* mount options live below bit 16 */ 282 /* mount options live below bit 16 */
283 CGRP_ROOT_OPTION_MASK = (1 << 16) - 1, 283 CGRP_ROOT_OPTION_MASK = (1 << 16) - 1,
284 }; 284 };
285 285
286 /* 286 /*
287 * A cgroup_root represents the root of a cgroup hierarchy, and may be 287 * A cgroup_root represents the root of a cgroup hierarchy, and may be
288 * associated with a kernfs_root to form an active hierarchy. This is 288 * associated with a kernfs_root to form an active hierarchy. This is
289 * internal to cgroup core. Don't access directly from controllers. 289 * internal to cgroup core. Don't access directly from controllers.
290 */ 290 */
291 struct cgroup_root { 291 struct cgroup_root {
292 struct kernfs_root *kf_root; 292 struct kernfs_root *kf_root;
293 293
294 /* The bitmask of subsystems attached to this hierarchy */ 294 /* The bitmask of subsystems attached to this hierarchy */
295 unsigned long subsys_mask; 295 unsigned long subsys_mask;
296 296
297 /* Unique id for this hierarchy. */ 297 /* Unique id for this hierarchy. */
298 int hierarchy_id; 298 int hierarchy_id;
299 299
300 /* The root cgroup. Root is destroyed on its release. */ 300 /* The root cgroup. Root is destroyed on its release. */
301 struct cgroup cgrp; 301 struct cgroup cgrp;
302 302
303 /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ 303 /* Number of cgroups in the hierarchy, used only for /proc/cgroups */
304 atomic_t nr_cgrps; 304 atomic_t nr_cgrps;
305 305
306 /* A list running through the active hierarchies */ 306 /* A list running through the active hierarchies */
307 struct list_head root_list; 307 struct list_head root_list;
308 308
309 /* Hierarchy-specific flags */ 309 /* Hierarchy-specific flags */
310 unsigned long flags; 310 unsigned long flags;
311 311
312 /* IDs for cgroups in this hierarchy */ 312 /* IDs for cgroups in this hierarchy */
313 struct idr cgroup_idr; 313 struct idr cgroup_idr;
314 314
315 /* The path to use for release notifications. */ 315 /* The path to use for release notifications. */
316 char release_agent_path[PATH_MAX]; 316 char release_agent_path[PATH_MAX];
317 317
318 /* The name for this hierarchy - may be empty */ 318 /* The name for this hierarchy - may be empty */
319 char name[MAX_CGROUP_ROOT_NAMELEN]; 319 char name[MAX_CGROUP_ROOT_NAMELEN];
320 }; 320 };
321 321
322 /* 322 /*
323 * A css_set is a structure holding pointers to a set of 323 * A css_set is a structure holding pointers to a set of
324 * cgroup_subsys_state objects. This saves space in the task struct 324 * cgroup_subsys_state objects. This saves space in the task struct
325 * object and speeds up fork()/exit(), since a single inc/dec and a 325 * object and speeds up fork()/exit(), since a single inc/dec and a
326 * list_add()/del() can bump the reference count on the entire cgroup 326 * list_add()/del() can bump the reference count on the entire cgroup
327 * set for a task. 327 * set for a task.
328 */ 328 */
329 329
330 struct css_set { 330 struct css_set {
331 331
332 /* Reference count */ 332 /* Reference count */
333 atomic_t refcount; 333 atomic_t refcount;
334 334
335 /* 335 /*
336 * List running through all cgroup groups in the same hash 336 * List running through all cgroup groups in the same hash
337 * slot. Protected by css_set_lock 337 * slot. Protected by css_set_lock
338 */ 338 */
339 struct hlist_node hlist; 339 struct hlist_node hlist;
340 340
341 /* 341 /*
342 * Lists running through all tasks using this cgroup group. 342 * Lists running through all tasks using this cgroup group.
343 * mg_tasks lists tasks which belong to this cset but are in the 343 * mg_tasks lists tasks which belong to this cset but are in the
344 * process of being migrated out or in. Protected by 344 * process of being migrated out or in. Protected by
345 * css_set_rwsem, but, during migration, once tasks are moved to 345 * css_set_rwsem, but, during migration, once tasks are moved to
346 * mg_tasks, it can be read safely while holding cgroup_mutex. 346 * mg_tasks, it can be read safely while holding cgroup_mutex.
347 */ 347 */
348 struct list_head tasks; 348 struct list_head tasks;
349 struct list_head mg_tasks; 349 struct list_head mg_tasks;
350 350
351 /* 351 /*
352 * List of cgrp_cset_links pointing at cgroups referenced from this 352 * List of cgrp_cset_links pointing at cgroups referenced from this
353 * css_set. Protected by css_set_lock. 353 * css_set. Protected by css_set_lock.
354 */ 354 */
355 struct list_head cgrp_links; 355 struct list_head cgrp_links;
356 356
357 /* the default cgroup associated with this css_set */
358 struct cgroup *dfl_cgrp;
359
357 /* 360 /*
358 * Set of subsystem states, one for each subsystem. This array is 361 * Set of subsystem states, one for each subsystem. This array is
359 * immutable after creation apart from the init_css_set during 362 * immutable after creation apart from the init_css_set during
360 * subsystem registration (at boot time). 363 * subsystem registration (at boot time).
361 */ 364 */
362 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 365 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
363 366
364 /* 367 /*
365 * List of csets participating in the on-going migration either as 368 * List of csets participating in the on-going migration either as
366 * source or destination. Protected by cgroup_mutex. 369 * source or destination. Protected by cgroup_mutex.
367 */ 370 */
368 struct list_head mg_preload_node; 371 struct list_head mg_preload_node;
369 struct list_head mg_node; 372 struct list_head mg_node;
370 373
371 /* 374 /*
372 * If this cset is acting as the source of migration the following 375 * If this cset is acting as the source of migration the following
373 * two fields are set. mg_src_cgrp is the source cgroup of the 376 * two fields are set. mg_src_cgrp is the source cgroup of the
374 * on-going migration and mg_dst_cset is the destination cset the 377 * on-going migration and mg_dst_cset is the destination cset the
375 * target tasks on this cset should be migrated to. Protected by 378 * target tasks on this cset should be migrated to. Protected by
376 * cgroup_mutex. 379 * cgroup_mutex.
377 */ 380 */
378 struct cgroup *mg_src_cgrp; 381 struct cgroup *mg_src_cgrp;
379 struct css_set *mg_dst_cset; 382 struct css_set *mg_dst_cset;
380 383
381 /* 384 /*
382 * On the default hierarchy, ->subsys[ssid] may point to a css 385 * On the default hierarchy, ->subsys[ssid] may point to a css
383 * attached to an ancestor instead of the cgroup this css_set is 386 * attached to an ancestor instead of the cgroup this css_set is
384 * associated with. The following node is anchored at 387 * associated with. The following node is anchored at
385 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to 388 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
386 * iterate through all css's attached to a given cgroup. 389 * iterate through all css's attached to a given cgroup.
387 */ 390 */
388 struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; 391 struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
389 392
390 /* For RCU-protected deletion */ 393 /* For RCU-protected deletion */
391 struct rcu_head rcu_head; 394 struct rcu_head rcu_head;
392 }; 395 };
393 396
394 /* 397 /*
395 * struct cftype: handler definitions for cgroup control files 398 * struct cftype: handler definitions for cgroup control files
396 * 399 *
397 * When reading/writing to a file: 400 * When reading/writing to a file:
398 * - the cgroup to use is file->f_dentry->d_parent->d_fsdata 401 * - the cgroup to use is file->f_dentry->d_parent->d_fsdata
399 * - the 'cftype' of the file is file->f_dentry->d_fsdata 402 * - the 'cftype' of the file is file->f_dentry->d_fsdata
400 */ 403 */
401 404
402 /* cftype->flags */ 405 /* cftype->flags */
403 enum { 406 enum {
404 CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ 407 CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */
405 CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ 408 CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */
406 CFTYPE_INSANE = (1 << 2), /* don't create if sane_behavior */ 409 CFTYPE_INSANE = (1 << 2), /* don't create if sane_behavior */
407 CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ 410 CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */
408 CFTYPE_ONLY_ON_DFL = (1 << 4), /* only on default hierarchy */ 411 CFTYPE_ONLY_ON_DFL = (1 << 4), /* only on default hierarchy */
409 }; 412 };
410 413
411 #define MAX_CFTYPE_NAME 64 414 #define MAX_CFTYPE_NAME 64
412 415
413 struct cftype { 416 struct cftype {
414 /* 417 /*
415 * By convention, the name should begin with the name of the 418 * By convention, the name should begin with the name of the
416 * subsystem, followed by a period. Zero length string indicates 419 * subsystem, followed by a period. Zero length string indicates
417 * end of cftype array. 420 * end of cftype array.
418 */ 421 */
419 char name[MAX_CFTYPE_NAME]; 422 char name[MAX_CFTYPE_NAME];
420 int private; 423 int private;
421 /* 424 /*
422 * If not 0, file mode is set to this value, otherwise it will 425 * If not 0, file mode is set to this value, otherwise it will
423 * be figured out automatically 426 * be figured out automatically
424 */ 427 */
425 umode_t mode; 428 umode_t mode;
426 429
427 /* 430 /*
428 * The maximum length of string, excluding trailing nul, that can 431 * The maximum length of string, excluding trailing nul, that can
429 * be passed to write_string. If < PAGE_SIZE-1, PAGE_SIZE-1 is 432 * be passed to write_string. If < PAGE_SIZE-1, PAGE_SIZE-1 is
430 * assumed. 433 * assumed.
431 */ 434 */
432 size_t max_write_len; 435 size_t max_write_len;
433 436
434 /* CFTYPE_* flags */ 437 /* CFTYPE_* flags */
435 unsigned int flags; 438 unsigned int flags;
436 439
437 /* 440 /*
438 * Fields used for internal bookkeeping. Initialized automatically 441 * Fields used for internal bookkeeping. Initialized automatically
439 * during registration. 442 * during registration.
440 */ 443 */
441 struct cgroup_subsys *ss; /* NULL for cgroup core files */ 444 struct cgroup_subsys *ss; /* NULL for cgroup core files */
442 struct list_head node; /* anchored at ss->cfts */ 445 struct list_head node; /* anchored at ss->cfts */
443 struct kernfs_ops *kf_ops; 446 struct kernfs_ops *kf_ops;
444 447
445 /* 448 /*
446 * read_u64() is a shortcut for the common case of returning a 449 * read_u64() is a shortcut for the common case of returning a
447 * single integer. Use it in place of read() 450 * single integer. Use it in place of read()
448 */ 451 */
449 u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); 452 u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft);
450 /* 453 /*
451 * read_s64() is a signed version of read_u64() 454 * read_s64() is a signed version of read_u64()
452 */ 455 */
453 s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); 456 s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);
454 457
455 /* generic seq_file read interface */ 458 /* generic seq_file read interface */
456 int (*seq_show)(struct seq_file *sf, void *v); 459 int (*seq_show)(struct seq_file *sf, void *v);
457 460
458 /* optional ops, implement all or none */ 461 /* optional ops, implement all or none */
459 void *(*seq_start)(struct seq_file *sf, loff_t *ppos); 462 void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
460 void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); 463 void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
461 void (*seq_stop)(struct seq_file *sf, void *v); 464 void (*seq_stop)(struct seq_file *sf, void *v);
462 465
463 /* 466 /*
464 * write_u64() is a shortcut for the common case of accepting 467 * write_u64() is a shortcut for the common case of accepting
465 * a single integer (as parsed by simple_strtoull) from 468 * a single integer (as parsed by simple_strtoull) from
466 * userspace. Use in place of write(); return 0 or error. 469 * userspace. Use in place of write(); return 0 or error.
467 */ 470 */
468 int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, 471 int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft,
469 u64 val); 472 u64 val);
470 /* 473 /*
471 * write_s64() is a signed version of write_u64() 474 * write_s64() is a signed version of write_u64()
472 */ 475 */
473 int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, 476 int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft,
474 s64 val); 477 s64 val);
475 478
476 /* 479 /*
477 * write_string() is passed a nul-terminated kernelspace 480 * write_string() is passed a nul-terminated kernelspace
478 * buffer of maximum length determined by max_write_len. 481 * buffer of maximum length determined by max_write_len.
479 * Returns 0 or -ve error code. 482 * Returns 0 or -ve error code.
480 */ 483 */
481 int (*write_string)(struct cgroup_subsys_state *css, struct cftype *cft, 484 int (*write_string)(struct cgroup_subsys_state *css, struct cftype *cft,
482 char *buffer); 485 char *buffer);
483 /* 486 /*
484 * trigger() callback can be used to get some kick from the 487 * trigger() callback can be used to get some kick from the
485 * userspace, when the actual string written is not important 488 * userspace, when the actual string written is not important
486 * at all. The private field can be used to determine the 489 * at all. The private field can be used to determine the
487 * kick type for multiplexing. 490 * kick type for multiplexing.
488 */ 491 */
489 int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); 492 int (*trigger)(struct cgroup_subsys_state *css, unsigned int event);
490 493
491 #ifdef CONFIG_DEBUG_LOCK_ALLOC 494 #ifdef CONFIG_DEBUG_LOCK_ALLOC
492 struct lock_class_key lockdep_key; 495 struct lock_class_key lockdep_key;
493 #endif 496 #endif
494 }; 497 };
495 498
496 extern struct cgroup_root cgrp_dfl_root; 499 extern struct cgroup_root cgrp_dfl_root;
497 500
498 static inline bool cgroup_on_dfl(const struct cgroup *cgrp) 501 static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
499 { 502 {
500 return cgrp->root == &cgrp_dfl_root; 503 return cgrp->root == &cgrp_dfl_root;
501 } 504 }
502 505
503 /* 506 /*
504 * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This 507 * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This
505 * function can be called as long as @cgrp is accessible. 508 * function can be called as long as @cgrp is accessible.
506 */ 509 */
507 static inline bool cgroup_sane_behavior(const struct cgroup *cgrp) 510 static inline bool cgroup_sane_behavior(const struct cgroup *cgrp)
508 { 511 {
509 return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR; 512 return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR;
510 } 513 }
511 514
512 /* no synchronization, the result can only be used as a hint */ 515 /* no synchronization, the result can only be used as a hint */
513 static inline bool cgroup_has_tasks(struct cgroup *cgrp) 516 static inline bool cgroup_has_tasks(struct cgroup *cgrp)
514 { 517 {
515 return !list_empty(&cgrp->cset_links); 518 return !list_empty(&cgrp->cset_links);
516 } 519 }
517 520
518 /* returns ino associated with a cgroup, 0 indicates unmounted root */ 521 /* returns ino associated with a cgroup, 0 indicates unmounted root */
519 static inline ino_t cgroup_ino(struct cgroup *cgrp) 522 static inline ino_t cgroup_ino(struct cgroup *cgrp)
520 { 523 {
521 if (cgrp->kn) 524 if (cgrp->kn)
522 return cgrp->kn->ino; 525 return cgrp->kn->ino;
523 else 526 else
524 return 0; 527 return 0;
525 } 528 }
526 529
527 static inline struct cftype *seq_cft(struct seq_file *seq) 530 static inline struct cftype *seq_cft(struct seq_file *seq)
528 { 531 {
529 struct kernfs_open_file *of = seq->private; 532 struct kernfs_open_file *of = seq->private;
530 533
531 return of->kn->priv; 534 return of->kn->priv;
532 } 535 }
533 536
534 struct cgroup_subsys_state *seq_css(struct seq_file *seq); 537 struct cgroup_subsys_state *seq_css(struct seq_file *seq);
535 538
536 /* 539 /*
537 * Name / path handling functions. All are thin wrappers around the kernfs 540 * Name / path handling functions. All are thin wrappers around the kernfs
538 * counterparts and can be called under any context. 541 * counterparts and can be called under any context.
539 */ 542 */
540 543
541 static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) 544 static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
542 { 545 {
543 return kernfs_name(cgrp->kn, buf, buflen); 546 return kernfs_name(cgrp->kn, buf, buflen);
544 } 547 }
545 548
546 static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf, 549 static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
547 size_t buflen) 550 size_t buflen)
548 { 551 {
549 return kernfs_path(cgrp->kn, buf, buflen); 552 return kernfs_path(cgrp->kn, buf, buflen);
550 } 553 }
551 554
552 static inline void pr_cont_cgroup_name(struct cgroup *cgrp) 555 static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
553 { 556 {
554 pr_cont_kernfs_name(cgrp->kn); 557 pr_cont_kernfs_name(cgrp->kn);
555 } 558 }
556 559
557 static inline void pr_cont_cgroup_path(struct cgroup *cgrp) 560 static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
558 { 561 {
559 pr_cont_kernfs_path(cgrp->kn); 562 pr_cont_kernfs_path(cgrp->kn);
560 } 563 }
561 564
562 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); 565 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
563 566
564 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); 567 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
565 int cgroup_rm_cftypes(struct cftype *cfts); 568 int cgroup_rm_cftypes(struct cftype *cfts);
566 569
567 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); 570 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
568 571
569 /* 572 /*
570 * Control Group taskset, used to pass around set of tasks to cgroup_subsys 573 * Control Group taskset, used to pass around set of tasks to cgroup_subsys
571 * methods. 574 * methods.
572 */ 575 */
573 struct cgroup_taskset; 576 struct cgroup_taskset;
574 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); 577 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
575 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); 578 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
576 579
577 /** 580 /**
578 * cgroup_taskset_for_each - iterate cgroup_taskset 581 * cgroup_taskset_for_each - iterate cgroup_taskset
579 * @task: the loop cursor 582 * @task: the loop cursor
580 * @tset: taskset to iterate 583 * @tset: taskset to iterate
581 */ 584 */
582 #define cgroup_taskset_for_each(task, tset) \ 585 #define cgroup_taskset_for_each(task, tset) \
583 for ((task) = cgroup_taskset_first((tset)); (task); \ 586 for ((task) = cgroup_taskset_first((tset)); (task); \
584 (task) = cgroup_taskset_next((tset))) 587 (task) = cgroup_taskset_next((tset)))
585 588
586 /* 589 /*
587 * Control Group subsystem type. 590 * Control Group subsystem type.
588 * See Documentation/cgroups/cgroups.txt for details 591 * See Documentation/cgroups/cgroups.txt for details
589 */ 592 */
590 593
591 struct cgroup_subsys { 594 struct cgroup_subsys {
592 struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); 595 struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
593 int (*css_online)(struct cgroup_subsys_state *css); 596 int (*css_online)(struct cgroup_subsys_state *css);
594 void (*css_offline)(struct cgroup_subsys_state *css); 597 void (*css_offline)(struct cgroup_subsys_state *css);
595 void (*css_free)(struct cgroup_subsys_state *css); 598 void (*css_free)(struct cgroup_subsys_state *css);
596 599
597 int (*can_attach)(struct cgroup_subsys_state *css, 600 int (*can_attach)(struct cgroup_subsys_state *css,
598 struct cgroup_taskset *tset); 601 struct cgroup_taskset *tset);
599 void (*cancel_attach)(struct cgroup_subsys_state *css, 602 void (*cancel_attach)(struct cgroup_subsys_state *css,
600 struct cgroup_taskset *tset); 603 struct cgroup_taskset *tset);
601 void (*attach)(struct cgroup_subsys_state *css, 604 void (*attach)(struct cgroup_subsys_state *css,
602 struct cgroup_taskset *tset); 605 struct cgroup_taskset *tset);
603 void (*fork)(struct task_struct *task); 606 void (*fork)(struct task_struct *task);
604 void (*exit)(struct cgroup_subsys_state *css, 607 void (*exit)(struct cgroup_subsys_state *css,
605 struct cgroup_subsys_state *old_css, 608 struct cgroup_subsys_state *old_css,
606 struct task_struct *task); 609 struct task_struct *task);
607 void (*bind)(struct cgroup_subsys_state *root_css); 610 void (*bind)(struct cgroup_subsys_state *root_css);
608 611
609 int disabled; 612 int disabled;
610 int early_init; 613 int early_init;
611 614
612 /* 615 /*
613 * If %false, this subsystem is properly hierarchical - 616 * If %false, this subsystem is properly hierarchical -
614 * configuration, resource accounting and restriction on a parent 617 * configuration, resource accounting and restriction on a parent
615 * cgroup cover those of its children. If %true, hierarchy support 618 * cgroup cover those of its children. If %true, hierarchy support
616 * is broken in some ways - some subsystems ignore hierarchy 619 * is broken in some ways - some subsystems ignore hierarchy
617 * completely while others are only implemented half-way. 620 * completely while others are only implemented half-way.
618 * 621 *
619 * It's now disallowed to create nested cgroups if the subsystem is 622 * It's now disallowed to create nested cgroups if the subsystem is
620 * broken and cgroup core will emit a warning message on such 623 * broken and cgroup core will emit a warning message on such
621 * cases. Eventually, all subsystems will be made properly 624 * cases. Eventually, all subsystems will be made properly
622 * hierarchical and this will go away. 625 * hierarchical and this will go away.
623 */ 626 */
624 bool broken_hierarchy; 627 bool broken_hierarchy;
625 bool warned_broken_hierarchy; 628 bool warned_broken_hierarchy;
626 629
627 /* the following two fields are initialized automatically during boot */ 630 /* the following two fields are initialized automatically during boot */
628 int id; 631 int id;
629 #define MAX_CGROUP_TYPE_NAMELEN 32 632 #define MAX_CGROUP_TYPE_NAMELEN 32
630 const char *name; 633 const char *name;
631 634
632 /* link to parent, protected by cgroup_lock() */ 635 /* link to parent, protected by cgroup_lock() */
633 struct cgroup_root *root; 636 struct cgroup_root *root;
634 637
635 /* 638 /*
636 * List of cftypes. Each entry is the first entry of an array 639 * List of cftypes. Each entry is the first entry of an array
637 * terminated by zero length name. 640 * terminated by zero length name.
638 */ 641 */
639 struct list_head cfts; 642 struct list_head cfts;
640 643
641 /* base cftypes, automatically registered with subsys itself */ 644 /* base cftypes, automatically registered with subsys itself */
642 struct cftype *base_cftypes; 645 struct cftype *base_cftypes;
643 }; 646 };
644 647
645 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; 648 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
646 #include <linux/cgroup_subsys.h> 649 #include <linux/cgroup_subsys.h>
647 #undef SUBSYS 650 #undef SUBSYS
648 651
649 /** 652 /**
650 * css_parent - find the parent css 653 * css_parent - find the parent css
651 * @css: the target cgroup_subsys_state 654 * @css: the target cgroup_subsys_state
652 * 655 *
653 * Return the parent css of @css. This function is guaranteed to return 656 * Return the parent css of @css. This function is guaranteed to return
654 * non-NULL parent as long as @css isn't the root. 657 * non-NULL parent as long as @css isn't the root.
655 */ 658 */
656 static inline 659 static inline
657 struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css) 660 struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css)
658 { 661 {
659 return css->parent; 662 return css->parent;
660 } 663 }
661 664
662 /** 665 /**
663 * task_css_set_check - obtain a task's css_set with extra access conditions 666 * task_css_set_check - obtain a task's css_set with extra access conditions
664 * @task: the task to obtain css_set for 667 * @task: the task to obtain css_set for
665 * @__c: extra condition expression to be passed to rcu_dereference_check() 668 * @__c: extra condition expression to be passed to rcu_dereference_check()
666 * 669 *
667 * A task's css_set is RCU protected, initialized and exited while holding 670 * A task's css_set is RCU protected, initialized and exited while holding
668 * task_lock(), and can only be modified while holding both cgroup_mutex 671 * task_lock(), and can only be modified while holding both cgroup_mutex
669 * and task_lock() while the task is alive. This macro verifies that the 672 * and task_lock() while the task is alive. This macro verifies that the
670 * caller is inside proper critical section and returns @task's css_set. 673 * caller is inside proper critical section and returns @task's css_set.
671 * 674 *
672 * The caller can also specify additional allowed conditions via @__c, such 675 * The caller can also specify additional allowed conditions via @__c, such
673 * as locks used during the cgroup_subsys::attach() methods. 676 * as locks used during the cgroup_subsys::attach() methods.
674 */ 677 */
675 #ifdef CONFIG_PROVE_RCU 678 #ifdef CONFIG_PROVE_RCU
676 extern struct mutex cgroup_mutex; 679 extern struct mutex cgroup_mutex;
677 extern struct rw_semaphore css_set_rwsem; 680 extern struct rw_semaphore css_set_rwsem;
678 #define task_css_set_check(task, __c) \ 681 #define task_css_set_check(task, __c) \
679 rcu_dereference_check((task)->cgroups, \ 682 rcu_dereference_check((task)->cgroups, \
680 lockdep_is_held(&cgroup_mutex) || \ 683 lockdep_is_held(&cgroup_mutex) || \
681 lockdep_is_held(&css_set_rwsem) || \ 684 lockdep_is_held(&css_set_rwsem) || \
682 ((task)->flags & PF_EXITING) || (__c)) 685 ((task)->flags & PF_EXITING) || (__c))
683 #else 686 #else
684 #define task_css_set_check(task, __c) \ 687 #define task_css_set_check(task, __c) \
685 rcu_dereference((task)->cgroups) 688 rcu_dereference((task)->cgroups)
686 #endif 689 #endif
687 690
688 /** 691 /**
689 * task_css_check - obtain css for (task, subsys) w/ extra access conds 692 * task_css_check - obtain css for (task, subsys) w/ extra access conds
690 * @task: the target task 693 * @task: the target task
691 * @subsys_id: the target subsystem ID 694 * @subsys_id: the target subsystem ID
692 * @__c: extra condition expression to be passed to rcu_dereference_check() 695 * @__c: extra condition expression to be passed to rcu_dereference_check()
693 * 696 *
694 * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The 697 * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The
695 * synchronization rules are the same as task_css_set_check(). 698 * synchronization rules are the same as task_css_set_check().
696 */ 699 */
697 #define task_css_check(task, subsys_id, __c) \ 700 #define task_css_check(task, subsys_id, __c) \
698 task_css_set_check((task), (__c))->subsys[(subsys_id)] 701 task_css_set_check((task), (__c))->subsys[(subsys_id)]
699 702
700 /** 703 /**
701 * task_css_set - obtain a task's css_set 704 * task_css_set - obtain a task's css_set
702 * @task: the task to obtain css_set for 705 * @task: the task to obtain css_set for
703 * 706 *
704 * See task_css_set_check(). 707 * See task_css_set_check().
705 */ 708 */
706 static inline struct css_set *task_css_set(struct task_struct *task) 709 static inline struct css_set *task_css_set(struct task_struct *task)
707 { 710 {
708 return task_css_set_check(task, false); 711 return task_css_set_check(task, false);
709 } 712 }
710 713
711 /** 714 /**
712 * task_css - obtain css for (task, subsys) 715 * task_css - obtain css for (task, subsys)
713 * @task: the target task 716 * @task: the target task
714 * @subsys_id: the target subsystem ID 717 * @subsys_id: the target subsystem ID
715 * 718 *
716 * See task_css_check(). 719 * See task_css_check().
717 */ 720 */
718 static inline struct cgroup_subsys_state *task_css(struct task_struct *task, 721 static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
719 int subsys_id) 722 int subsys_id)
720 { 723 {
721 return task_css_check(task, subsys_id, false); 724 return task_css_check(task, subsys_id, false);
722 } 725 }
723 726
724 static inline struct cgroup *task_cgroup(struct task_struct *task, 727 static inline struct cgroup *task_cgroup(struct task_struct *task,
725 int subsys_id) 728 int subsys_id)
726 { 729 {
727 return task_css(task, subsys_id)->cgroup; 730 return task_css(task, subsys_id)->cgroup;
728 } 731 }
729 732
730 struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, 733 struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
731 struct cgroup_subsys_state *parent); 734 struct cgroup_subsys_state *parent);
732 735
733 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); 736 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
734 737
735 /** 738 /**
736 * css_for_each_child - iterate through children of a css 739 * css_for_each_child - iterate through children of a css
737 * @pos: the css * to use as the loop cursor 740 * @pos: the css * to use as the loop cursor
738 * @parent: css whose children to walk 741 * @parent: css whose children to walk
739 * 742 *
740 * Walk @parent's children. Must be called under rcu_read_lock(). A child 743 * Walk @parent's children. Must be called under rcu_read_lock(). A child
741 * css which hasn't finished ->css_online() or already has finished 744 * css which hasn't finished ->css_online() or already has finished
742 * ->css_offline() may show up during traversal and it's each subsystem's 745 * ->css_offline() may show up during traversal and it's each subsystem's
743 * responsibility to verify that each @pos is alive. 746 * responsibility to verify that each @pos is alive.
744 * 747 *
745 * If a subsystem synchronizes against the parent in its ->css_online() and 748 * If a subsystem synchronizes against the parent in its ->css_online() and
746 * before starting iterating, a css which finished ->css_online() is 749 * before starting iterating, a css which finished ->css_online() is
747 * guaranteed to be visible in the future iterations. 750 * guaranteed to be visible in the future iterations.
748 * 751 *
749 * It is allowed to temporarily drop RCU read lock during iteration. The 752 * It is allowed to temporarily drop RCU read lock during iteration. The
750 * caller is responsible for ensuring that @pos remains accessible until 753 * caller is responsible for ensuring that @pos remains accessible until
751 * the start of the next iteration by, for example, bumping the css refcnt. 754 * the start of the next iteration by, for example, bumping the css refcnt.
752 */ 755 */
753 #define css_for_each_child(pos, parent) \ 756 #define css_for_each_child(pos, parent) \
754 for ((pos) = css_next_child(NULL, (parent)); (pos); \ 757 for ((pos) = css_next_child(NULL, (parent)); (pos); \
755 (pos) = css_next_child((pos), (parent))) 758 (pos) = css_next_child((pos), (parent)))
756 759
757 struct cgroup_subsys_state * 760 struct cgroup_subsys_state *
758 css_next_descendant_pre(struct cgroup_subsys_state *pos, 761 css_next_descendant_pre(struct cgroup_subsys_state *pos,
759 struct cgroup_subsys_state *css); 762 struct cgroup_subsys_state *css);
760 763
761 struct cgroup_subsys_state * 764 struct cgroup_subsys_state *
762 css_rightmost_descendant(struct cgroup_subsys_state *pos); 765 css_rightmost_descendant(struct cgroup_subsys_state *pos);
763 766
764 /** 767 /**
765 * css_for_each_descendant_pre - pre-order walk of a css's descendants 768 * css_for_each_descendant_pre - pre-order walk of a css's descendants
766 * @pos: the css * to use as the loop cursor 769 * @pos: the css * to use as the loop cursor
767 * @root: css whose descendants to walk 770 * @root: css whose descendants to walk
768 * 771 *
769 * Walk @root's descendants. @root is included in the iteration and the 772 * Walk @root's descendants. @root is included in the iteration and the
770 * first node to be visited. Must be called under rcu_read_lock(). A 773 * first node to be visited. Must be called under rcu_read_lock(). A
771 * descendant css which hasn't finished ->css_online() or already has 774 * descendant css which hasn't finished ->css_online() or already has
772 * finished ->css_offline() may show up during traversal and it's each 775 * finished ->css_offline() may show up during traversal and it's each
773 * subsystem's responsibility to verify that each @pos is alive. 776 * subsystem's responsibility to verify that each @pos is alive.
774 * 777 *
775 * If a subsystem synchronizes against the parent in its ->css_online() and 778 * If a subsystem synchronizes against the parent in its ->css_online() and
776 * before starting iterating, and synchronizes against @pos on each 779 * before starting iterating, and synchronizes against @pos on each
777 * iteration, any descendant css which finished ->css_online() is 780 * iteration, any descendant css which finished ->css_online() is
778 * guaranteed to be visible in the future iterations. 781 * guaranteed to be visible in the future iterations.
779 * 782 *
780 * In other words, the following guarantees that a descendant can't escape 783 * In other words, the following guarantees that a descendant can't escape
781 * state updates of its ancestors. 784 * state updates of its ancestors.
782 * 785 *
783 * my_online(@css) 786 * my_online(@css)
784 * { 787 * {
785 * Lock @css's parent and @css; 788 * Lock @css's parent and @css;
786 * Inherit state from the parent; 789 * Inherit state from the parent;
787 * Unlock both. 790 * Unlock both.
788 * } 791 * }
789 * 792 *
790 * my_update_state(@css) 793 * my_update_state(@css)
791 * { 794 * {
792 * css_for_each_descendant_pre(@pos, @css) { 795 * css_for_each_descendant_pre(@pos, @css) {
793 * Lock @pos; 796 * Lock @pos;
794 * if (@pos == @css) 797 * if (@pos == @css)
795 * Update @css's state; 798 * Update @css's state;
796 * else 799 * else
797 * Verify @pos is alive and inherit state from its parent; 800 * Verify @pos is alive and inherit state from its parent;
798 * Unlock @pos; 801 * Unlock @pos;
799 * } 802 * }
800 * } 803 * }
801 * 804 *
802 * As long as the inheriting step, including checking the parent state, is 805 * As long as the inheriting step, including checking the parent state, is
803 * enclosed inside @pos locking, double-locking the parent isn't necessary 806 * enclosed inside @pos locking, double-locking the parent isn't necessary
804 * while inheriting. The state update to the parent is guaranteed to be 807 * while inheriting. The state update to the parent is guaranteed to be
805 * visible by walking order and, as long as inheriting operations to the 808 * visible by walking order and, as long as inheriting operations to the
806 * same @pos are atomic to each other, multiple updates racing each other 809 * same @pos are atomic to each other, multiple updates racing each other
807 * still result in the correct state. It's guaranteed that at least one 810 * still result in the correct state. It's guaranteed that at least one
808 * inheritance happens for any css after the latest update to its parent. 811 * inheritance happens for any css after the latest update to its parent.
809 * 812 *
810 * If checking parent's state requires locking the parent, each inheriting 813 * If checking parent's state requires locking the parent, each inheriting
811 * iteration should lock and unlock both @pos->parent and @pos. 814 * iteration should lock and unlock both @pos->parent and @pos.
812 * 815 *
813 * Alternatively, a subsystem may choose to use a single global lock to 816 * Alternatively, a subsystem may choose to use a single global lock to
814 * synchronize ->css_online() and ->css_offline() against tree-walking 817 * synchronize ->css_online() and ->css_offline() against tree-walking
815 * operations. 818 * operations.
816 * 819 *
817 * It is allowed to temporarily drop RCU read lock during iteration. The 820 * It is allowed to temporarily drop RCU read lock during iteration. The
818 * caller is responsible for ensuring that @pos remains accessible until 821 * caller is responsible for ensuring that @pos remains accessible until
819 * the start of the next iteration by, for example, bumping the css refcnt. 822 * the start of the next iteration by, for example, bumping the css refcnt.
820 */ 823 */
821 #define css_for_each_descendant_pre(pos, css) \ 824 #define css_for_each_descendant_pre(pos, css) \
822 for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ 825 for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \
823 (pos) = css_next_descendant_pre((pos), (css))) 826 (pos) = css_next_descendant_pre((pos), (css)))
824 827
825 struct cgroup_subsys_state * 828 struct cgroup_subsys_state *
826 css_next_descendant_post(struct cgroup_subsys_state *pos, 829 css_next_descendant_post(struct cgroup_subsys_state *pos,
827 struct cgroup_subsys_state *css); 830 struct cgroup_subsys_state *css);
828 831
829 /** 832 /**
830 * css_for_each_descendant_post - post-order walk of a css's descendants 833 * css_for_each_descendant_post - post-order walk of a css's descendants
831 * @pos: the css * to use as the loop cursor 834 * @pos: the css * to use as the loop cursor
832 * @css: css whose descendants to walk 835 * @css: css whose descendants to walk
833 * 836 *
834 * Similar to css_for_each_descendant_pre() but performs post-order 837 * Similar to css_for_each_descendant_pre() but performs post-order
835 * traversal instead. @root is included in the iteration and the last 838 * traversal instead. @root is included in the iteration and the last
836 * node to be visited. Note that the walk visibility guarantee described 839 * node to be visited. Note that the walk visibility guarantee described
837 * in pre-order walk doesn't apply the same to post-order walks. 840 * in pre-order walk doesn't apply the same to post-order walks.
838 */ 841 */
839 #define css_for_each_descendant_post(pos, css) \ 842 #define css_for_each_descendant_post(pos, css) \
840 for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ 843 for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \
841 (pos) = css_next_descendant_post((pos), (css))) 844 (pos) = css_next_descendant_post((pos), (css)))
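Conversely, a post-order walk visits children before their parents, which suits bottom-up teardown. A hedged sketch, with release_css_cache() standing in as a made-up per-css cleanup helper:

static void release_subtree_caches(struct cgroup_subsys_state *root_css)
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_post(pos, root_css)
		release_css_cache(pos);	/* hypothetical helper */
	rcu_read_unlock();
}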
842 845
843 /* A css_task_iter should be treated as an opaque object */ 846 /* A css_task_iter should be treated as an opaque object */
844 struct css_task_iter { 847 struct css_task_iter {
845 struct cgroup_subsys *ss; 848 struct cgroup_subsys *ss;
846 849
847 struct list_head *cset_pos; 850 struct list_head *cset_pos;
848 struct list_head *cset_head; 851 struct list_head *cset_head;
849 852
850 struct list_head *task_pos; 853 struct list_head *task_pos;
851 struct list_head *tasks_head; 854 struct list_head *tasks_head;
852 struct list_head *mg_tasks_head; 855 struct list_head *mg_tasks_head;
853 }; 856 };
854 857
855 void css_task_iter_start(struct cgroup_subsys_state *css, 858 void css_task_iter_start(struct cgroup_subsys_state *css,
856 struct css_task_iter *it); 859 struct css_task_iter *it);
857 struct task_struct *css_task_iter_next(struct css_task_iter *it); 860 struct task_struct *css_task_iter_next(struct css_task_iter *it);
858 void css_task_iter_end(struct css_task_iter *it); 861 void css_task_iter_end(struct css_task_iter *it);
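The three declarations above form a start/next/end protocol. A sketch (not from this commit) that counts the tasks attached to a css; the function name is invented here:

static int count_css_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;
	int count = 0;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		count++;
	css_task_iter_end(&it);

	return count;
}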
859 862
860 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); 863 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
861 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); 864 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
862 865
863 struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, 866 struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
864 struct cgroup_subsys *ss); 867 struct cgroup_subsys *ss);
865 868
866 #else /* !CONFIG_CGROUPS */ 869 #else /* !CONFIG_CGROUPS */
867 870
868 static inline int cgroup_init_early(void) { return 0; } 871 static inline int cgroup_init_early(void) { return 0; }
869 static inline int cgroup_init(void) { return 0; } 872 static inline int cgroup_init(void) { return 0; }
870 static inline void cgroup_fork(struct task_struct *p) {} 873 static inline void cgroup_fork(struct task_struct *p) {}
871 static inline void cgroup_post_fork(struct task_struct *p) {} 874 static inline void cgroup_post_fork(struct task_struct *p) {}
872 static inline void cgroup_exit(struct task_struct *p) {} 875 static inline void cgroup_exit(struct task_struct *p) {}
873 876
874 static inline int cgroupstats_build(struct cgroupstats *stats, 877 static inline int cgroupstats_build(struct cgroupstats *stats,
875 struct dentry *dentry) 878 struct dentry *dentry)
876 { 879 {
877 return -EINVAL; 880 return -EINVAL;
878 } 881 }
879 882
880 /* No cgroups - nothing to do */ 883 /* No cgroups - nothing to do */
881 static inline int cgroup_attach_task_all(struct task_struct *from, 884 static inline int cgroup_attach_task_all(struct task_struct *from,
882 struct task_struct *t) 885 struct task_struct *t)
883 { 886 {
884 return 0; 887 return 0;
885 } 888 }
886 889
887 #endif /* !CONFIG_CGROUPS */ 890 #endif /* !CONFIG_CGROUPS */
888 891
889 #endif /* _LINUX_CGROUP_H */ 892 #endif /* _LINUX_CGROUP_H */
890 893
1 /* 1 /*
2 * Generic process-grouping system. 2 * Generic process-grouping system.
3 * 3 *
4 * Based originally on the cpuset system, extracted by Paul Menage 4 * Based originally on the cpuset system, extracted by Paul Menage
5 * Copyright (C) 2006 Google, Inc 5 * Copyright (C) 2006 Google, Inc
6 * 6 *
7 * Notifications support 7 * Notifications support
8 * Copyright (C) 2009 Nokia Corporation 8 * Copyright (C) 2009 Nokia Corporation
9 * Author: Kirill A. Shutemov 9 * Author: Kirill A. Shutemov
10 * 10 *
11 * Copyright notices from the original cpuset code: 11 * Copyright notices from the original cpuset code:
12 * -------------------------------------------------- 12 * --------------------------------------------------
13 * Copyright (C) 2003 BULL SA. 13 * Copyright (C) 2003 BULL SA.
14 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 14 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
15 * 15 *
16 * Portions derived from Patrick Mochel's sysfs code. 16 * Portions derived from Patrick Mochel's sysfs code.
17 * sysfs is Copyright (c) 2001-3 Patrick Mochel 17 * sysfs is Copyright (c) 2001-3 Patrick Mochel
18 * 18 *
19 * 2003-10-10 Written by Simon Derr. 19 * 2003-10-10 Written by Simon Derr.
20 * 2003-10-22 Updates by Stephen Hemminger. 20 * 2003-10-22 Updates by Stephen Hemminger.
21 * 2004 May-July Rework by Paul Jackson. 21 * 2004 May-July Rework by Paul Jackson.
22 * --------------------------------------------------- 22 * ---------------------------------------------------
23 * 23 *
24 * This file is subject to the terms and conditions of the GNU General Public 24 * This file is subject to the terms and conditions of the GNU General Public
25 * License. See the file COPYING in the main directory of the Linux 25 * License. See the file COPYING in the main directory of the Linux
26 * distribution for more details. 26 * distribution for more details.
27 */ 27 */
28 28
29 #include <linux/cgroup.h> 29 #include <linux/cgroup.h>
30 #include <linux/cred.h> 30 #include <linux/cred.h>
31 #include <linux/ctype.h> 31 #include <linux/ctype.h>
32 #include <linux/errno.h> 32 #include <linux/errno.h>
33 #include <linux/init_task.h> 33 #include <linux/init_task.h>
34 #include <linux/kernel.h> 34 #include <linux/kernel.h>
35 #include <linux/list.h> 35 #include <linux/list.h>
36 #include <linux/mm.h> 36 #include <linux/mm.h>
37 #include <linux/mutex.h> 37 #include <linux/mutex.h>
38 #include <linux/mount.h> 38 #include <linux/mount.h>
39 #include <linux/pagemap.h> 39 #include <linux/pagemap.h>
40 #include <linux/proc_fs.h> 40 #include <linux/proc_fs.h>
41 #include <linux/rcupdate.h> 41 #include <linux/rcupdate.h>
42 #include <linux/sched.h> 42 #include <linux/sched.h>
43 #include <linux/slab.h> 43 #include <linux/slab.h>
44 #include <linux/spinlock.h> 44 #include <linux/spinlock.h>
45 #include <linux/rwsem.h> 45 #include <linux/rwsem.h>
46 #include <linux/string.h> 46 #include <linux/string.h>
47 #include <linux/sort.h> 47 #include <linux/sort.h>
48 #include <linux/kmod.h> 48 #include <linux/kmod.h>
49 #include <linux/delayacct.h> 49 #include <linux/delayacct.h>
50 #include <linux/cgroupstats.h> 50 #include <linux/cgroupstats.h>
51 #include <linux/hashtable.h> 51 #include <linux/hashtable.h>
52 #include <linux/pid_namespace.h> 52 #include <linux/pid_namespace.h>
53 #include <linux/idr.h> 53 #include <linux/idr.h>
54 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ 54 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
55 #include <linux/kthread.h> 55 #include <linux/kthread.h>
56 #include <linux/delay.h> 56 #include <linux/delay.h>
57 57
58 #include <linux/atomic.h> 58 #include <linux/atomic.h>
59 59
60 /* 60 /*
61 * pidlists linger the following amount before being destroyed. The goal 61 * pidlists linger the following amount before being destroyed. The goal
62 * is avoiding frequent destruction in the middle of consecutive read calls 62 * is avoiding frequent destruction in the middle of consecutive read calls
63 * Expiring in the middle is a performance problem not a correctness one. 63 * Expiring in the middle is a performance problem not a correctness one.
64 * 1 sec should be enough. 64 * 1 sec should be enough.
65 */ 65 */
66 #define CGROUP_PIDLIST_DESTROY_DELAY HZ 66 #define CGROUP_PIDLIST_DESTROY_DELAY HZ
67 67
68 #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ 68 #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
69 MAX_CFTYPE_NAME + 2) 69 MAX_CFTYPE_NAME + 2)
70 70
71 /* 71 /*
72 * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file 72 * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
73 * creation/removal and hierarchy changing operations including cgroup 73 * creation/removal and hierarchy changing operations including cgroup
74 * creation, removal, css association and controller rebinding. This outer 74 * creation, removal, css association and controller rebinding. This outer
75 * lock is needed mainly to resolve the circular dependency between kernfs 75 * lock is needed mainly to resolve the circular dependency between kernfs
76 * active ref and cgroup_mutex. cgroup_tree_mutex nests above both. 76 * active ref and cgroup_mutex. cgroup_tree_mutex nests above both.
77 */ 77 */
78 static DEFINE_MUTEX(cgroup_tree_mutex); 78 static DEFINE_MUTEX(cgroup_tree_mutex);
79 79
80 /* 80 /*
81 * cgroup_mutex is the master lock. Any modification to cgroup or its 81 * cgroup_mutex is the master lock. Any modification to cgroup or its
82 * hierarchy must be performed while holding it. 82 * hierarchy must be performed while holding it.
83 * 83 *
84 * css_set_rwsem protects task->cgroups pointer, the list of css_set 84 * css_set_rwsem protects task->cgroups pointer, the list of css_set
85 * objects, and the chain of tasks off each css_set. 85 * objects, and the chain of tasks off each css_set.
86 * 86 *
87 * These locks are exported if CONFIG_PROVE_RCU so that accessors in 87 * These locks are exported if CONFIG_PROVE_RCU so that accessors in
88 * cgroup.h can use them for lockdep annotations. 88 * cgroup.h can use them for lockdep annotations.
89 */ 89 */
90 #ifdef CONFIG_PROVE_RCU 90 #ifdef CONFIG_PROVE_RCU
91 DEFINE_MUTEX(cgroup_mutex); 91 DEFINE_MUTEX(cgroup_mutex);
92 DECLARE_RWSEM(css_set_rwsem); 92 DECLARE_RWSEM(css_set_rwsem);
93 EXPORT_SYMBOL_GPL(cgroup_mutex); 93 EXPORT_SYMBOL_GPL(cgroup_mutex);
94 EXPORT_SYMBOL_GPL(css_set_rwsem); 94 EXPORT_SYMBOL_GPL(css_set_rwsem);
95 #else 95 #else
96 static DEFINE_MUTEX(cgroup_mutex); 96 static DEFINE_MUTEX(cgroup_mutex);
97 static DECLARE_RWSEM(css_set_rwsem); 97 static DECLARE_RWSEM(css_set_rwsem);
98 #endif 98 #endif
99 99
100 /* 100 /*
101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires 101 * Protects cgroup_subsys->release_agent_path. Modifying it also requires
102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. 102 * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
103 */ 103 */
104 static DEFINE_SPINLOCK(release_agent_path_lock); 104 static DEFINE_SPINLOCK(release_agent_path_lock);
105 105
106 #define cgroup_assert_mutexes_or_rcu_locked() \ 106 #define cgroup_assert_mutexes_or_rcu_locked() \
107 rcu_lockdep_assert(rcu_read_lock_held() || \ 107 rcu_lockdep_assert(rcu_read_lock_held() || \
108 lockdep_is_held(&cgroup_tree_mutex) || \ 108 lockdep_is_held(&cgroup_tree_mutex) || \
109 lockdep_is_held(&cgroup_mutex), \ 109 lockdep_is_held(&cgroup_mutex), \
110 "cgroup_[tree_]mutex or RCU read lock required"); 110 "cgroup_[tree_]mutex or RCU read lock required");
111 111
112 /* 112 /*
113 * cgroup destruction makes heavy use of work items and there can be a lot 113 * cgroup destruction makes heavy use of work items and there can be a lot
114 * of concurrent destructions. Use a separate workqueue so that cgroup 114 * of concurrent destructions. Use a separate workqueue so that cgroup
115 * destruction work items don't end up filling up max_active of system_wq 115 * destruction work items don't end up filling up max_active of system_wq
116 * which may lead to deadlock. 116 * which may lead to deadlock.
117 */ 117 */
118 static struct workqueue_struct *cgroup_destroy_wq; 118 static struct workqueue_struct *cgroup_destroy_wq;
119 119
120 /* 120 /*
121 * pidlist destructions need to be flushed on cgroup destruction. Use a 121 * pidlist destructions need to be flushed on cgroup destruction. Use a
122 * separate workqueue as flush domain. 122 * separate workqueue as flush domain.
123 */ 123 */
124 static struct workqueue_struct *cgroup_pidlist_destroy_wq; 124 static struct workqueue_struct *cgroup_pidlist_destroy_wq;
125 125
126 /* generate an array of cgroup subsystem pointers */ 126 /* generate an array of cgroup subsystem pointers */
127 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, 127 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
128 static struct cgroup_subsys *cgroup_subsys[] = { 128 static struct cgroup_subsys *cgroup_subsys[] = {
129 #include <linux/cgroup_subsys.h> 129 #include <linux/cgroup_subsys.h>
130 }; 130 };
131 #undef SUBSYS 131 #undef SUBSYS
132 132
133 /* array of cgroup subsystem names */ 133 /* array of cgroup subsystem names */
134 #define SUBSYS(_x) [_x ## _cgrp_id] = #_x, 134 #define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
135 static const char *cgroup_subsys_name[] = { 135 static const char *cgroup_subsys_name[] = {
136 #include <linux/cgroup_subsys.h> 136 #include <linux/cgroup_subsys.h>
137 }; 137 };
138 #undef SUBSYS 138 #undef SUBSYS
139 139
140 /* 140 /*
141 * The default hierarchy, reserved for the subsystems that are otherwise 141 * The default hierarchy, reserved for the subsystems that are otherwise
142 * unattached - it never has more than a single cgroup, and all tasks are 142 * unattached - it never has more than a single cgroup, and all tasks are
143 * part of that cgroup. 143 * part of that cgroup.
144 */ 144 */
145 struct cgroup_root cgrp_dfl_root; 145 struct cgroup_root cgrp_dfl_root;
146 146
147 /* 147 /*
148 * The default hierarchy always exists but is hidden until mounted for the 148 * The default hierarchy always exists but is hidden until mounted for the
149 * first time. This is for backward compatibility. 149 * first time. This is for backward compatibility.
150 */ 150 */
151 static bool cgrp_dfl_root_visible; 151 static bool cgrp_dfl_root_visible;
152 152
153 /* The list of hierarchy roots */ 153 /* The list of hierarchy roots */
154 154
155 static LIST_HEAD(cgroup_roots); 155 static LIST_HEAD(cgroup_roots);
156 static int cgroup_root_count; 156 static int cgroup_root_count;
157 157
158 /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ 158 /* hierarchy ID allocation and mapping, protected by cgroup_mutex */
159 static DEFINE_IDR(cgroup_hierarchy_idr); 159 static DEFINE_IDR(cgroup_hierarchy_idr);
160 160
161 /* 161 /*
162 * Assign a monotonically increasing serial number to cgroups. It 162 * Assign a monotonically increasing serial number to cgroups. It
163 * guarantees cgroups with bigger numbers are newer than those with smaller 163 * guarantees cgroups with bigger numbers are newer than those with smaller
164 * numbers. Also, as cgroups are always appended to the parent's 164 * numbers. Also, as cgroups are always appended to the parent's
165 * ->children list, it guarantees that sibling cgroups are always sorted in 165 * ->children list, it guarantees that sibling cgroups are always sorted in
166 * the ascending serial number order on the list. Protected by 166 * the ascending serial number order on the list. Protected by
167 * cgroup_mutex. 167 * cgroup_mutex.
168 */ 168 */
169 static u64 cgroup_serial_nr_next = 1; 169 static u64 cgroup_serial_nr_next = 1;
170 170
171 /* This flag indicates whether tasks in the fork and exit paths should 171 /* This flag indicates whether tasks in the fork and exit paths should
172 * check for fork/exit handlers to call. This avoids us having to do 172 * check for fork/exit handlers to call. This avoids us having to do
173 * extra work in the fork/exit path if none of the subsystems need to 173 * extra work in the fork/exit path if none of the subsystems need to
174 * be called. 174 * be called.
175 */ 175 */
176 static int need_forkexit_callback __read_mostly; 176 static int need_forkexit_callback __read_mostly;
177 177
178 static struct cftype cgroup_base_files[]; 178 static struct cftype cgroup_base_files[];
179 179
180 static void cgroup_put(struct cgroup *cgrp); 180 static void cgroup_put(struct cgroup *cgrp);
181 static int rebind_subsystems(struct cgroup_root *dst_root, 181 static int rebind_subsystems(struct cgroup_root *dst_root,
182 unsigned long ss_mask); 182 unsigned long ss_mask);
183 static void cgroup_destroy_css_killed(struct cgroup *cgrp); 183 static void cgroup_destroy_css_killed(struct cgroup *cgrp);
184 static int cgroup_destroy_locked(struct cgroup *cgrp); 184 static int cgroup_destroy_locked(struct cgroup *cgrp);
185 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 185 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
186 bool is_add); 186 bool is_add);
187 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); 187 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
188 188
189 /** 189 /**
190 * cgroup_css - obtain a cgroup's css for the specified subsystem 190 * cgroup_css - obtain a cgroup's css for the specified subsystem
191 * @cgrp: the cgroup of interest 191 * @cgrp: the cgroup of interest
192 * @ss: the subsystem of interest (%NULL returns the dummy_css) 192 * @ss: the subsystem of interest (%NULL returns the dummy_css)
193 * 193 *
194 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This 194 * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
195 * function must be called either under cgroup_mutex or rcu_read_lock() and 195 * function must be called either under cgroup_mutex or rcu_read_lock() and
196 * the caller is responsible for pinning the returned css if it wants to 196 * the caller is responsible for pinning the returned css if it wants to
197 * keep accessing it outside the said locks. This function may return 197 * keep accessing it outside the said locks. This function may return
198 * %NULL if @cgrp doesn't have @ss enabled. 198 * %NULL if @cgrp doesn't have @ss enabled.
199 */ 199 */
200 static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, 200 static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
201 struct cgroup_subsys *ss) 201 struct cgroup_subsys *ss)
202 { 202 {
203 if (ss) 203 if (ss)
204 return rcu_dereference_check(cgrp->subsys[ss->id], 204 return rcu_dereference_check(cgrp->subsys[ss->id],
205 lockdep_is_held(&cgroup_tree_mutex) || 205 lockdep_is_held(&cgroup_tree_mutex) ||
206 lockdep_is_held(&cgroup_mutex)); 206 lockdep_is_held(&cgroup_mutex));
207 else 207 else
208 return &cgrp->dummy_css; 208 return &cgrp->dummy_css;
209 } 209 }
210 210
211 /** 211 /**
212 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem 212 * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
213 * @cgrp: the cgroup of interest 213 * @cgrp: the cgroup of interest
214 * @ss: the subsystem of interest (%NULL returns the dummy_css) 214 * @ss: the subsystem of interest (%NULL returns the dummy_css)
215 * 215 *
216 * Similar to cgroup_css() but returns the effective css, which is defined 216 * Similar to cgroup_css() but returns the effective css, which is defined
217 * as the matching css of the nearest ancestor including self which has @ss 217 * as the matching css of the nearest ancestor including self which has @ss
218 * enabled. If @ss is associated with the hierarchy @cgrp is on, this 218 * enabled. If @ss is associated with the hierarchy @cgrp is on, this
219 * function is guaranteed to return non-NULL css. 219 * function is guaranteed to return non-NULL css.
220 */ 220 */
221 static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, 221 static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
222 struct cgroup_subsys *ss) 222 struct cgroup_subsys *ss)
223 { 223 {
224 lockdep_assert_held(&cgroup_mutex); 224 lockdep_assert_held(&cgroup_mutex);
225 225
226 if (!ss) 226 if (!ss)
227 return &cgrp->dummy_css; 227 return &cgrp->dummy_css;
228 228
229 if (!(cgrp->root->subsys_mask & (1 << ss->id))) 229 if (!(cgrp->root->subsys_mask & (1 << ss->id)))
230 return NULL; 230 return NULL;
231 231
232 while (cgrp->parent && 232 while (cgrp->parent &&
233 !(cgrp->parent->child_subsys_mask & (1 << ss->id))) 233 !(cgrp->parent->child_subsys_mask & (1 << ss->id)))
234 cgrp = cgrp->parent; 234 cgrp = cgrp->parent;
235 235
236 return cgroup_css(cgrp, ss); 236 return cgroup_css(cgrp, ss);
237 } 237 }
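To make the distinction concrete, a small sketch (not in this diff): cgroup_css() is NULL when @ss isn't attached to @cgrp itself, while cgroup_e_css() falls back to the nearest ancestor with @ss enabled. The wrapper name is invented.

static bool __maybe_unused css_is_inherited(struct cgroup *cgrp,
					    struct cgroup_subsys *ss)
{
	lockdep_assert_held(&cgroup_mutex);

	/* no local css, but an ancestor provides the effective one */
	return !cgroup_css(cgrp, ss) && cgroup_e_css(cgrp, ss);
}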
238 238
239 /* convenient tests for these bits */ 239 /* convenient tests for these bits */
240 static inline bool cgroup_is_dead(const struct cgroup *cgrp) 240 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
241 { 241 {
242 return test_bit(CGRP_DEAD, &cgrp->flags); 242 return test_bit(CGRP_DEAD, &cgrp->flags);
243 } 243 }
244 244
245 struct cgroup_subsys_state *seq_css(struct seq_file *seq) 245 struct cgroup_subsys_state *seq_css(struct seq_file *seq)
246 { 246 {
247 struct kernfs_open_file *of = seq->private; 247 struct kernfs_open_file *of = seq->private;
248 struct cgroup *cgrp = of->kn->parent->priv; 248 struct cgroup *cgrp = of->kn->parent->priv;
249 struct cftype *cft = seq_cft(seq); 249 struct cftype *cft = seq_cft(seq);
250 250
251 /* 251 /*
252 * This is an open and unprotected implementation of cgroup_css(). 252 * This is an open and unprotected implementation of cgroup_css().
253 * seq_css() is only called from a kernfs file operation which has 253 * seq_css() is only called from a kernfs file operation which has
254 * an active reference on the file. Because all the subsystem 254 * an active reference on the file. Because all the subsystem
255 * files are drained before a css is disassociated with a cgroup, 255 * files are drained before a css is disassociated with a cgroup,
256 * the matching css from the cgroup's subsys table is guaranteed to 256 * the matching css from the cgroup's subsys table is guaranteed to
257 * be and stay valid until the enclosing operation is complete. 257 * be and stay valid until the enclosing operation is complete.
258 */ 258 */
259 if (cft->ss) 259 if (cft->ss)
260 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); 260 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
261 else 261 else
262 return &cgrp->dummy_css; 262 return &cgrp->dummy_css;
263 } 263 }
264 EXPORT_SYMBOL_GPL(seq_css); 264 EXPORT_SYMBOL_GPL(seq_css);
265 265
266 /** 266 /**
267 * cgroup_is_descendant - test ancestry 267 * cgroup_is_descendant - test ancestry
268 * @cgrp: the cgroup to be tested 268 * @cgrp: the cgroup to be tested
269 * @ancestor: possible ancestor of @cgrp 269 * @ancestor: possible ancestor of @cgrp
270 * 270 *
271 * Test whether @cgrp is a descendant of @ancestor. It also returns %true 271 * Test whether @cgrp is a descendant of @ancestor. It also returns %true
272 * if @cgrp == @ancestor. This function is safe to call as long as @cgrp 272 * if @cgrp == @ancestor. This function is safe to call as long as @cgrp
273 * and @ancestor are accessible. 273 * and @ancestor are accessible.
274 */ 274 */
275 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) 275 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
276 { 276 {
277 while (cgrp) { 277 while (cgrp) {
278 if (cgrp == ancestor) 278 if (cgrp == ancestor)
279 return true; 279 return true;
280 cgrp = cgrp->parent; 280 cgrp = cgrp->parent;
281 } 281 }
282 return false; 282 return false;
283 } 283 }
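A one-line wrapper sketch showing the typical call pattern, e.g. checking whether one css sits inside another's subtree; css_under() is a made-up name, not an existing kernel helper.

static bool __maybe_unused css_under(struct cgroup_subsys_state *css,
				     struct cgroup_subsys_state *root)
{
	return cgroup_is_descendant(css->cgroup, root->cgroup);
}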
284 284
285 static int cgroup_is_releasable(const struct cgroup *cgrp) 285 static int cgroup_is_releasable(const struct cgroup *cgrp)
286 { 286 {
287 const int bits = 287 const int bits =
288 (1 << CGRP_RELEASABLE) | 288 (1 << CGRP_RELEASABLE) |
289 (1 << CGRP_NOTIFY_ON_RELEASE); 289 (1 << CGRP_NOTIFY_ON_RELEASE);
290 return (cgrp->flags & bits) == bits; 290 return (cgrp->flags & bits) == bits;
291 } 291 }
292 292
293 static int notify_on_release(const struct cgroup *cgrp) 293 static int notify_on_release(const struct cgroup *cgrp)
294 { 294 {
295 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 295 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
296 } 296 }
297 297
298 /** 298 /**
299 * for_each_css - iterate all css's of a cgroup 299 * for_each_css - iterate all css's of a cgroup
300 * @css: the iteration cursor 300 * @css: the iteration cursor
301 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end 301 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
302 * @cgrp: the target cgroup to iterate css's of 302 * @cgrp: the target cgroup to iterate css's of
303 * 303 *
304 * Should be called under cgroup_[tree_]mutex. 304 * Should be called under cgroup_[tree_]mutex.
305 */ 305 */
306 #define for_each_css(css, ssid, cgrp) \ 306 #define for_each_css(css, ssid, cgrp) \
307 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 307 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
308 if (!((css) = rcu_dereference_check( \ 308 if (!((css) = rcu_dereference_check( \
309 (cgrp)->subsys[(ssid)], \ 309 (cgrp)->subsys[(ssid)], \
310 lockdep_is_held(&cgroup_tree_mutex) || \ 310 lockdep_is_held(&cgroup_tree_mutex) || \
311 lockdep_is_held(&cgroup_mutex)))) { } \ 311 lockdep_is_held(&cgroup_mutex)))) { } \
312 else 312 else
313 313
314 /** 314 /**
315 * for_each_e_css - iterate all effective css's of a cgroup 315 * for_each_e_css - iterate all effective css's of a cgroup
316 * @css: the iteration cursor 316 * @css: the iteration cursor
317 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end 317 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
318 * @cgrp: the target cgroup to iterate css's of 318 * @cgrp: the target cgroup to iterate css's of
319 * 319 *
320 * Should be called under cgroup_[tree_]mutex. 320 * Should be called under cgroup_[tree_]mutex.
321 */ 321 */
322 #define for_each_e_css(css, ssid, cgrp) \ 322 #define for_each_e_css(css, ssid, cgrp) \
323 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ 323 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
324 if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \ 324 if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
325 ; \ 325 ; \
326 else 326 else
327 327
328 /** 328 /**
329 * for_each_subsys - iterate all enabled cgroup subsystems 329 * for_each_subsys - iterate all enabled cgroup subsystems
330 * @ss: the iteration cursor 330 * @ss: the iteration cursor
331 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end 331 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
332 */ 332 */
333 #define for_each_subsys(ss, ssid) \ 333 #define for_each_subsys(ss, ssid) \
334 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ 334 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
335 (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) 335 (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
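A hedged debugging sketch using the iterator together with the cgroup_subsys_name[] table defined above; print_subsys_names() is not a real kernel function.

static void __maybe_unused print_subsys_names(void)
{
	struct cgroup_subsys *ss;
	int ssid;

	for_each_subsys(ss, ssid)
		pr_info("cgroup: subsys %s has id %d\n",
			cgroup_subsys_name[ssid], ssid);
}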
336 336
337 /* iterate across the hierarchies */ 337 /* iterate across the hierarchies */
338 #define for_each_root(root) \ 338 #define for_each_root(root) \
339 list_for_each_entry((root), &cgroup_roots, root_list) 339 list_for_each_entry((root), &cgroup_roots, root_list)
340 340
341 /** 341 /**
342 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 342 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
343 * @cgrp: the cgroup to be checked for liveness 343 * @cgrp: the cgroup to be checked for liveness
344 * 344 *
345 * On success, returns true; the mutex should be later unlocked. On 345 * On success, returns true; the mutex should be later unlocked. On
346 * failure returns false with no lock held. 346 * failure returns false with no lock held.
347 */ 347 */
348 static bool cgroup_lock_live_group(struct cgroup *cgrp) 348 static bool cgroup_lock_live_group(struct cgroup *cgrp)
349 { 349 {
350 mutex_lock(&cgroup_mutex); 350 mutex_lock(&cgroup_mutex);
351 if (cgroup_is_dead(cgrp)) { 351 if (cgroup_is_dead(cgrp)) {
352 mutex_unlock(&cgroup_mutex); 352 mutex_unlock(&cgroup_mutex);
353 return false; 353 return false;
354 } 354 }
355 return true; 355 return true;
356 } 356 }
357 357
358 /* the list of cgroups eligible for automatic release. Protected by 358 /* the list of cgroups eligible for automatic release. Protected by
359 * release_list_lock */ 359 * release_list_lock */
360 static LIST_HEAD(release_list); 360 static LIST_HEAD(release_list);
361 static DEFINE_RAW_SPINLOCK(release_list_lock); 361 static DEFINE_RAW_SPINLOCK(release_list_lock);
362 static void cgroup_release_agent(struct work_struct *work); 362 static void cgroup_release_agent(struct work_struct *work);
363 static DECLARE_WORK(release_agent_work, cgroup_release_agent); 363 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
364 static void check_for_release(struct cgroup *cgrp); 364 static void check_for_release(struct cgroup *cgrp);
365 365
366 /* 366 /*
367 * A cgroup can be associated with multiple css_sets as different tasks may 367 * A cgroup can be associated with multiple css_sets as different tasks may
368 * belong to different cgroups on different hierarchies. In the other 368 * belong to different cgroups on different hierarchies. In the other
369 * direction, a css_set is naturally associated with multiple cgroups. 369 * direction, a css_set is naturally associated with multiple cgroups.
370 * This M:N relationship is represented by the following link structure 370 * This M:N relationship is represented by the following link structure
371 * which exists for each association and allows traversing the associations 371 * which exists for each association and allows traversing the associations
372 * from both sides. 372 * from both sides.
373 */ 373 */
374 struct cgrp_cset_link { 374 struct cgrp_cset_link {
375 /* the cgroup and css_set this link associates */ 375 /* the cgroup and css_set this link associates */
376 struct cgroup *cgrp; 376 struct cgroup *cgrp;
377 struct css_set *cset; 377 struct css_set *cset;
378 378
379 /* list of cgrp_cset_links anchored at cgrp->cset_links */ 379 /* list of cgrp_cset_links anchored at cgrp->cset_links */
380 struct list_head cset_link; 380 struct list_head cset_link;
381 381
382 /* list of cgrp_cset_links anchored at css_set->cgrp_links */ 382 /* list of cgrp_cset_links anchored at css_set->cgrp_links */
383 struct list_head cgrp_link; 383 struct list_head cgrp_link;
384 }; 384 };
385 385
386 /* 386 /*
387 * The default css_set - used by init and its children prior to any 387 * The default css_set - used by init and its children prior to any
388 * hierarchies being mounted. It contains a pointer to the root state 388 * hierarchies being mounted. It contains a pointer to the root state
389 * for each subsystem. Also used to anchor the list of css_sets. Not 389 * for each subsystem. Also used to anchor the list of css_sets. Not
390 * reference-counted, to improve performance when child cgroups 390 * reference-counted, to improve performance when child cgroups
391 * haven't been created. 391 * haven't been created.
392 */ 392 */
393 static struct css_set init_css_set = { 393 static struct css_set init_css_set = {
394 .refcount = ATOMIC_INIT(1), 394 .refcount = ATOMIC_INIT(1),
395 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), 395 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
396 .tasks = LIST_HEAD_INIT(init_css_set.tasks), 396 .tasks = LIST_HEAD_INIT(init_css_set.tasks),
397 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), 397 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
398 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), 398 .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
399 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), 399 .mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
400 }; 400 };
401 401
402 static int css_set_count = 1; /* 1 for init_css_set */ 402 static int css_set_count = 1; /* 1 for init_css_set */
403 403
404 /* 404 /*
405 * hash table for css_sets. This improves the performance of finding 405 * hash table for css_sets. This improves the performance of finding
406 * an existing css_set. This hash doesn't (currently) take into 406 * an existing css_set. This hash doesn't (currently) take into
407 * account cgroups in empty hierarchies. 407 * account cgroups in empty hierarchies.
408 */ 408 */
409 #define CSS_SET_HASH_BITS 7 409 #define CSS_SET_HASH_BITS 7
410 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); 410 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
411 411
412 static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) 412 static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
413 { 413 {
414 unsigned long key = 0UL; 414 unsigned long key = 0UL;
415 struct cgroup_subsys *ss; 415 struct cgroup_subsys *ss;
416 int i; 416 int i;
417 417
418 for_each_subsys(ss, i) 418 for_each_subsys(ss, i)
419 key += (unsigned long)css[i]; 419 key += (unsigned long)css[i];
420 key = (key >> 16) ^ key; 420 key = (key >> 16) ^ key;
421 421
422 return key; 422 return key;
423 } 423 }
424 424
425 static void put_css_set_locked(struct css_set *cset, bool taskexit) 425 static void put_css_set_locked(struct css_set *cset, bool taskexit)
426 { 426 {
427 struct cgrp_cset_link *link, *tmp_link; 427 struct cgrp_cset_link *link, *tmp_link;
428 struct cgroup_subsys *ss; 428 struct cgroup_subsys *ss;
429 int ssid; 429 int ssid;
430 430
431 lockdep_assert_held(&css_set_rwsem); 431 lockdep_assert_held(&css_set_rwsem);
432 432
433 if (!atomic_dec_and_test(&cset->refcount)) 433 if (!atomic_dec_and_test(&cset->refcount))
434 return; 434 return;
435 435
436 /* This css_set is dead. unlink it and release cgroup refcounts */ 436 /* This css_set is dead. unlink it and release cgroup refcounts */
437 for_each_subsys(ss, ssid) 437 for_each_subsys(ss, ssid)
438 list_del(&cset->e_cset_node[ssid]); 438 list_del(&cset->e_cset_node[ssid]);
439 hash_del(&cset->hlist); 439 hash_del(&cset->hlist);
440 css_set_count--; 440 css_set_count--;
441 441
442 list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) { 442 list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
443 struct cgroup *cgrp = link->cgrp; 443 struct cgroup *cgrp = link->cgrp;
444 444
445 list_del(&link->cset_link); 445 list_del(&link->cset_link);
446 list_del(&link->cgrp_link); 446 list_del(&link->cgrp_link);
447 447
448 /* @cgrp can't go away while we're holding css_set_rwsem */ 448 /* @cgrp can't go away while we're holding css_set_rwsem */
449 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { 449 if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
450 if (taskexit) 450 if (taskexit)
451 set_bit(CGRP_RELEASABLE, &cgrp->flags); 451 set_bit(CGRP_RELEASABLE, &cgrp->flags);
452 check_for_release(cgrp); 452 check_for_release(cgrp);
453 } 453 }
454 454
455 kfree(link); 455 kfree(link);
456 } 456 }
457 457
458 kfree_rcu(cset, rcu_head); 458 kfree_rcu(cset, rcu_head);
459 } 459 }
460 460
461 static void put_css_set(struct css_set *cset, bool taskexit) 461 static void put_css_set(struct css_set *cset, bool taskexit)
462 { 462 {
463 /* 463 /*
464 * Ensure that the refcount doesn't hit zero while any readers 464 * Ensure that the refcount doesn't hit zero while any readers
465 * can see it. Similar to atomic_dec_and_lock(), but for an 465 * can see it. Similar to atomic_dec_and_lock(), but for an
466 * rwlock 466 * rwlock
467 */ 467 */
468 if (atomic_add_unless(&cset->refcount, -1, 1)) 468 if (atomic_add_unless(&cset->refcount, -1, 1))
469 return; 469 return;
470 470
471 down_write(&css_set_rwsem); 471 down_write(&css_set_rwsem);
472 put_css_set_locked(cset, taskexit); 472 put_css_set_locked(cset, taskexit);
473 up_write(&css_set_rwsem); 473 up_write(&css_set_rwsem);
474 } 474 }
475 475
476 /* 476 /*
477 * refcounted get/put for css_set objects 477 * refcounted get/put for css_set objects
478 */ 478 */
479 static inline void get_css_set(struct css_set *cset) 479 static inline void get_css_set(struct css_set *cset)
480 { 480 {
481 atomic_inc(&cset->refcount); 481 atomic_inc(&cset->refcount);
482 } 482 }
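A sketch of the get/put discipline around css_set_rwsem, assuming the task_css_set() accessor from cgroup.h; the wrapper name is invented and is not part of this commit.

static struct css_set *pin_task_css_set(struct task_struct *task)
{
	struct css_set *cset;

	down_read(&css_set_rwsem);
	cset = task_css_set(task);
	get_css_set(cset);
	up_read(&css_set_rwsem);

	/* caller eventually drops the reference with put_css_set(cset, false) */
	return cset;
}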
483 483
484 /** 484 /**
485 * compare_css_sets - helper function for find_existing_css_set(). 485 * compare_css_sets - helper function for find_existing_css_set().
486 * @cset: candidate css_set being tested 486 * @cset: candidate css_set being tested
487 * @old_cset: existing css_set for a task 487 * @old_cset: existing css_set for a task
488 * @new_cgrp: cgroup that's being entered by the task 488 * @new_cgrp: cgroup that's being entered by the task
489 * @template: desired set of css pointers in css_set (pre-calculated) 489 * @template: desired set of css pointers in css_set (pre-calculated)
490 * 490 *
491 * Returns true if "cset" matches "old_cset" except for the hierarchy 491 * Returns true if "cset" matches "old_cset" except for the hierarchy
492 * which "new_cgrp" belongs to, for which it should match "new_cgrp". 492 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
493 */ 493 */
494 static bool compare_css_sets(struct css_set *cset, 494 static bool compare_css_sets(struct css_set *cset,
495 struct css_set *old_cset, 495 struct css_set *old_cset,
496 struct cgroup *new_cgrp, 496 struct cgroup *new_cgrp,
497 struct cgroup_subsys_state *template[]) 497 struct cgroup_subsys_state *template[])
498 { 498 {
499 struct list_head *l1, *l2; 499 struct list_head *l1, *l2;
500 500
501 /* 501 /*
502 * On the default hierarchy, there can be csets which are 502 * On the default hierarchy, there can be csets which are
503 * associated with the same set of cgroups but different csses. 503 * associated with the same set of cgroups but different csses.
504 * Let's first ensure that csses match. 504 * Let's first ensure that csses match.
505 */ 505 */
506 if (memcmp(template, cset->subsys, sizeof(cset->subsys))) 506 if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
507 return false; 507 return false;
508 508
509 /* 509 /*
510 * Compare cgroup pointers in order to distinguish between 510 * Compare cgroup pointers in order to distinguish between
511 * different cgroups in hierarchies. As different cgroups may 511 * different cgroups in hierarchies. As different cgroups may
512 * share the same effective css, this comparison is always 512 * share the same effective css, this comparison is always
513 * necessary. 513 * necessary.
514 */ 514 */
515 l1 = &cset->cgrp_links; 515 l1 = &cset->cgrp_links;
516 l2 = &old_cset->cgrp_links; 516 l2 = &old_cset->cgrp_links;
517 while (1) { 517 while (1) {
518 struct cgrp_cset_link *link1, *link2; 518 struct cgrp_cset_link *link1, *link2;
519 struct cgroup *cgrp1, *cgrp2; 519 struct cgroup *cgrp1, *cgrp2;
520 520
521 l1 = l1->next; 521 l1 = l1->next;
522 l2 = l2->next; 522 l2 = l2->next;
523 /* See if we reached the end - both lists are equal length. */ 523 /* See if we reached the end - both lists are equal length. */
524 if (l1 == &cset->cgrp_links) { 524 if (l1 == &cset->cgrp_links) {
525 BUG_ON(l2 != &old_cset->cgrp_links); 525 BUG_ON(l2 != &old_cset->cgrp_links);
526 break; 526 break;
527 } else { 527 } else {
528 BUG_ON(l2 == &old_cset->cgrp_links); 528 BUG_ON(l2 == &old_cset->cgrp_links);
529 } 529 }
530 /* Locate the cgroups associated with these links. */ 530 /* Locate the cgroups associated with these links. */
531 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link); 531 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
532 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link); 532 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
533 cgrp1 = link1->cgrp; 533 cgrp1 = link1->cgrp;
534 cgrp2 = link2->cgrp; 534 cgrp2 = link2->cgrp;
535 /* Hierarchies should be linked in the same order. */ 535 /* Hierarchies should be linked in the same order. */
536 BUG_ON(cgrp1->root != cgrp2->root); 536 BUG_ON(cgrp1->root != cgrp2->root);
537 537
538 /* 538 /*
539 * If this hierarchy is the hierarchy of the cgroup 539 * If this hierarchy is the hierarchy of the cgroup
540 * that's changing, then we need to check that this 540 * that's changing, then we need to check that this
541 * css_set points to the new cgroup; if it's any other 541 * css_set points to the new cgroup; if it's any other
542 * hierarchy, then this css_set should point to the 542 * hierarchy, then this css_set should point to the
543 * same cgroup as the old css_set. 543 * same cgroup as the old css_set.
544 */ 544 */
545 if (cgrp1->root == new_cgrp->root) { 545 if (cgrp1->root == new_cgrp->root) {
546 if (cgrp1 != new_cgrp) 546 if (cgrp1 != new_cgrp)
547 return false; 547 return false;
548 } else { 548 } else {
549 if (cgrp1 != cgrp2) 549 if (cgrp1 != cgrp2)
550 return false; 550 return false;
551 } 551 }
552 } 552 }
553 return true; 553 return true;
554 } 554 }
555 555
556 /** 556 /**
557 * find_existing_css_set - init css array and find the matching css_set 557 * find_existing_css_set - init css array and find the matching css_set
558 * @old_cset: the css_set that we're using before the cgroup transition 558 * @old_cset: the css_set that we're using before the cgroup transition
559 * @cgrp: the cgroup that we're moving into 559 * @cgrp: the cgroup that we're moving into
560 * @template: out param for the new set of csses, should be clear on entry 560 * @template: out param for the new set of csses, should be clear on entry
561 */ 561 */
562 static struct css_set *find_existing_css_set(struct css_set *old_cset, 562 static struct css_set *find_existing_css_set(struct css_set *old_cset,
563 struct cgroup *cgrp, 563 struct cgroup *cgrp,
564 struct cgroup_subsys_state *template[]) 564 struct cgroup_subsys_state *template[])
565 { 565 {
566 struct cgroup_root *root = cgrp->root; 566 struct cgroup_root *root = cgrp->root;
567 struct cgroup_subsys *ss; 567 struct cgroup_subsys *ss;
568 struct css_set *cset; 568 struct css_set *cset;
569 unsigned long key; 569 unsigned long key;
570 int i; 570 int i;
571 571
572 /* 572 /*
573 * Build the set of subsystem state objects that we want to see in the 573 * Build the set of subsystem state objects that we want to see in the
574 * new css_set. While subsystems can change globally, the entries here 574 * new css_set. While subsystems can change globally, the entries here
575 * won't change, so no need for locking. 575 * won't change, so no need for locking.
576 */ 576 */
577 for_each_subsys(ss, i) { 577 for_each_subsys(ss, i) {
578 if (root->subsys_mask & (1UL << i)) { 578 if (root->subsys_mask & (1UL << i)) {
579 /* 579 /*
580 * @ss is in this hierarchy, so we want the 580 * @ss is in this hierarchy, so we want the
581 * effective css from @cgrp. 581 * effective css from @cgrp.
582 */ 582 */
583 template[i] = cgroup_e_css(cgrp, ss); 583 template[i] = cgroup_e_css(cgrp, ss);
584 } else { 584 } else {
585 /* 585 /*
586 * @ss is not in this hierarchy, so we don't want 586 * @ss is not in this hierarchy, so we don't want
587 * to change the css. 587 * to change the css.
588 */ 588 */
589 template[i] = old_cset->subsys[i]; 589 template[i] = old_cset->subsys[i];
590 } 590 }
591 } 591 }
592 592
593 key = css_set_hash(template); 593 key = css_set_hash(template);
594 hash_for_each_possible(css_set_table, cset, hlist, key) { 594 hash_for_each_possible(css_set_table, cset, hlist, key) {
595 if (!compare_css_sets(cset, old_cset, cgrp, template)) 595 if (!compare_css_sets(cset, old_cset, cgrp, template))
596 continue; 596 continue;
597 597
598 /* This css_set matches what we need */ 598 /* This css_set matches what we need */
599 return cset; 599 return cset;
600 } 600 }
601 601
602 /* No existing cgroup group matched */ 602 /* No existing cgroup group matched */
603 return NULL; 603 return NULL;
604 } 604 }
605 605
606 static void free_cgrp_cset_links(struct list_head *links_to_free) 606 static void free_cgrp_cset_links(struct list_head *links_to_free)
607 { 607 {
608 struct cgrp_cset_link *link, *tmp_link; 608 struct cgrp_cset_link *link, *tmp_link;
609 609
610 list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) { 610 list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
611 list_del(&link->cset_link); 611 list_del(&link->cset_link);
612 kfree(link); 612 kfree(link);
613 } 613 }
614 } 614 }
615 615
616 /** 616 /**
617 * allocate_cgrp_cset_links - allocate cgrp_cset_links 617 * allocate_cgrp_cset_links - allocate cgrp_cset_links
618 * @count: the number of links to allocate 618 * @count: the number of links to allocate
619 * @tmp_links: list_head the allocated links are put on 619 * @tmp_links: list_head the allocated links are put on
620 * 620 *
621 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links 621 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
622 * through ->cset_link. Returns 0 on success or -errno. 622 * through ->cset_link. Returns 0 on success or -errno.
623 */ 623 */
624 static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links) 624 static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
625 { 625 {
626 struct cgrp_cset_link *link; 626 struct cgrp_cset_link *link;
627 int i; 627 int i;
628 628
629 INIT_LIST_HEAD(tmp_links); 629 INIT_LIST_HEAD(tmp_links);
630 630
631 for (i = 0; i < count; i++) { 631 for (i = 0; i < count; i++) {
632 link = kzalloc(sizeof(*link), GFP_KERNEL); 632 link = kzalloc(sizeof(*link), GFP_KERNEL);
633 if (!link) { 633 if (!link) {
634 free_cgrp_cset_links(tmp_links); 634 free_cgrp_cset_links(tmp_links);
635 return -ENOMEM; 635 return -ENOMEM;
636 } 636 }
637 list_add(&link->cset_link, tmp_links); 637 list_add(&link->cset_link, tmp_links);
638 } 638 }
639 return 0; 639 return 0;
640 } 640 }
641 641
642 /** 642 /**
643 * link_css_set - a helper function to link a css_set to a cgroup 643 * link_css_set - a helper function to link a css_set to a cgroup
644 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links() 644 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
645 * @cset: the css_set to be linked 645 * @cset: the css_set to be linked
646 * @cgrp: the destination cgroup 646 * @cgrp: the destination cgroup
647 */ 647 */
648 static void link_css_set(struct list_head *tmp_links, struct css_set *cset, 648 static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
649 struct cgroup *cgrp) 649 struct cgroup *cgrp)
650 { 650 {
651 struct cgrp_cset_link *link; 651 struct cgrp_cset_link *link;
652 652
653 BUG_ON(list_empty(tmp_links)); 653 BUG_ON(list_empty(tmp_links));
654
655 if (cgroup_on_dfl(cgrp))
656 cset->dfl_cgrp = cgrp;
657
654 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link); 658 link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
655 link->cset = cset; 659 link->cset = cset;
656 link->cgrp = cgrp; 660 link->cgrp = cgrp;
657 list_move(&link->cset_link, &cgrp->cset_links); 661 list_move(&link->cset_link, &cgrp->cset_links);
658 /* 662 /*
659 * Always add links to the tail of the list so that the list 663 * Always add links to the tail of the list so that the list
660 * is sorted by order of hierarchy creation 664 * is sorted by order of hierarchy creation
661 */ 665 */
662 list_add_tail(&link->cgrp_link, &cset->cgrp_links); 666 list_add_tail(&link->cgrp_link, &cset->cgrp_links);
663 } 667 }
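The dfl_cgrp assignment added above is what the commit message describes: once every css_set caches its default-hierarchy cgroup, the lookup becomes a single dereference. A hypothetical accessor (not part of this commit) might look like the following, assuming task_css_set() from cgroup.h:

static struct cgroup *task_dfl_cgroup(struct task_struct *task)
{
	/* caller must hold css_set_rwsem so task->cgroups can't change */
	lockdep_assert_held(&css_set_rwsem);

	return task_css_set(task)->dfl_cgrp;
}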
664 668
665 /** 669 /**
666 * find_css_set - return a new css_set with one cgroup updated 670 * find_css_set - return a new css_set with one cgroup updated
667 * @old_cset: the baseline css_set 671 * @old_cset: the baseline css_set
668 * @cgrp: the cgroup to be updated 672 * @cgrp: the cgroup to be updated
669 * 673 *
670 * Return a new css_set that's equivalent to @old_cset, but with @cgrp 674 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
671 * substituted into the appropriate hierarchy. 675 * substituted into the appropriate hierarchy.
672 */ 676 */
673 static struct css_set *find_css_set(struct css_set *old_cset, 677 static struct css_set *find_css_set(struct css_set *old_cset,
674 struct cgroup *cgrp) 678 struct cgroup *cgrp)
675 { 679 {
676 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { }; 680 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
677 struct css_set *cset; 681 struct css_set *cset;
678 struct list_head tmp_links; 682 struct list_head tmp_links;
679 struct cgrp_cset_link *link; 683 struct cgrp_cset_link *link;
680 struct cgroup_subsys *ss; 684 struct cgroup_subsys *ss;
681 unsigned long key; 685 unsigned long key;
682 int ssid; 686 int ssid;
683 687
684 lockdep_assert_held(&cgroup_mutex); 688 lockdep_assert_held(&cgroup_mutex);
685 689
686 /* First see if we already have a cgroup group that matches 690 /* First see if we already have a cgroup group that matches
687 * the desired set */ 691 * the desired set */
688 down_read(&css_set_rwsem); 692 down_read(&css_set_rwsem);
689 cset = find_existing_css_set(old_cset, cgrp, template); 693 cset = find_existing_css_set(old_cset, cgrp, template);
690 if (cset) 694 if (cset)
691 get_css_set(cset); 695 get_css_set(cset);
692 up_read(&css_set_rwsem); 696 up_read(&css_set_rwsem);
693 697
694 if (cset) 698 if (cset)
695 return cset; 699 return cset;
696 700
697 cset = kzalloc(sizeof(*cset), GFP_KERNEL); 701 cset = kzalloc(sizeof(*cset), GFP_KERNEL);
698 if (!cset) 702 if (!cset)
699 return NULL; 703 return NULL;
700 704
701 /* Allocate all the cgrp_cset_link objects that we'll need */ 705 /* Allocate all the cgrp_cset_link objects that we'll need */
702 if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) { 706 if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
703 kfree(cset); 707 kfree(cset);
704 return NULL; 708 return NULL;
705 } 709 }
706 710
707 atomic_set(&cset->refcount, 1); 711 atomic_set(&cset->refcount, 1);
708 INIT_LIST_HEAD(&cset->cgrp_links); 712 INIT_LIST_HEAD(&cset->cgrp_links);
709 INIT_LIST_HEAD(&cset->tasks); 713 INIT_LIST_HEAD(&cset->tasks);
710 INIT_LIST_HEAD(&cset->mg_tasks); 714 INIT_LIST_HEAD(&cset->mg_tasks);
711 INIT_LIST_HEAD(&cset->mg_preload_node); 715 INIT_LIST_HEAD(&cset->mg_preload_node);
712 INIT_LIST_HEAD(&cset->mg_node); 716 INIT_LIST_HEAD(&cset->mg_node);
713 INIT_HLIST_NODE(&cset->hlist); 717 INIT_HLIST_NODE(&cset->hlist);
714 718
715 /* Copy the set of subsystem state objects generated in 719 /* Copy the set of subsystem state objects generated in
716 * find_existing_css_set() */ 720 * find_existing_css_set() */
717 memcpy(cset->subsys, template, sizeof(cset->subsys)); 721 memcpy(cset->subsys, template, sizeof(cset->subsys));
718 722
719 down_write(&css_set_rwsem); 723 down_write(&css_set_rwsem);
720 /* Add reference counts and links from the new css_set. */ 724 /* Add reference counts and links from the new css_set. */
721 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { 725 list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
722 struct cgroup *c = link->cgrp; 726 struct cgroup *c = link->cgrp;
723 727
724 if (c->root == cgrp->root) 728 if (c->root == cgrp->root)
725 c = cgrp; 729 c = cgrp;
726 link_css_set(&tmp_links, cset, c); 730 link_css_set(&tmp_links, cset, c);
727 } 731 }
728 732
729 BUG_ON(!list_empty(&tmp_links)); 733 BUG_ON(!list_empty(&tmp_links));
730 734
731 css_set_count++; 735 css_set_count++;
732 736
733 /* Add @cset to the hash table */ 737 /* Add @cset to the hash table */
734 key = css_set_hash(cset->subsys); 738 key = css_set_hash(cset->subsys);
735 hash_add(css_set_table, &cset->hlist, key); 739 hash_add(css_set_table, &cset->hlist, key);
736 740
737 for_each_subsys(ss, ssid) 741 for_each_subsys(ss, ssid)
738 list_add_tail(&cset->e_cset_node[ssid], 742 list_add_tail(&cset->e_cset_node[ssid],
739 &cset->subsys[ssid]->cgroup->e_csets[ssid]); 743 &cset->subsys[ssid]->cgroup->e_csets[ssid]);
740 744
741 up_write(&css_set_rwsem); 745 up_write(&css_set_rwsem);
742 746
743 return cset; 747 return cset;
744 } 748 }
745 749
746 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) 750 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
747 { 751 {
748 struct cgroup *root_cgrp = kf_root->kn->priv; 752 struct cgroup *root_cgrp = kf_root->kn->priv;
749 753
750 return root_cgrp->root; 754 return root_cgrp->root;
751 } 755 }
752 756
753 static int cgroup_init_root_id(struct cgroup_root *root) 757 static int cgroup_init_root_id(struct cgroup_root *root)
754 { 758 {
755 int id; 759 int id;
756 760
757 lockdep_assert_held(&cgroup_mutex); 761 lockdep_assert_held(&cgroup_mutex);
758 762
759 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL); 763 id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
760 if (id < 0) 764 if (id < 0)
761 return id; 765 return id;
762 766
763 root->hierarchy_id = id; 767 root->hierarchy_id = id;
764 return 0; 768 return 0;
765 } 769 }
766 770
767 static void cgroup_exit_root_id(struct cgroup_root *root) 771 static void cgroup_exit_root_id(struct cgroup_root *root)
768 { 772 {
769 lockdep_assert_held(&cgroup_mutex); 773 lockdep_assert_held(&cgroup_mutex);
770 774
771 if (root->hierarchy_id) { 775 if (root->hierarchy_id) {
772 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); 776 idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
773 root->hierarchy_id = 0; 777 root->hierarchy_id = 0;
774 } 778 }
775 } 779 }
776 780
777 static void cgroup_free_root(struct cgroup_root *root) 781 static void cgroup_free_root(struct cgroup_root *root)
778 { 782 {
779 if (root) { 783 if (root) {
780 /* hierarchy ID should already have been released */ 784 /* hierarchy ID should already have been released */
781 WARN_ON_ONCE(root->hierarchy_id); 785 WARN_ON_ONCE(root->hierarchy_id);
782 786
783 idr_destroy(&root->cgroup_idr); 787 idr_destroy(&root->cgroup_idr);
784 kfree(root); 788 kfree(root);
785 } 789 }
786 } 790 }
787 791
788 static void cgroup_destroy_root(struct cgroup_root *root) 792 static void cgroup_destroy_root(struct cgroup_root *root)
789 { 793 {
790 struct cgroup *cgrp = &root->cgrp; 794 struct cgroup *cgrp = &root->cgrp;
791 struct cgrp_cset_link *link, *tmp_link; 795 struct cgrp_cset_link *link, *tmp_link;
792 796
793 mutex_lock(&cgroup_tree_mutex); 797 mutex_lock(&cgroup_tree_mutex);
794 mutex_lock(&cgroup_mutex); 798 mutex_lock(&cgroup_mutex);
795 799
796 BUG_ON(atomic_read(&root->nr_cgrps)); 800 BUG_ON(atomic_read(&root->nr_cgrps));
797 BUG_ON(!list_empty(&cgrp->children)); 801 BUG_ON(!list_empty(&cgrp->children));
798 802
799 /* Rebind all subsystems back to the default hierarchy */ 803 /* Rebind all subsystems back to the default hierarchy */
800 rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); 804 rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
801 805
802 /* 806 /*
803 * Release all the links from cset_links to this hierarchy's 807 * Release all the links from cset_links to this hierarchy's
804 * root cgroup 808 * root cgroup
805 */ 809 */
806 down_write(&css_set_rwsem); 810 down_write(&css_set_rwsem);
807 811
808 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { 812 list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
809 list_del(&link->cset_link); 813 list_del(&link->cset_link);
810 list_del(&link->cgrp_link); 814 list_del(&link->cgrp_link);
811 kfree(link); 815 kfree(link);
812 } 816 }
813 up_write(&css_set_rwsem); 817 up_write(&css_set_rwsem);
814 818
815 if (!list_empty(&root->root_list)) { 819 if (!list_empty(&root->root_list)) {
816 list_del(&root->root_list); 820 list_del(&root->root_list);
817 cgroup_root_count--; 821 cgroup_root_count--;
818 } 822 }
819 823
820 cgroup_exit_root_id(root); 824 cgroup_exit_root_id(root);
821 825
822 mutex_unlock(&cgroup_mutex); 826 mutex_unlock(&cgroup_mutex);
823 mutex_unlock(&cgroup_tree_mutex); 827 mutex_unlock(&cgroup_tree_mutex);
824 828
825 kernfs_destroy_root(root->kf_root); 829 kernfs_destroy_root(root->kf_root);
826 cgroup_free_root(root); 830 cgroup_free_root(root);
827 } 831 }
828 832
829 /* look up cgroup associated with given css_set on the specified hierarchy */ 833 /* look up cgroup associated with given css_set on the specified hierarchy */
830 static struct cgroup *cset_cgroup_from_root(struct css_set *cset, 834 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
831 struct cgroup_root *root) 835 struct cgroup_root *root)
832 { 836 {
833 struct cgroup *res = NULL; 837 struct cgroup *res = NULL;
834 838
835 lockdep_assert_held(&cgroup_mutex); 839 lockdep_assert_held(&cgroup_mutex);
836 lockdep_assert_held(&css_set_rwsem); 840 lockdep_assert_held(&css_set_rwsem);
837 841
838 if (cset == &init_css_set) { 842 if (cset == &init_css_set) {
839 res = &root->cgrp; 843 res = &root->cgrp;
840 } else { 844 } else {
841 struct cgrp_cset_link *link; 845 struct cgrp_cset_link *link;
842 846
843 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { 847 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
844 struct cgroup *c = link->cgrp; 848 struct cgroup *c = link->cgrp;
845 849
846 if (c->root == root) { 850 if (c->root == root) {
847 res = c; 851 res = c;
848 break; 852 break;
849 } 853 }
850 } 854 }
851 } 855 }
852 856
853 BUG_ON(!res); 857 BUG_ON(!res);
854 return res; 858 return res;
855 } 859 }
856 860
857 /* 861 /*
858 * Return the cgroup for "task" from the given hierarchy. Must be 862 * Return the cgroup for "task" from the given hierarchy. Must be
859 * called with cgroup_mutex and css_set_rwsem held. 863 * called with cgroup_mutex and css_set_rwsem held.
860 */ 864 */
861 static struct cgroup *task_cgroup_from_root(struct task_struct *task, 865 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
862 struct cgroup_root *root) 866 struct cgroup_root *root)
863 { 867 {
864 /* 868 /*
865 * No need to lock the task - since we hold cgroup_mutex the 869 * No need to lock the task - since we hold cgroup_mutex the
866 * task can't change groups, so the only thing that can happen 870 * task can't change groups, so the only thing that can happen
867 * is that it exits and its css is set back to init_css_set. 871 * is that it exits and its css is set back to init_css_set.
868 */ 872 */
869 return cset_cgroup_from_root(task_css_set(task), root); 873 return cset_cgroup_from_root(task_css_set(task), root);
870 } 874 }
871 875
872 /* 876 /*
873 * A task must hold cgroup_mutex to modify cgroups. 877 * A task must hold cgroup_mutex to modify cgroups.
874 * 878 *
875 * Any task can increment and decrement the count field without lock. 879 * Any task can increment and decrement the count field without lock.
876 * So in general, code holding cgroup_mutex can't rely on the count 880 * So in general, code holding cgroup_mutex can't rely on the count
877 * field not changing. However, if the count goes to zero, then only 881 * field not changing. However, if the count goes to zero, then only
878 * cgroup_attach_task() can increment it again. A count of zero 882 * cgroup_attach_task() can increment it again. A count of zero
879 * means that no tasks are currently attached; therefore there is no 883 * means that no tasks are currently attached; therefore there is no
880 * way a task attached to that cgroup can fork (the other way to 884 * way a task attached to that cgroup can fork (the other way to
881 * increment the count). So code holding cgroup_mutex can safely 885 * increment the count). So code holding cgroup_mutex can safely
882 * assume that if the count is zero, it will stay zero. Similarly, if 886 * assume that if the count is zero, it will stay zero. Similarly, if
883 * a task holds cgroup_mutex on a cgroup with zero count, it 887 * a task holds cgroup_mutex on a cgroup with zero count, it
884 * knows that the cgroup won't be removed, as cgroup_rmdir() 888 * knows that the cgroup won't be removed, as cgroup_rmdir()
885 * needs that mutex. 889 * needs that mutex.
886 * 890 *
887 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't 891 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
888 * (usually) take cgroup_mutex. These are the two most performance 892 * (usually) take cgroup_mutex. These are the two most performance
889 * critical pieces of code here. The exception occurs on cgroup_exit(), 893 * critical pieces of code here. The exception occurs on cgroup_exit(),
890 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex 894 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
891 * is taken, and if the cgroup count is zero, a usermode call is made 895 * is taken, and if the cgroup count is zero, a usermode call is made
892 * to the release agent with the name of the cgroup (path relative to 896 * to the release agent with the name of the cgroup (path relative to
893 * the root of cgroup file system) as the argument. 897 * the root of cgroup file system) as the argument.
894 * 898 *
895 * A cgroup can only be deleted if both its 'count' of using tasks 899 * A cgroup can only be deleted if both its 'count' of using tasks
896 * is zero, and its list of 'children' cgroups is empty. Since all 900 * is zero, and its list of 'children' cgroups is empty. Since all
897 * tasks in the system use _some_ cgroup, and since there is always at 901 * tasks in the system use _some_ cgroup, and since there is always at
898 * least one task in the system (init, pid == 1), therefore, root cgroup 902 * least one task in the system (init, pid == 1), therefore, root cgroup
899 * least one task in the system (init, pid == 1), the root cgroup 903 * least one task in the system (init, pid == 1), the root cgroup
900 * always has child cgroups and/or attached tasks. So we don't 904 * always has child cgroups and/or attached tasks. So we don't
901 * 905 *
902 * P.S. One more locking exception. RCU is used to guard the 906 * P.S. One more locking exception. RCU is used to guard the
903 * update of a task's cgroup pointer by cgroup_attach_task(). 907 * update of a task's cgroup pointer by cgroup_attach_task().
904 */ 908 */
905 909
906 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); 910 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
907 static struct kernfs_syscall_ops cgroup_kf_syscall_ops; 911 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
908 static const struct file_operations proc_cgroupstats_operations; 912 static const struct file_operations proc_cgroupstats_operations;
909 913
910 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, 914 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
911 char *buf) 915 char *buf)
912 { 916 {
913 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && 917 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
914 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) 918 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
915 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", 919 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
916 cft->ss->name, cft->name); 920 cft->ss->name, cft->name);
917 else 921 else
918 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); 922 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
919 return buf; 923 return buf;
920 } 924 }
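
/*
 * Illustrative note (not part of cgroup.c): with the prefixing rule above,
 * a control file belonging to a subsystem, e.g. the memory controller's
 * "limit_in_bytes", appears as "memory.limit_in_bytes" in a cgroup
 * directory, while cgroup core files with cft->ss == NULL, such as
 * "tasks", keep their bare names.
 */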
921 925
922 /** 926 /**
923 * cgroup_file_mode - deduce file mode of a control file 927 * cgroup_file_mode - deduce file mode of a control file
924 * @cft: the control file in question 928 * @cft: the control file in question
925 * 929 *
926 * returns cft->mode if ->mode is not 0 930 * returns cft->mode if ->mode is not 0
927 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler 931 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
928 * returns S_IRUGO if it has only a read handler 932 * returns S_IRUGO if it has only a read handler
929 * returns S_IWUSR if it has only a write handler 933 * returns S_IWUSR if it has only a write handler
930 */ 934 */
931 static umode_t cgroup_file_mode(const struct cftype *cft) 935 static umode_t cgroup_file_mode(const struct cftype *cft)
932 { 936 {
933 umode_t mode = 0; 937 umode_t mode = 0;
934 938
935 if (cft->mode) 939 if (cft->mode)
936 return cft->mode; 940 return cft->mode;
937 941
938 if (cft->read_u64 || cft->read_s64 || cft->seq_show) 942 if (cft->read_u64 || cft->read_s64 || cft->seq_show)
939 mode |= S_IRUGO; 943 mode |= S_IRUGO;
940 944
941 if (cft->write_u64 || cft->write_s64 || cft->write_string || 945 if (cft->write_u64 || cft->write_s64 || cft->write_string ||
942 cft->trigger) 946 cft->trigger)
943 mode |= S_IWUSR; 947 mode |= S_IWUSR;
944 948
945 return mode; 949 return mode;
946 } 950 }
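
/*
 * Sketch (illustration only, not part of this file): given the rules in
 * cgroup_file_mode(), a hypothetical cftype such as
 *
 *	{ .name = "example", .read_u64 = ex_read_u64, .write_u64 = ex_write_u64 }
 *
 * would get S_IRUGO | S_IWUSR (0644), while one that only sets .seq_show
 * would get S_IRUGO (0444). "example", ex_read_u64 and ex_write_u64 are
 * made-up names used only for this sketch.
 */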
947 951
948 static void cgroup_free_fn(struct work_struct *work) 952 static void cgroup_free_fn(struct work_struct *work)
949 { 953 {
950 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); 954 struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
951 955
952 atomic_dec(&cgrp->root->nr_cgrps); 956 atomic_dec(&cgrp->root->nr_cgrps);
953 cgroup_pidlist_destroy_all(cgrp); 957 cgroup_pidlist_destroy_all(cgrp);
954 958
955 if (cgrp->parent) { 959 if (cgrp->parent) {
956 /* 960 /*
957 * We get a ref to the parent, and put the ref when this 961 * We get a ref to the parent, and put the ref when this
958 * cgroup is being freed, so it's guaranteed that the 962 * cgroup is being freed, so it's guaranteed that the
959 * parent won't be destroyed before its children. 963 * parent won't be destroyed before its children.
960 */ 964 */
961 cgroup_put(cgrp->parent); 965 cgroup_put(cgrp->parent);
962 kernfs_put(cgrp->kn); 966 kernfs_put(cgrp->kn);
963 kfree(cgrp); 967 kfree(cgrp);
964 } else { 968 } else {
965 /* 969 /*
966 * This is root cgroup's refcnt reaching zero, which 970 * This is root cgroup's refcnt reaching zero, which
967 * indicates that the root should be released. 971 * indicates that the root should be released.
968 */ 972 */
969 cgroup_destroy_root(cgrp->root); 973 cgroup_destroy_root(cgrp->root);
970 } 974 }
971 } 975 }
972 976
973 static void cgroup_free_rcu(struct rcu_head *head) 977 static void cgroup_free_rcu(struct rcu_head *head)
974 { 978 {
975 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); 979 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
976 980
977 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn); 981 INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
978 queue_work(cgroup_destroy_wq, &cgrp->destroy_work); 982 queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
979 } 983 }
980 984
981 static void cgroup_get(struct cgroup *cgrp) 985 static void cgroup_get(struct cgroup *cgrp)
982 { 986 {
983 WARN_ON_ONCE(cgroup_is_dead(cgrp)); 987 WARN_ON_ONCE(cgroup_is_dead(cgrp));
984 WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); 988 WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
985 atomic_inc(&cgrp->refcnt); 989 atomic_inc(&cgrp->refcnt);
986 } 990 }
987 991
988 static void cgroup_put(struct cgroup *cgrp) 992 static void cgroup_put(struct cgroup *cgrp)
989 { 993 {
990 if (!atomic_dec_and_test(&cgrp->refcnt)) 994 if (!atomic_dec_and_test(&cgrp->refcnt))
991 return; 995 return;
992 if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) 996 if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
993 return; 997 return;
994 998
995 /* 999 /*
996 * XXX: cgrp->id is only used to look up css's. As cgroup and 1000 * XXX: cgrp->id is only used to look up css's. As cgroup and
997 * css's lifetimes will be decoupled, it should be made 1001 * css's lifetimes will be decoupled, it should be made
998 * per-subsystem and moved to css->id so that lookups are 1002 * per-subsystem and moved to css->id so that lookups are
999 * successful until the target css is released. 1003 * successful until the target css is released.
1000 */ 1004 */
1001 mutex_lock(&cgroup_mutex); 1005 mutex_lock(&cgroup_mutex);
1002 idr_remove(&cgrp->root->cgroup_idr, cgrp->id); 1006 idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
1003 mutex_unlock(&cgroup_mutex); 1007 mutex_unlock(&cgroup_mutex);
1004 cgrp->id = -1; 1008 cgrp->id = -1;
1005 1009
1006 call_rcu(&cgrp->rcu_head, cgroup_free_rcu); 1010 call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
1007 } 1011 }
1008 1012
1009 static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 1013 static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1010 { 1014 {
1011 char name[CGROUP_FILE_NAME_MAX]; 1015 char name[CGROUP_FILE_NAME_MAX];
1012 1016
1013 lockdep_assert_held(&cgroup_tree_mutex); 1017 lockdep_assert_held(&cgroup_tree_mutex);
1014 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); 1018 kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
1015 } 1019 }
1016 1020
1017 /** 1021 /**
1018 * cgroup_clear_dir - remove subsys files in a cgroup directory 1022 * cgroup_clear_dir - remove subsys files in a cgroup directory
1019 * @cgrp: target cgroup 1023 * @cgrp: target cgroup
1020 * @subsys_mask: mask of the subsystem ids whose files should be removed 1024 * @subsys_mask: mask of the subsystem ids whose files should be removed
1021 */ 1025 */
1022 static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) 1026 static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
1023 { 1027 {
1024 struct cgroup_subsys *ss; 1028 struct cgroup_subsys *ss;
1025 int i; 1029 int i;
1026 1030
1027 for_each_subsys(ss, i) { 1031 for_each_subsys(ss, i) {
1028 struct cftype *cfts; 1032 struct cftype *cfts;
1029 1033
1030 if (!test_bit(i, &subsys_mask)) 1034 if (!test_bit(i, &subsys_mask))
1031 continue; 1035 continue;
1032 list_for_each_entry(cfts, &ss->cfts, node) 1036 list_for_each_entry(cfts, &ss->cfts, node)
1033 cgroup_addrm_files(cgrp, cfts, false); 1037 cgroup_addrm_files(cgrp, cfts, false);
1034 } 1038 }
1035 } 1039 }
1036 1040
1037 static int rebind_subsystems(struct cgroup_root *dst_root, 1041 static int rebind_subsystems(struct cgroup_root *dst_root,
1038 unsigned long ss_mask) 1042 unsigned long ss_mask)
1039 { 1043 {
1040 struct cgroup_subsys *ss; 1044 struct cgroup_subsys *ss;
1041 int ssid, i, ret; 1045 int ssid, i, ret;
1042 1046
1043 lockdep_assert_held(&cgroup_tree_mutex); 1047 lockdep_assert_held(&cgroup_tree_mutex);
1044 lockdep_assert_held(&cgroup_mutex); 1048 lockdep_assert_held(&cgroup_mutex);
1045 1049
1046 for_each_subsys(ss, ssid) { 1050 for_each_subsys(ss, ssid) {
1047 if (!(ss_mask & (1 << ssid))) 1051 if (!(ss_mask & (1 << ssid)))
1048 continue; 1052 continue;
1049 1053
1050 /* if @ss is on the default root, we can always move it */ 1054 /* if @ss is on the default root, we can always move it */
1051 if (ss->root == &cgrp_dfl_root) 1055 if (ss->root == &cgrp_dfl_root)
1052 continue; 1056 continue;
1053 1057
1054 /* if @ss has non-root cgroups attached to it, can't move */ 1058 /* if @ss has non-root cgroups attached to it, can't move */
1055 if (!list_empty(&ss->root->cgrp.children)) 1059 if (!list_empty(&ss->root->cgrp.children))
1056 return -EBUSY; 1060 return -EBUSY;
1057 1061
1058 /* can't move between two non-default roots either */ 1062 /* can't move between two non-default roots either */
1059 if (dst_root != &cgrp_dfl_root) 1063 if (dst_root != &cgrp_dfl_root)
1060 return -EBUSY; 1064 return -EBUSY;
1061 } 1065 }
1062 1066
1063 ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); 1067 ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask);
1064 if (ret) { 1068 if (ret) {
1065 if (dst_root != &cgrp_dfl_root) 1069 if (dst_root != &cgrp_dfl_root)
1066 return ret; 1070 return ret;
1067 1071
1068 /* 1072 /*
1069 * Rebinding back to the default root is not allowed to 1073 * Rebinding back to the default root is not allowed to
1070 * fail. Using both default and non-default roots should 1074 * fail. Using both default and non-default roots should
1071 * be rare. Moving subsystems back and forth even more so. 1075 * be rare. Moving subsystems back and forth even more so.
1072 * Just warn about it and continue. 1076 * Just warn about it and continue.
1073 */ 1077 */
1074 if (cgrp_dfl_root_visible) { 1078 if (cgrp_dfl_root_visible) {
1075 pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", 1079 pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n",
1076 ret, ss_mask); 1080 ret, ss_mask);
1077 pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); 1081 pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n");
1078 } 1082 }
1079 } 1083 }
1080 1084
1081 /* 1085 /*
1082 * Nothing can fail from this point on. Remove files for the 1086 * Nothing can fail from this point on. Remove files for the
1083 * removed subsystems and rebind each subsystem. 1087 * removed subsystems and rebind each subsystem.
1084 */ 1088 */
1085 mutex_unlock(&cgroup_mutex); 1089 mutex_unlock(&cgroup_mutex);
1086 for_each_subsys(ss, ssid) 1090 for_each_subsys(ss, ssid)
1087 if (ss_mask & (1 << ssid)) 1091 if (ss_mask & (1 << ssid))
1088 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); 1092 cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
1089 mutex_lock(&cgroup_mutex); 1093 mutex_lock(&cgroup_mutex);
1090 1094
1091 for_each_subsys(ss, ssid) { 1095 for_each_subsys(ss, ssid) {
1092 struct cgroup_root *src_root; 1096 struct cgroup_root *src_root;
1093 struct cgroup_subsys_state *css; 1097 struct cgroup_subsys_state *css;
1094 struct css_set *cset; 1098 struct css_set *cset;
1095 1099
1096 if (!(ss_mask & (1 << ssid))) 1100 if (!(ss_mask & (1 << ssid)))
1097 continue; 1101 continue;
1098 1102
1099 src_root = ss->root; 1103 src_root = ss->root;
1100 css = cgroup_css(&src_root->cgrp, ss); 1104 css = cgroup_css(&src_root->cgrp, ss);
1101 1105
1102 WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss)); 1106 WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
1103 1107
1104 RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL); 1108 RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
1105 rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css); 1109 rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
1106 ss->root = dst_root; 1110 ss->root = dst_root;
1107 css->cgroup = &dst_root->cgrp; 1111 css->cgroup = &dst_root->cgrp;
1108 1112
1109 down_write(&css_set_rwsem); 1113 down_write(&css_set_rwsem);
1110 hash_for_each(css_set_table, i, cset, hlist) 1114 hash_for_each(css_set_table, i, cset, hlist)
1111 list_move_tail(&cset->e_cset_node[ss->id], 1115 list_move_tail(&cset->e_cset_node[ss->id],
1112 &dst_root->cgrp.e_csets[ss->id]); 1116 &dst_root->cgrp.e_csets[ss->id]);
1113 up_write(&css_set_rwsem); 1117 up_write(&css_set_rwsem);
1114 1118
1115 src_root->subsys_mask &= ~(1 << ssid); 1119 src_root->subsys_mask &= ~(1 << ssid);
1116 src_root->cgrp.child_subsys_mask &= ~(1 << ssid); 1120 src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
1117 1121
1118 /* default hierarchy doesn't enable controllers by default */ 1122 /* default hierarchy doesn't enable controllers by default */
1119 dst_root->subsys_mask |= 1 << ssid; 1123 dst_root->subsys_mask |= 1 << ssid;
1120 if (dst_root != &cgrp_dfl_root) 1124 if (dst_root != &cgrp_dfl_root)
1121 dst_root->cgrp.child_subsys_mask |= 1 << ssid; 1125 dst_root->cgrp.child_subsys_mask |= 1 << ssid;
1122 1126
1123 if (ss->bind) 1127 if (ss->bind)
1124 ss->bind(css); 1128 ss->bind(css);
1125 } 1129 }
1126 1130
1127 kernfs_activate(dst_root->cgrp.kn); 1131 kernfs_activate(dst_root->cgrp.kn);
1128 return 0; 1132 return 0;
1129 } 1133 }
1130 1134
1131 static int cgroup_show_options(struct seq_file *seq, 1135 static int cgroup_show_options(struct seq_file *seq,
1132 struct kernfs_root *kf_root) 1136 struct kernfs_root *kf_root)
1133 { 1137 {
1134 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1138 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1135 struct cgroup_subsys *ss; 1139 struct cgroup_subsys *ss;
1136 int ssid; 1140 int ssid;
1137 1141
1138 for_each_subsys(ss, ssid) 1142 for_each_subsys(ss, ssid)
1139 if (root->subsys_mask & (1 << ssid)) 1143 if (root->subsys_mask & (1 << ssid))
1140 seq_printf(seq, ",%s", ss->name); 1144 seq_printf(seq, ",%s", ss->name);
1141 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1145 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
1142 seq_puts(seq, ",sane_behavior"); 1146 seq_puts(seq, ",sane_behavior");
1143 if (root->flags & CGRP_ROOT_NOPREFIX) 1147 if (root->flags & CGRP_ROOT_NOPREFIX)
1144 seq_puts(seq, ",noprefix"); 1148 seq_puts(seq, ",noprefix");
1145 if (root->flags & CGRP_ROOT_XATTR) 1149 if (root->flags & CGRP_ROOT_XATTR)
1146 seq_puts(seq, ",xattr"); 1150 seq_puts(seq, ",xattr");
1147 1151
1148 spin_lock(&release_agent_path_lock); 1152 spin_lock(&release_agent_path_lock);
1149 if (strlen(root->release_agent_path)) 1153 if (strlen(root->release_agent_path))
1150 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1154 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
1151 spin_unlock(&release_agent_path_lock); 1155 spin_unlock(&release_agent_path_lock);
1152 1156
1153 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) 1157 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1154 seq_puts(seq, ",clone_children"); 1158 seq_puts(seq, ",clone_children");
1155 if (strlen(root->name)) 1159 if (strlen(root->name))
1156 seq_printf(seq, ",name=%s", root->name); 1160 seq_printf(seq, ",name=%s", root->name);
1157 return 0; 1161 return 0;
1158 } 1162 }
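
/*
 * Example output (illustration only, not part of this file): a hierarchy
 * mounted with the cpuset controller and a custom name would contribute
 * an option string like ",cpuset,name=mygrp" to its /proc/mounts entry;
 * "mygrp" is a made-up name used only for this example.
 */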
1159 1163
1160 struct cgroup_sb_opts { 1164 struct cgroup_sb_opts {
1161 unsigned long subsys_mask; 1165 unsigned long subsys_mask;
1162 unsigned long flags; 1166 unsigned long flags;
1163 char *release_agent; 1167 char *release_agent;
1164 bool cpuset_clone_children; 1168 bool cpuset_clone_children;
1165 char *name; 1169 char *name;
1166 /* User explicitly requested empty subsystem */ 1170 /* User explicitly requested empty subsystem */
1167 bool none; 1171 bool none;
1168 }; 1172 };
1169 1173
1170 /* 1174 /*
1171 * Convert a hierarchy specifier into a bitmask of subsystems and 1175 * Convert a hierarchy specifier into a bitmask of subsystems and
1172 * flags. Call with cgroup_mutex held to protect the cgroup_subsys[] 1176 * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
1173 * array. This function takes refcounts on subsystems to be used, unless it 1177 * array. This function takes refcounts on subsystems to be used, unless it
1174 * returns error, in which case no refcounts are taken. 1178 * returns error, in which case no refcounts are taken.
1175 */ 1179 */
1176 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1180 static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1177 { 1181 {
1178 char *token, *o = data; 1182 char *token, *o = data;
1179 bool all_ss = false, one_ss = false; 1183 bool all_ss = false, one_ss = false;
1180 unsigned long mask = (unsigned long)-1; 1184 unsigned long mask = (unsigned long)-1;
1181 struct cgroup_subsys *ss; 1185 struct cgroup_subsys *ss;
1182 int i; 1186 int i;
1183 1187
1184 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1188 BUG_ON(!mutex_is_locked(&cgroup_mutex));
1185 1189
1186 #ifdef CONFIG_CPUSETS 1190 #ifdef CONFIG_CPUSETS
1187 mask = ~(1UL << cpuset_cgrp_id); 1191 mask = ~(1UL << cpuset_cgrp_id);
1188 #endif 1192 #endif
1189 1193
1190 memset(opts, 0, sizeof(*opts)); 1194 memset(opts, 0, sizeof(*opts));
1191 1195
1192 while ((token = strsep(&o, ",")) != NULL) { 1196 while ((token = strsep(&o, ",")) != NULL) {
1193 if (!*token) 1197 if (!*token)
1194 return -EINVAL; 1198 return -EINVAL;
1195 if (!strcmp(token, "none")) { 1199 if (!strcmp(token, "none")) {
1196 /* Explicitly have no subsystems */ 1200 /* Explicitly have no subsystems */
1197 opts->none = true; 1201 opts->none = true;
1198 continue; 1202 continue;
1199 } 1203 }
1200 if (!strcmp(token, "all")) { 1204 if (!strcmp(token, "all")) {
1201 /* Mutually exclusive option 'all' + subsystem name */ 1205 /* Mutually exclusive option 'all' + subsystem name */
1202 if (one_ss) 1206 if (one_ss)
1203 return -EINVAL; 1207 return -EINVAL;
1204 all_ss = true; 1208 all_ss = true;
1205 continue; 1209 continue;
1206 } 1210 }
1207 if (!strcmp(token, "__DEVEL__sane_behavior")) { 1211 if (!strcmp(token, "__DEVEL__sane_behavior")) {
1208 opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; 1212 opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
1209 continue; 1213 continue;
1210 } 1214 }
1211 if (!strcmp(token, "noprefix")) { 1215 if (!strcmp(token, "noprefix")) {
1212 opts->flags |= CGRP_ROOT_NOPREFIX; 1216 opts->flags |= CGRP_ROOT_NOPREFIX;
1213 continue; 1217 continue;
1214 } 1218 }
1215 if (!strcmp(token, "clone_children")) { 1219 if (!strcmp(token, "clone_children")) {
1216 opts->cpuset_clone_children = true; 1220 opts->cpuset_clone_children = true;
1217 continue; 1221 continue;
1218 } 1222 }
1219 if (!strcmp(token, "xattr")) { 1223 if (!strcmp(token, "xattr")) {
1220 opts->flags |= CGRP_ROOT_XATTR; 1224 opts->flags |= CGRP_ROOT_XATTR;
1221 continue; 1225 continue;
1222 } 1226 }
1223 if (!strncmp(token, "release_agent=", 14)) { 1227 if (!strncmp(token, "release_agent=", 14)) {
1224 /* Specifying two release agents is forbidden */ 1228 /* Specifying two release agents is forbidden */
1225 if (opts->release_agent) 1229 if (opts->release_agent)
1226 return -EINVAL; 1230 return -EINVAL;
1227 opts->release_agent = 1231 opts->release_agent =
1228 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); 1232 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
1229 if (!opts->release_agent) 1233 if (!opts->release_agent)
1230 return -ENOMEM; 1234 return -ENOMEM;
1231 continue; 1235 continue;
1232 } 1236 }
1233 if (!strncmp(token, "name=", 5)) { 1237 if (!strncmp(token, "name=", 5)) {
1234 const char *name = token + 5; 1238 const char *name = token + 5;
1235 /* Can't specify an empty name */ 1239 /* Can't specify an empty name */
1236 if (!strlen(name)) 1240 if (!strlen(name))
1237 return -EINVAL; 1241 return -EINVAL;
1238 /* Must match [\w.-]+ */ 1242 /* Must match [\w.-]+ */
1239 for (i = 0; i < strlen(name); i++) { 1243 for (i = 0; i < strlen(name); i++) {
1240 char c = name[i]; 1244 char c = name[i];
1241 if (isalnum(c)) 1245 if (isalnum(c))
1242 continue; 1246 continue;
1243 if ((c == '.') || (c == '-') || (c == '_')) 1247 if ((c == '.') || (c == '-') || (c == '_'))
1244 continue; 1248 continue;
1245 return -EINVAL; 1249 return -EINVAL;
1246 } 1250 }
1247 /* Specifying two names is forbidden */ 1251 /* Specifying two names is forbidden */
1248 if (opts->name) 1252 if (opts->name)
1249 return -EINVAL; 1253 return -EINVAL;
1250 opts->name = kstrndup(name, 1254 opts->name = kstrndup(name,
1251 MAX_CGROUP_ROOT_NAMELEN - 1, 1255 MAX_CGROUP_ROOT_NAMELEN - 1,
1252 GFP_KERNEL); 1256 GFP_KERNEL);
1253 if (!opts->name) 1257 if (!opts->name)
1254 return -ENOMEM; 1258 return -ENOMEM;
1255 1259
1256 continue; 1260 continue;
1257 } 1261 }
1258 1262
1259 for_each_subsys(ss, i) { 1263 for_each_subsys(ss, i) {
1260 if (strcmp(token, ss->name)) 1264 if (strcmp(token, ss->name))
1261 continue; 1265 continue;
1262 if (ss->disabled) 1266 if (ss->disabled)
1263 continue; 1267 continue;
1264 1268
1265 /* Mutually exclusive option 'all' + subsystem name */ 1269 /* Mutually exclusive option 'all' + subsystem name */
1266 if (all_ss) 1270 if (all_ss)
1267 return -EINVAL; 1271 return -EINVAL;
1268 set_bit(i, &opts->subsys_mask); 1272 set_bit(i, &opts->subsys_mask);
1269 one_ss = true; 1273 one_ss = true;
1270 1274
1271 break; 1275 break;
1272 } 1276 }
1273 if (i == CGROUP_SUBSYS_COUNT) 1277 if (i == CGROUP_SUBSYS_COUNT)
1274 return -ENOENT; 1278 return -ENOENT;
1275 } 1279 }
1276 1280
1277 /* Consistency checks */ 1281 /* Consistency checks */
1278 1282
1279 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1283 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1280 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); 1284 pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
1281 1285
1282 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || 1286 if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
1283 opts->cpuset_clone_children || opts->release_agent || 1287 opts->cpuset_clone_children || opts->release_agent ||
1284 opts->name) { 1288 opts->name) {
1285 pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); 1289 pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
1286 return -EINVAL; 1290 return -EINVAL;
1287 } 1291 }
1288 } else { 1292 } else {
1289 /* 1293 /*
1290 * If the 'all' option was specified, select all the 1294 * If the 'all' option was specified, select all the
1291 * subsystems; otherwise, if neither 'none', 'name=' nor any 1295 * subsystems; otherwise, if neither 'none', 'name=' nor any
1292 * subsystem name was specified, default to 'all'. 1296 * subsystem name was specified, default to 'all'.
1293 */ 1297 */
1294 if (all_ss || (!one_ss && !opts->none && !opts->name)) 1298 if (all_ss || (!one_ss && !opts->none && !opts->name))
1295 for_each_subsys(ss, i) 1299 for_each_subsys(ss, i)
1296 if (!ss->disabled) 1300 if (!ss->disabled)
1297 set_bit(i, &opts->subsys_mask); 1301 set_bit(i, &opts->subsys_mask);
1298 1302
1299 /* 1303 /*
1300 * We either have to specify by name or by subsystems. (So 1304 * We either have to specify by name or by subsystems. (So
1301 * all empty hierarchies must have a name). 1305 * all empty hierarchies must have a name).
1302 */ 1306 */
1303 if (!opts->subsys_mask && !opts->name) 1307 if (!opts->subsys_mask && !opts->name)
1304 return -EINVAL; 1308 return -EINVAL;
1305 } 1309 }
1306 1310
1307 /* 1311 /*
1308 * Option noprefix was introduced just for backward compatibility 1312 * Option noprefix was introduced just for backward compatibility
1309 * with the old cpuset, so we allow noprefix only if mounting just 1313 * with the old cpuset, so we allow noprefix only if mounting just
1310 * the cpuset subsystem. 1314 * the cpuset subsystem.
1311 */ 1315 */
1312 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) 1316 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
1313 return -EINVAL; 1317 return -EINVAL;
1314 1318
1315 1319
1316 /* Can't specify "none" and some subsystems */ 1320 /* Can't specify "none" and some subsystems */
1317 if (opts->subsys_mask && opts->none) 1321 if (opts->subsys_mask && opts->none)
1318 return -EINVAL; 1322 return -EINVAL;
1319 1323
1320 return 0; 1324 return 0;
1321 } 1325 }
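
/*
 * Illustrative example (not part of this file): a legacy mount such as
 *
 *	mount -t cgroup -o cpu,cpuacct,name=mygrp none /sys/fs/cgroup/mygrp
 *
 * reaches this parser with data == "cpu,cpuacct,name=mygrp"; the cpu and
 * cpuacct bits get set in opts->subsys_mask and opts->name becomes
 * "mygrp". The name and mount point are made up for the example.
 */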
1322 1326
1323 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) 1327 static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
1324 { 1328 {
1325 int ret = 0; 1329 int ret = 0;
1326 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1330 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1327 struct cgroup_sb_opts opts; 1331 struct cgroup_sb_opts opts;
1328 unsigned long added_mask, removed_mask; 1332 unsigned long added_mask, removed_mask;
1329 1333
1330 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1334 if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
1331 pr_err("cgroup: sane_behavior: remount is not allowed\n"); 1335 pr_err("cgroup: sane_behavior: remount is not allowed\n");
1332 return -EINVAL; 1336 return -EINVAL;
1333 } 1337 }
1334 1338
1335 mutex_lock(&cgroup_tree_mutex); 1339 mutex_lock(&cgroup_tree_mutex);
1336 mutex_lock(&cgroup_mutex); 1340 mutex_lock(&cgroup_mutex);
1337 1341
1338 /* See what subsystems are wanted */ 1342 /* See what subsystems are wanted */
1339 ret = parse_cgroupfs_options(data, &opts); 1343 ret = parse_cgroupfs_options(data, &opts);
1340 if (ret) 1344 if (ret)
1341 goto out_unlock; 1345 goto out_unlock;
1342 1346
1343 if (opts.subsys_mask != root->subsys_mask || opts.release_agent) 1347 if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
1344 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1348 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
1345 task_tgid_nr(current), current->comm); 1349 task_tgid_nr(current), current->comm);
1346 1350
1347 added_mask = opts.subsys_mask & ~root->subsys_mask; 1351 added_mask = opts.subsys_mask & ~root->subsys_mask;
1348 removed_mask = root->subsys_mask & ~opts.subsys_mask; 1352 removed_mask = root->subsys_mask & ~opts.subsys_mask;
1349 1353
1350 /* Don't allow flags or name to change at remount */ 1354 /* Don't allow flags or name to change at remount */
1351 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || 1355 if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
1352 (opts.name && strcmp(opts.name, root->name))) { 1356 (opts.name && strcmp(opts.name, root->name))) {
1353 pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n", 1357 pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
1354 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", 1358 opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
1355 root->flags & CGRP_ROOT_OPTION_MASK, root->name); 1359 root->flags & CGRP_ROOT_OPTION_MASK, root->name);
1356 ret = -EINVAL; 1360 ret = -EINVAL;
1357 goto out_unlock; 1361 goto out_unlock;
1358 } 1362 }
1359 1363
1360 /* remounting is not allowed for populated hierarchies */ 1364 /* remounting is not allowed for populated hierarchies */
1361 if (!list_empty(&root->cgrp.children)) { 1365 if (!list_empty(&root->cgrp.children)) {
1362 ret = -EBUSY; 1366 ret = -EBUSY;
1363 goto out_unlock; 1367 goto out_unlock;
1364 } 1368 }
1365 1369
1366 ret = rebind_subsystems(root, added_mask); 1370 ret = rebind_subsystems(root, added_mask);
1367 if (ret) 1371 if (ret)
1368 goto out_unlock; 1372 goto out_unlock;
1369 1373
1370 rebind_subsystems(&cgrp_dfl_root, removed_mask); 1374 rebind_subsystems(&cgrp_dfl_root, removed_mask);
1371 1375
1372 if (opts.release_agent) { 1376 if (opts.release_agent) {
1373 spin_lock(&release_agent_path_lock); 1377 spin_lock(&release_agent_path_lock);
1374 strcpy(root->release_agent_path, opts.release_agent); 1378 strcpy(root->release_agent_path, opts.release_agent);
1375 spin_unlock(&release_agent_path_lock); 1379 spin_unlock(&release_agent_path_lock);
1376 } 1380 }
1377 out_unlock: 1381 out_unlock:
1378 kfree(opts.release_agent); 1382 kfree(opts.release_agent);
1379 kfree(opts.name); 1383 kfree(opts.name);
1380 mutex_unlock(&cgroup_mutex); 1384 mutex_unlock(&cgroup_mutex);
1381 mutex_unlock(&cgroup_tree_mutex); 1385 mutex_unlock(&cgroup_tree_mutex);
1382 return ret; 1386 return ret;
1383 } 1387 }
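
/*
 * Illustrative example (not part of this file): a remount such as
 *
 *	mount -o remount,cpu,cpuacct none /sys/fs/cgroup/cpu
 *
 * recomputes added_mask/removed_mask against the hierarchy's current
 * subsys_mask and rebinds controllers accordingly; it fails with -EBUSY
 * once the hierarchy already has child cgroups. The path above is an
 * example only.
 */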
1384 1388
1385 /* 1389 /*
1386 * To reduce the fork() overhead for systems that are not actually using 1390 * To reduce the fork() overhead for systems that are not actually using
1387 * their cgroups capability, we don't maintain the lists running through 1391 * their cgroups capability, we don't maintain the lists running through
1388 * each css_set to its tasks until we see the list actually used - in other 1392 * each css_set to its tasks until we see the list actually used - in other
1389 * words after the first mount. 1393 * words after the first mount.
1390 */ 1394 */
1391 static bool use_task_css_set_links __read_mostly; 1395 static bool use_task_css_set_links __read_mostly;
1392 1396
1393 static void cgroup_enable_task_cg_lists(void) 1397 static void cgroup_enable_task_cg_lists(void)
1394 { 1398 {
1395 struct task_struct *p, *g; 1399 struct task_struct *p, *g;
1396 1400
1397 down_write(&css_set_rwsem); 1401 down_write(&css_set_rwsem);
1398 1402
1399 if (use_task_css_set_links) 1403 if (use_task_css_set_links)
1400 goto out_unlock; 1404 goto out_unlock;
1401 1405
1402 use_task_css_set_links = true; 1406 use_task_css_set_links = true;
1403 1407
1404 /* 1408 /*
1405 * We need tasklist_lock because RCU is not safe against 1409 * We need tasklist_lock because RCU is not safe against
1406 * while_each_thread(). Besides, a forking task that has passed 1410 * while_each_thread(). Besides, a forking task that has passed
1407 * cgroup_post_fork() without seeing use_task_css_set_links = 1 1411 * cgroup_post_fork() without seeing use_task_css_set_links = 1
1408 * is not guaranteed to have its child immediately visible in the 1412 * is not guaranteed to have its child immediately visible in the
1409 * tasklist if we walk through it with RCU. 1413 * tasklist if we walk through it with RCU.
1410 */ 1414 */
1411 read_lock(&tasklist_lock); 1415 read_lock(&tasklist_lock);
1412 do_each_thread(g, p) { 1416 do_each_thread(g, p) {
1413 WARN_ON_ONCE(!list_empty(&p->cg_list) || 1417 WARN_ON_ONCE(!list_empty(&p->cg_list) ||
1414 task_css_set(p) != &init_css_set); 1418 task_css_set(p) != &init_css_set);
1415 1419
1416 /* 1420 /*
1417 * We should check if the process is exiting, otherwise 1421 * We should check if the process is exiting, otherwise
1418 * we would race with cgroup_exit() and the list 1422 * we would race with cgroup_exit() and the list
1419 * entry would never be deleted even though the process has exited. 1423 * entry would never be deleted even though the process has exited.
1420 * Do it while holding siglock so that we don't end up 1424 * Do it while holding siglock so that we don't end up
1421 * racing against cgroup_exit(). 1425 * racing against cgroup_exit().
1422 */ 1426 */
1423 spin_lock_irq(&p->sighand->siglock); 1427 spin_lock_irq(&p->sighand->siglock);
1424 if (!(p->flags & PF_EXITING)) { 1428 if (!(p->flags & PF_EXITING)) {
1425 struct css_set *cset = task_css_set(p); 1429 struct css_set *cset = task_css_set(p);
1426 1430
1427 list_add(&p->cg_list, &cset->tasks); 1431 list_add(&p->cg_list, &cset->tasks);
1428 get_css_set(cset); 1432 get_css_set(cset);
1429 } 1433 }
1430 spin_unlock_irq(&p->sighand->siglock); 1434 spin_unlock_irq(&p->sighand->siglock);
1431 } while_each_thread(g, p); 1435 } while_each_thread(g, p);
1432 read_unlock(&tasklist_lock); 1436 read_unlock(&tasklist_lock);
1433 out_unlock: 1437 out_unlock:
1434 up_write(&css_set_rwsem); 1438 up_write(&css_set_rwsem);
1435 } 1439 }
1436 1440
1437 static void init_cgroup_housekeeping(struct cgroup *cgrp) 1441 static void init_cgroup_housekeeping(struct cgroup *cgrp)
1438 { 1442 {
1439 struct cgroup_subsys *ss; 1443 struct cgroup_subsys *ss;
1440 int ssid; 1444 int ssid;
1441 1445
1442 atomic_set(&cgrp->refcnt, 1); 1446 atomic_set(&cgrp->refcnt, 1);
1443 INIT_LIST_HEAD(&cgrp->sibling); 1447 INIT_LIST_HEAD(&cgrp->sibling);
1444 INIT_LIST_HEAD(&cgrp->children); 1448 INIT_LIST_HEAD(&cgrp->children);
1445 INIT_LIST_HEAD(&cgrp->cset_links); 1449 INIT_LIST_HEAD(&cgrp->cset_links);
1446 INIT_LIST_HEAD(&cgrp->release_list); 1450 INIT_LIST_HEAD(&cgrp->release_list);
1447 INIT_LIST_HEAD(&cgrp->pidlists); 1451 INIT_LIST_HEAD(&cgrp->pidlists);
1448 mutex_init(&cgrp->pidlist_mutex); 1452 mutex_init(&cgrp->pidlist_mutex);
1449 cgrp->dummy_css.cgroup = cgrp; 1453 cgrp->dummy_css.cgroup = cgrp;
1450 1454
1451 for_each_subsys(ss, ssid) 1455 for_each_subsys(ss, ssid)
1452 INIT_LIST_HEAD(&cgrp->e_csets[ssid]); 1456 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1453 } 1457 }
1454 1458
1455 static void init_cgroup_root(struct cgroup_root *root, 1459 static void init_cgroup_root(struct cgroup_root *root,
1456 struct cgroup_sb_opts *opts) 1460 struct cgroup_sb_opts *opts)
1457 { 1461 {
1458 struct cgroup *cgrp = &root->cgrp; 1462 struct cgroup *cgrp = &root->cgrp;
1459 1463
1460 INIT_LIST_HEAD(&root->root_list); 1464 INIT_LIST_HEAD(&root->root_list);
1461 atomic_set(&root->nr_cgrps, 1); 1465 atomic_set(&root->nr_cgrps, 1);
1462 cgrp->root = root; 1466 cgrp->root = root;
1463 init_cgroup_housekeeping(cgrp); 1467 init_cgroup_housekeeping(cgrp);
1464 idr_init(&root->cgroup_idr); 1468 idr_init(&root->cgroup_idr);
1465 1469
1466 root->flags = opts->flags; 1470 root->flags = opts->flags;
1467 if (opts->release_agent) 1471 if (opts->release_agent)
1468 strcpy(root->release_agent_path, opts->release_agent); 1472 strcpy(root->release_agent_path, opts->release_agent);
1469 if (opts->name) 1473 if (opts->name)
1470 strcpy(root->name, opts->name); 1474 strcpy(root->name, opts->name);
1471 if (opts->cpuset_clone_children) 1475 if (opts->cpuset_clone_children)
1472 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); 1476 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
1473 } 1477 }
1474 1478
1475 static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) 1479 static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1476 { 1480 {
1477 LIST_HEAD(tmp_links); 1481 LIST_HEAD(tmp_links);
1478 struct cgroup *root_cgrp = &root->cgrp; 1482 struct cgroup *root_cgrp = &root->cgrp;
1479 struct css_set *cset; 1483 struct css_set *cset;
1480 int i, ret; 1484 int i, ret;
1481 1485
1482 lockdep_assert_held(&cgroup_tree_mutex); 1486 lockdep_assert_held(&cgroup_tree_mutex);
1483 lockdep_assert_held(&cgroup_mutex); 1487 lockdep_assert_held(&cgroup_mutex);
1484 1488
1485 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); 1489 ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
1486 if (ret < 0) 1490 if (ret < 0)
1487 goto out; 1491 goto out;
1488 root_cgrp->id = ret; 1492 root_cgrp->id = ret;
1489 1493
1490 /* 1494 /*
1491 * We're accessing css_set_count without locking css_set_rwsem here, 1495 * We're accessing css_set_count without locking css_set_rwsem here,
1492 * but that's OK - it can only be increased by someone holding 1496 * but that's OK - it can only be increased by someone holding
1493 * cgroup_lock, and that's us. The worst that can happen is that we 1497 * cgroup_lock, and that's us. The worst that can happen is that we
1494 * have some link structures left over 1498 * have some link structures left over
1495 */ 1499 */
1496 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); 1500 ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
1497 if (ret) 1501 if (ret)
1498 goto out; 1502 goto out;
1499 1503
1500 ret = cgroup_init_root_id(root); 1504 ret = cgroup_init_root_id(root);
1501 if (ret) 1505 if (ret)
1502 goto out; 1506 goto out;
1503 1507
1504 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, 1508 root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
1505 KERNFS_ROOT_CREATE_DEACTIVATED, 1509 KERNFS_ROOT_CREATE_DEACTIVATED,
1506 root_cgrp); 1510 root_cgrp);
1507 if (IS_ERR(root->kf_root)) { 1511 if (IS_ERR(root->kf_root)) {
1508 ret = PTR_ERR(root->kf_root); 1512 ret = PTR_ERR(root->kf_root);
1509 goto exit_root_id; 1513 goto exit_root_id;
1510 } 1514 }
1511 root_cgrp->kn = root->kf_root->kn; 1515 root_cgrp->kn = root->kf_root->kn;
1512 1516
1513 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); 1517 ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
1514 if (ret) 1518 if (ret)
1515 goto destroy_root; 1519 goto destroy_root;
1516 1520
1517 ret = rebind_subsystems(root, ss_mask); 1521 ret = rebind_subsystems(root, ss_mask);
1518 if (ret) 1522 if (ret)
1519 goto destroy_root; 1523 goto destroy_root;
1520 1524
1521 /* 1525 /*
1522 * There must be no failure case after here, since rebinding takes 1526 * There must be no failure case after here, since rebinding takes
1523 * care of subsystems' refcounts, which are explicitly dropped in 1527 * care of subsystems' refcounts, which are explicitly dropped in
1524 * the failure exit path. 1528 * the failure exit path.
1525 */ 1529 */
1526 list_add(&root->root_list, &cgroup_roots); 1530 list_add(&root->root_list, &cgroup_roots);
1527 cgroup_root_count++; 1531 cgroup_root_count++;
1528 1532
1529 /* 1533 /*
1530 * Link the root cgroup in this hierarchy into all the css_set 1534 * Link the root cgroup in this hierarchy into all the css_set
1531 * objects. 1535 * objects.
1532 */ 1536 */
1533 down_write(&css_set_rwsem); 1537 down_write(&css_set_rwsem);
1534 hash_for_each(css_set_table, i, cset, hlist) 1538 hash_for_each(css_set_table, i, cset, hlist)
1535 link_css_set(&tmp_links, cset, root_cgrp); 1539 link_css_set(&tmp_links, cset, root_cgrp);
1536 up_write(&css_set_rwsem); 1540 up_write(&css_set_rwsem);
1537 1541
1538 BUG_ON(!list_empty(&root_cgrp->children)); 1542 BUG_ON(!list_empty(&root_cgrp->children));
1539 BUG_ON(atomic_read(&root->nr_cgrps) != 1); 1543 BUG_ON(atomic_read(&root->nr_cgrps) != 1);
1540 1544
1541 kernfs_activate(root_cgrp->kn); 1545 kernfs_activate(root_cgrp->kn);
1542 ret = 0; 1546 ret = 0;
1543 goto out; 1547 goto out;
1544 1548
1545 destroy_root: 1549 destroy_root:
1546 kernfs_destroy_root(root->kf_root); 1550 kernfs_destroy_root(root->kf_root);
1547 root->kf_root = NULL; 1551 root->kf_root = NULL;
1548 exit_root_id: 1552 exit_root_id:
1549 cgroup_exit_root_id(root); 1553 cgroup_exit_root_id(root);
1550 out: 1554 out:
1551 free_cgrp_cset_links(&tmp_links); 1555 free_cgrp_cset_links(&tmp_links);
1552 return ret; 1556 return ret;
1553 } 1557 }
1554 1558
1555 static struct dentry *cgroup_mount(struct file_system_type *fs_type, 1559 static struct dentry *cgroup_mount(struct file_system_type *fs_type,
1556 int flags, const char *unused_dev_name, 1560 int flags, const char *unused_dev_name,
1557 void *data) 1561 void *data)
1558 { 1562 {
1559 struct cgroup_root *root; 1563 struct cgroup_root *root;
1560 struct cgroup_sb_opts opts; 1564 struct cgroup_sb_opts opts;
1561 struct dentry *dentry; 1565 struct dentry *dentry;
1562 int ret; 1566 int ret;
1563 bool new_sb; 1567 bool new_sb;
1564 1568
1565 /* 1569 /*
1566 * The first time anyone tries to mount a cgroup, enable the list 1570 * The first time anyone tries to mount a cgroup, enable the list
1567 * linking each css_set to its tasks and fix up all existing tasks. 1571 * linking each css_set to its tasks and fix up all existing tasks.
1568 */ 1572 */
1569 if (!use_task_css_set_links) 1573 if (!use_task_css_set_links)
1570 cgroup_enable_task_cg_lists(); 1574 cgroup_enable_task_cg_lists();
1571 1575
1572 mutex_lock(&cgroup_tree_mutex); 1576 mutex_lock(&cgroup_tree_mutex);
1573 mutex_lock(&cgroup_mutex); 1577 mutex_lock(&cgroup_mutex);
1574 1578
1575 /* First find the desired set of subsystems */ 1579 /* First find the desired set of subsystems */
1576 ret = parse_cgroupfs_options(data, &opts); 1580 ret = parse_cgroupfs_options(data, &opts);
1577 if (ret) 1581 if (ret)
1578 goto out_unlock; 1582 goto out_unlock;
1579 retry: 1583 retry:
1580 /* look for a matching existing root */ 1584 /* look for a matching existing root */
1581 if (!opts.subsys_mask && !opts.none && !opts.name) { 1585 if (!opts.subsys_mask && !opts.none && !opts.name) {
1582 cgrp_dfl_root_visible = true; 1586 cgrp_dfl_root_visible = true;
1583 root = &cgrp_dfl_root; 1587 root = &cgrp_dfl_root;
1584 cgroup_get(&root->cgrp); 1588 cgroup_get(&root->cgrp);
1585 ret = 0; 1589 ret = 0;
1586 goto out_unlock; 1590 goto out_unlock;
1587 } 1591 }
1588 1592
1589 for_each_root(root) { 1593 for_each_root(root) {
1590 bool name_match = false; 1594 bool name_match = false;
1591 1595
1592 if (root == &cgrp_dfl_root) 1596 if (root == &cgrp_dfl_root)
1593 continue; 1597 continue;
1594 1598
1595 /* 1599 /*
1596 * If we asked for a name then it must match. Also, if 1600 * If we asked for a name then it must match. Also, if
1597 * name matches but subsys_mask doesn't, we should fail. 1601 * name matches but subsys_mask doesn't, we should fail.
1598 * Remember whether name matched. 1602 * Remember whether name matched.
1599 */ 1603 */
1600 if (opts.name) { 1604 if (opts.name) {
1601 if (strcmp(opts.name, root->name)) 1605 if (strcmp(opts.name, root->name))
1602 continue; 1606 continue;
1603 name_match = true; 1607 name_match = true;
1604 } 1608 }
1605 1609
1606 /* 1610 /*
1607 * If we asked for subsystems (or explicitly for no 1611 * If we asked for subsystems (or explicitly for no
1608 * subsystems) then they must match. 1612 * subsystems) then they must match.
1609 */ 1613 */
1610 if ((opts.subsys_mask || opts.none) && 1614 if ((opts.subsys_mask || opts.none) &&
1611 (opts.subsys_mask != root->subsys_mask)) { 1615 (opts.subsys_mask != root->subsys_mask)) {
1612 if (!name_match) 1616 if (!name_match)
1613 continue; 1617 continue;
1614 ret = -EBUSY; 1618 ret = -EBUSY;
1615 goto out_unlock; 1619 goto out_unlock;
1616 } 1620 }
1617 1621
1618 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1622 if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
1619 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1623 if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
1620 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); 1624 pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
1621 ret = -EINVAL; 1625 ret = -EINVAL;
1622 goto out_unlock; 1626 goto out_unlock;
1623 } else { 1627 } else {
1624 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); 1628 pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
1625 } 1629 }
1626 } 1630 }
1627 1631
1628 /* 1632 /*
1629 * A root's lifetime is governed by its root cgroup. Zero 1633 * A root's lifetime is governed by its root cgroup. Zero
1630 * ref indicates that the root is being destroyed. Wait for 1634 * ref indicates that the root is being destroyed. Wait for
1631 * destruction to complete so that the subsystems are free. 1635 * destruction to complete so that the subsystems are free.
1632 * We can use wait_queue for the wait but this path is 1636 * We can use wait_queue for the wait but this path is
1633 * We could use a wait_queue for the wait, but this path is 1637 * We could use a wait_queue for the wait, but this path is
1634 */ 1638 */
1635 if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { 1639 if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
1636 mutex_unlock(&cgroup_mutex); 1640 mutex_unlock(&cgroup_mutex);
1637 mutex_unlock(&cgroup_tree_mutex); 1641 mutex_unlock(&cgroup_tree_mutex);
1638 msleep(10); 1642 msleep(10);
1639 mutex_lock(&cgroup_tree_mutex); 1643 mutex_lock(&cgroup_tree_mutex);
1640 mutex_lock(&cgroup_mutex); 1644 mutex_lock(&cgroup_mutex);
1641 goto retry; 1645 goto retry;
1642 } 1646 }
1643 1647
1644 ret = 0; 1648 ret = 0;
1645 goto out_unlock; 1649 goto out_unlock;
1646 } 1650 }
1647 1651
1648 /* 1652 /*
1649 * No such thing, create a new one. name= matching without subsys 1653 * No such thing, create a new one. name= matching without subsys
1650 * specification is allowed for already existing hierarchies but we 1654 * specification is allowed for already existing hierarchies but we
1651 * can't create new one without subsys specification. 1655 * can't create new one without subsys specification.
1652 */ 1656 */
1653 if (!opts.subsys_mask && !opts.none) { 1657 if (!opts.subsys_mask && !opts.none) {
1654 ret = -EINVAL; 1658 ret = -EINVAL;
1655 goto out_unlock; 1659 goto out_unlock;
1656 } 1660 }
1657 1661
1658 root = kzalloc(sizeof(*root), GFP_KERNEL); 1662 root = kzalloc(sizeof(*root), GFP_KERNEL);
1659 if (!root) { 1663 if (!root) {
1660 ret = -ENOMEM; 1664 ret = -ENOMEM;
1661 goto out_unlock; 1665 goto out_unlock;
1662 } 1666 }
1663 1667
1664 init_cgroup_root(root, &opts); 1668 init_cgroup_root(root, &opts);
1665 1669
1666 ret = cgroup_setup_root(root, opts.subsys_mask); 1670 ret = cgroup_setup_root(root, opts.subsys_mask);
1667 if (ret) 1671 if (ret)
1668 cgroup_free_root(root); 1672 cgroup_free_root(root);
1669 1673
1670 out_unlock: 1674 out_unlock:
1671 mutex_unlock(&cgroup_mutex); 1675 mutex_unlock(&cgroup_mutex);
1672 mutex_unlock(&cgroup_tree_mutex); 1676 mutex_unlock(&cgroup_tree_mutex);
1673 1677
1674 kfree(opts.release_agent); 1678 kfree(opts.release_agent);
1675 kfree(opts.name); 1679 kfree(opts.name);
1676 1680
1677 if (ret) 1681 if (ret)
1678 return ERR_PTR(ret); 1682 return ERR_PTR(ret);
1679 1683
1680 dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb); 1684 dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb);
1681 if (IS_ERR(dentry) || !new_sb) 1685 if (IS_ERR(dentry) || !new_sb)
1682 cgroup_put(&root->cgrp); 1686 cgroup_put(&root->cgrp);
1683 return dentry; 1687 return dentry;
1684 } 1688 }
1685 1689
1686 static void cgroup_kill_sb(struct super_block *sb) 1690 static void cgroup_kill_sb(struct super_block *sb)
1687 { 1691 {
1688 struct kernfs_root *kf_root = kernfs_root_from_sb(sb); 1692 struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
1689 struct cgroup_root *root = cgroup_root_from_kf(kf_root); 1693 struct cgroup_root *root = cgroup_root_from_kf(kf_root);
1690 1694
1691 cgroup_put(&root->cgrp); 1695 cgroup_put(&root->cgrp);
1692 kernfs_kill_sb(sb); 1696 kernfs_kill_sb(sb);
1693 } 1697 }
1694 1698
1695 static struct file_system_type cgroup_fs_type = { 1699 static struct file_system_type cgroup_fs_type = {
1696 .name = "cgroup", 1700 .name = "cgroup",
1697 .mount = cgroup_mount, 1701 .mount = cgroup_mount,
1698 .kill_sb = cgroup_kill_sb, 1702 .kill_sb = cgroup_kill_sb,
1699 }; 1703 };
1700 1704
1701 static struct kobject *cgroup_kobj; 1705 static struct kobject *cgroup_kobj;
1702 1706
1703 /** 1707 /**
1704 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy 1708 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
1705 * @task: target task 1709 * @task: target task
1706 * @buf: the buffer to write the path into 1710 * @buf: the buffer to write the path into
1707 * @buflen: the length of the buffer 1711 * @buflen: the length of the buffer
1708 * 1712 *
1709 * Determine @task's cgroup on the first (the one with the lowest non-zero 1713 * Determine @task's cgroup on the first (the one with the lowest non-zero
1710 * hierarchy_id) cgroup hierarchy and copy its path into @buf. This 1714 * hierarchy_id) cgroup hierarchy and copy its path into @buf. This
1711 * function grabs cgroup_mutex and shouldn't be used inside locks used by 1715 * function grabs cgroup_mutex and shouldn't be used inside locks used by
1712 * cgroup controller callbacks. 1716 * cgroup controller callbacks.
1713 * 1717 *
1714 * Return value is the same as kernfs_path(). 1718 * Return value is the same as kernfs_path().
1715 */ 1719 */
1716 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) 1720 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
1717 { 1721 {
1718 struct cgroup_root *root; 1722 struct cgroup_root *root;
1719 struct cgroup *cgrp; 1723 struct cgroup *cgrp;
1720 int hierarchy_id = 1; 1724 int hierarchy_id = 1;
1721 char *path = NULL; 1725 char *path = NULL;
1722 1726
1723 mutex_lock(&cgroup_mutex); 1727 mutex_lock(&cgroup_mutex);
1724 down_read(&css_set_rwsem); 1728 down_read(&css_set_rwsem);
1725 1729
1726 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); 1730 root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
1727 1731
1728 if (root) { 1732 if (root) {
1729 cgrp = task_cgroup_from_root(task, root); 1733 cgrp = task_cgroup_from_root(task, root);
1730 path = cgroup_path(cgrp, buf, buflen); 1734 path = cgroup_path(cgrp, buf, buflen);
1731 } else { 1735 } else {
1732 /* if no hierarchy exists, everyone is in "/" */ 1736 /* if no hierarchy exists, everyone is in "/" */
1733 if (strlcpy(buf, "/", buflen) < buflen) 1737 if (strlcpy(buf, "/", buflen) < buflen)
1734 path = buf; 1738 path = buf;
1735 } 1739 }
1736 1740
1737 up_read(&css_set_rwsem); 1741 up_read(&css_set_rwsem);
1738 mutex_unlock(&cgroup_mutex); 1742 mutex_unlock(&cgroup_mutex);
1739 return path; 1743 return path;
1740 } 1744 }
1741 EXPORT_SYMBOL_GPL(task_cgroup_path); 1745 EXPORT_SYMBOL_GPL(task_cgroup_path);
1742 1746
1743 /* used to track tasks and other necessary states during migration */ 1747 /* used to track tasks and other necessary states during migration */
1744 struct cgroup_taskset { 1748 struct cgroup_taskset {
1745 /* the src and dst cset list running through cset->mg_node */ 1749 /* the src and dst cset list running through cset->mg_node */
1746 struct list_head src_csets; 1750 struct list_head src_csets;
1747 struct list_head dst_csets; 1751 struct list_head dst_csets;
1748 1752
1749 /* 1753 /*
1750 * Fields for cgroup_taskset_*() iteration. 1754 * Fields for cgroup_taskset_*() iteration.
1751 * 1755 *
1752 * Before migration is committed, the target migration tasks are on 1756 * Before migration is committed, the target migration tasks are on
1753 * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of 1757 * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
1754 * the csets on ->dst_csets. ->csets points to either ->src_csets 1758 * the csets on ->dst_csets. ->csets points to either ->src_csets
1755 * or ->dst_csets depending on whether migration is committed. 1759 * or ->dst_csets depending on whether migration is committed.
1756 * 1760 *
1757 * ->cur_cset and ->cur_task point to the current task position 1761 * ->cur_cset and ->cur_task point to the current task position
1758 * during iteration. 1762 * during iteration.
1759 */ 1763 */
1760 struct list_head *csets; 1764 struct list_head *csets;
1761 struct css_set *cur_cset; 1765 struct css_set *cur_cset;
1762 struct task_struct *cur_task; 1766 struct task_struct *cur_task;
1763 }; 1767 };
1764 1768
1765 /** 1769 /**
1766 * cgroup_taskset_first - reset taskset and return the first task 1770 * cgroup_taskset_first - reset taskset and return the first task
1767 * @tset: taskset of interest 1771 * @tset: taskset of interest
1768 * 1772 *
1769 * @tset iteration is initialized and the first task is returned. 1773 * @tset iteration is initialized and the first task is returned.
1770 */ 1774 */
1771 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) 1775 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
1772 { 1776 {
1773 tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); 1777 tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
1774 tset->cur_task = NULL; 1778 tset->cur_task = NULL;
1775 1779
1776 return cgroup_taskset_next(tset); 1780 return cgroup_taskset_next(tset);
1777 } 1781 }
1778 1782
1779 /** 1783 /**
1780 * cgroup_taskset_next - iterate to the next task in taskset 1784 * cgroup_taskset_next - iterate to the next task in taskset
1781 * @tset: taskset of interest 1785 * @tset: taskset of interest
1782 * 1786 *
1783 * Return the next task in @tset. Iteration must have been initialized 1787 * Return the next task in @tset. Iteration must have been initialized
1784 * with cgroup_taskset_first(). 1788 * with cgroup_taskset_first().
1785 */ 1789 */
1786 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) 1790 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
1787 { 1791 {
1788 struct css_set *cset = tset->cur_cset; 1792 struct css_set *cset = tset->cur_cset;
1789 struct task_struct *task = tset->cur_task; 1793 struct task_struct *task = tset->cur_task;
1790 1794
1791 while (&cset->mg_node != tset->csets) { 1795 while (&cset->mg_node != tset->csets) {
1792 if (!task) 1796 if (!task)
1793 task = list_first_entry(&cset->mg_tasks, 1797 task = list_first_entry(&cset->mg_tasks,
1794 struct task_struct, cg_list); 1798 struct task_struct, cg_list);
1795 else 1799 else
1796 task = list_next_entry(task, cg_list); 1800 task = list_next_entry(task, cg_list);
1797 1801
1798 if (&task->cg_list != &cset->mg_tasks) { 1802 if (&task->cg_list != &cset->mg_tasks) {
1799 tset->cur_cset = cset; 1803 tset->cur_cset = cset;
1800 tset->cur_task = task; 1804 tset->cur_task = task;
1801 return task; 1805 return task;
1802 } 1806 }
1803 1807
1804 cset = list_next_entry(cset, mg_node); 1808 cset = list_next_entry(cset, mg_node);
1805 task = NULL; 1809 task = NULL;
1806 } 1810 }
1807 1811
1808 return NULL; 1812 return NULL;
1809 } 1813 }
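A controller callback can walk a taskset with the two iterators above. The sketch below is illustrative only: demo_attach() is a hypothetical ->attach() implementation whose signature mirrors the css->ss->attach(css, &tset) calls made by cgroup_migrate() further down.

/* Hypothetical ->attach() callback: visit every task being migrated. */
static void demo_attach(struct cgroup_subsys_state *css,
			struct cgroup_taskset *tset)
{
	struct task_struct *task;

	for (task = cgroup_taskset_first(tset); task;
	     task = cgroup_taskset_next(tset))
		; /* per-task controller work would go here */
}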
1810 1814
1811 /** 1815 /**
1812 * cgroup_task_migrate - move a task from one cgroup to another. 1816 * cgroup_task_migrate - move a task from one cgroup to another.
1813 * @old_cgrp: the cgroup @tsk is being migrated from 1817 * @old_cgrp: the cgroup @tsk is being migrated from
1814 * @tsk: the task being migrated 1818 * @tsk: the task being migrated
1815 * @new_cset: the new css_set @tsk is being attached to 1819 * @new_cset: the new css_set @tsk is being attached to
1816 * 1820 *
1817 * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked. 1821 * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
1818 */ 1822 */
1819 static void cgroup_task_migrate(struct cgroup *old_cgrp, 1823 static void cgroup_task_migrate(struct cgroup *old_cgrp,
1820 struct task_struct *tsk, 1824 struct task_struct *tsk,
1821 struct css_set *new_cset) 1825 struct css_set *new_cset)
1822 { 1826 {
1823 struct css_set *old_cset; 1827 struct css_set *old_cset;
1824 1828
1825 lockdep_assert_held(&cgroup_mutex); 1829 lockdep_assert_held(&cgroup_mutex);
1826 lockdep_assert_held(&css_set_rwsem); 1830 lockdep_assert_held(&css_set_rwsem);
1827 1831
1828 /* 1832 /*
1829 * We are synchronized through threadgroup_lock() against PF_EXITING 1833 * We are synchronized through threadgroup_lock() against PF_EXITING
1830 * setting such that we can't race against cgroup_exit() changing the 1834 * setting such that we can't race against cgroup_exit() changing the
1831 * css_set to init_css_set and dropping the old one. 1835 * css_set to init_css_set and dropping the old one.
1832 */ 1836 */
1833 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1837 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1834 old_cset = task_css_set(tsk); 1838 old_cset = task_css_set(tsk);
1835 1839
1836 get_css_set(new_cset); 1840 get_css_set(new_cset);
1837 rcu_assign_pointer(tsk->cgroups, new_cset); 1841 rcu_assign_pointer(tsk->cgroups, new_cset);
1838 1842
1839 /* 1843 /*
1840 * Use move_tail so that cgroup_taskset_first() still returns the 1844 * Use move_tail so that cgroup_taskset_first() still returns the
1841 * leader after migration. This works because cgroup_migrate() 1845 * leader after migration. This works because cgroup_migrate()
1842 * ensures that the dst_cset of the leader is the first on the 1846 * ensures that the dst_cset of the leader is the first on the
1843 * tset's dst_csets list. 1847 * tset's dst_csets list.
1844 */ 1848 */
1845 list_move_tail(&tsk->cg_list, &new_cset->mg_tasks); 1849 list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
1846 1850
1847 /* 1851 /*
1848 * We just gained a reference on old_cset by taking it from the 1852 * We just gained a reference on old_cset by taking it from the
1849 * task. As trading it for new_cset is protected by cgroup_mutex, 1853 * task. As trading it for new_cset is protected by cgroup_mutex,
1850 * we're safe to drop it here; it will be freed under RCU. 1854 * we're safe to drop it here; it will be freed under RCU.
1851 */ 1855 */
1852 set_bit(CGRP_RELEASABLE, &old_cgrp->flags); 1856 set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
1853 put_css_set_locked(old_cset, false); 1857 put_css_set_locked(old_cset, false);
1854 } 1858 }
1855 1859
1856 /** 1860 /**
1857 * cgroup_migrate_finish - cleanup after attach 1861 * cgroup_migrate_finish - cleanup after attach
1858 * @preloaded_csets: list of preloaded css_sets 1862 * @preloaded_csets: list of preloaded css_sets
1859 * 1863 *
1860 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See 1864 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
1861 * those functions for details. 1865 * those functions for details.
1862 */ 1866 */
1863 static void cgroup_migrate_finish(struct list_head *preloaded_csets) 1867 static void cgroup_migrate_finish(struct list_head *preloaded_csets)
1864 { 1868 {
1865 struct css_set *cset, *tmp_cset; 1869 struct css_set *cset, *tmp_cset;
1866 1870
1867 lockdep_assert_held(&cgroup_mutex); 1871 lockdep_assert_held(&cgroup_mutex);
1868 1872
1869 down_write(&css_set_rwsem); 1873 down_write(&css_set_rwsem);
1870 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { 1874 list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
1871 cset->mg_src_cgrp = NULL; 1875 cset->mg_src_cgrp = NULL;
1872 cset->mg_dst_cset = NULL; 1876 cset->mg_dst_cset = NULL;
1873 list_del_init(&cset->mg_preload_node); 1877 list_del_init(&cset->mg_preload_node);
1874 put_css_set_locked(cset, false); 1878 put_css_set_locked(cset, false);
1875 } 1879 }
1876 up_write(&css_set_rwsem); 1880 up_write(&css_set_rwsem);
1877 } 1881 }
1878 1882
1879 /** 1883 /**
1880 * cgroup_migrate_add_src - add a migration source css_set 1884 * cgroup_migrate_add_src - add a migration source css_set
1881 * @src_cset: the source css_set to add 1885 * @src_cset: the source css_set to add
1882 * @dst_cgrp: the destination cgroup 1886 * @dst_cgrp: the destination cgroup
1883 * @preloaded_csets: list of preloaded css_sets 1887 * @preloaded_csets: list of preloaded css_sets
1884 * 1888 *
1885 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin 1889 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
1886 * @src_cset and add it to @preloaded_csets, which should later be cleaned 1890 * @src_cset and add it to @preloaded_csets, which should later be cleaned
1887 * up by cgroup_migrate_finish(). 1891 * up by cgroup_migrate_finish().
1888 * 1892 *
1889 * This function may be called without holding threadgroup_lock even if the 1893 * This function may be called without holding threadgroup_lock even if the
1890 * target is a process. Threads may be created and destroyed but as long 1894 * target is a process. Threads may be created and destroyed but as long
1891 * as cgroup_mutex is not dropped, no new css_set can be put into play and 1895 * as cgroup_mutex is not dropped, no new css_set can be put into play and
1892 * the preloaded css_sets are guaranteed to cover all migrations. 1896 * the preloaded css_sets are guaranteed to cover all migrations.
1893 */ 1897 */
1894 static void cgroup_migrate_add_src(struct css_set *src_cset, 1898 static void cgroup_migrate_add_src(struct css_set *src_cset,
1895 struct cgroup *dst_cgrp, 1899 struct cgroup *dst_cgrp,
1896 struct list_head *preloaded_csets) 1900 struct list_head *preloaded_csets)
1897 { 1901 {
1898 struct cgroup *src_cgrp; 1902 struct cgroup *src_cgrp;
1899 1903
1900 lockdep_assert_held(&cgroup_mutex); 1904 lockdep_assert_held(&cgroup_mutex);
1901 lockdep_assert_held(&css_set_rwsem); 1905 lockdep_assert_held(&css_set_rwsem);
1902 1906
1903 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); 1907 src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
1904 1908
1905 /* nothing to do if this cset already belongs to the cgroup */ 1909 /* nothing to do if this cset already belongs to the cgroup */
1906 if (src_cgrp == dst_cgrp) 1910 if (src_cgrp == dst_cgrp)
1907 return; 1911 return;
1908 1912
1909 if (!list_empty(&src_cset->mg_preload_node)) 1913 if (!list_empty(&src_cset->mg_preload_node))
1910 return; 1914 return;
1911 1915
1912 WARN_ON(src_cset->mg_src_cgrp); 1916 WARN_ON(src_cset->mg_src_cgrp);
1913 WARN_ON(!list_empty(&src_cset->mg_tasks)); 1917 WARN_ON(!list_empty(&src_cset->mg_tasks));
1914 WARN_ON(!list_empty(&src_cset->mg_node)); 1918 WARN_ON(!list_empty(&src_cset->mg_node));
1915 1919
1916 src_cset->mg_src_cgrp = src_cgrp; 1920 src_cset->mg_src_cgrp = src_cgrp;
1917 get_css_set(src_cset); 1921 get_css_set(src_cset);
1918 list_add(&src_cset->mg_preload_node, preloaded_csets); 1922 list_add(&src_cset->mg_preload_node, preloaded_csets);
1919 } 1923 }
1920 1924
1921 /** 1925 /**
1922 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration 1926 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
1923 * @dst_cgrp: the destination cgroup 1927 * @dst_cgrp: the destination cgroup
1924 * @preloaded_csets: list of preloaded source css_sets 1928 * @preloaded_csets: list of preloaded source css_sets
1925 * 1929 *
1926 * Tasks are about to be moved to @dst_cgrp and all the source css_sets 1930 * Tasks are about to be moved to @dst_cgrp and all the source css_sets
1927 * have been preloaded to @preloaded_csets. This function looks up and 1931 * have been preloaded to @preloaded_csets. This function looks up and
1928 * pins all destination css_sets, links each to its source, and puts them on 1932 * pins all destination css_sets, links each to its source, and puts them on
1929 * @preloaded_csets. 1933 * @preloaded_csets.
1930 * 1934 *
1931 * This function must be called after cgroup_migrate_add_src() has been 1935 * This function must be called after cgroup_migrate_add_src() has been
1932 * called on each migration source css_set. After migration is performed 1936 * called on each migration source css_set. After migration is performed
1933 * using cgroup_migrate(), cgroup_migrate_finish() must be called on 1937 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
1934 * @preloaded_csets. 1938 * @preloaded_csets.
1935 */ 1939 */
1936 static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, 1940 static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
1937 struct list_head *preloaded_csets) 1941 struct list_head *preloaded_csets)
1938 { 1942 {
1939 LIST_HEAD(csets); 1943 LIST_HEAD(csets);
1940 struct css_set *src_cset; 1944 struct css_set *src_cset;
1941 1945
1942 lockdep_assert_held(&cgroup_mutex); 1946 lockdep_assert_held(&cgroup_mutex);
1943 1947
1944 /* look up the dst cset for each src cset and link it to src */ 1948 /* look up the dst cset for each src cset and link it to src */
1945 list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { 1949 list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) {
1946 struct css_set *dst_cset; 1950 struct css_set *dst_cset;
1947 1951
1948 dst_cset = find_css_set(src_cset, dst_cgrp); 1952 dst_cset = find_css_set(src_cset, dst_cgrp);
1949 if (!dst_cset) 1953 if (!dst_cset)
1950 goto err; 1954 goto err;
1951 1955
1952 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); 1956 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
1953 src_cset->mg_dst_cset = dst_cset; 1957 src_cset->mg_dst_cset = dst_cset;
1954 1958
1955 if (list_empty(&dst_cset->mg_preload_node)) 1959 if (list_empty(&dst_cset->mg_preload_node))
1956 list_add(&dst_cset->mg_preload_node, &csets); 1960 list_add(&dst_cset->mg_preload_node, &csets);
1957 else 1961 else
1958 put_css_set(dst_cset, false); 1962 put_css_set(dst_cset, false);
1959 } 1963 }
1960 1964
1961 list_splice(&csets, preloaded_csets); 1965 list_splice(&csets, preloaded_csets);
1962 return 0; 1966 return 0;
1963 err: 1967 err:
1964 cgroup_migrate_finish(&csets); 1968 cgroup_migrate_finish(&csets);
1965 return -ENOMEM; 1969 return -ENOMEM;
1966 } 1970 }
1967 1971
1968 /** 1972 /**
1969 * cgroup_migrate - migrate a process or task to a cgroup 1973 * cgroup_migrate - migrate a process or task to a cgroup
1970 * @cgrp: the destination cgroup 1974 * @cgrp: the destination cgroup
1971 * @leader: the leader of the process or the task to migrate 1975 * @leader: the leader of the process or the task to migrate
1972 * @threadgroup: whether @leader points to the whole process or a single task 1976 * @threadgroup: whether @leader points to the whole process or a single task
1973 * 1977 *
1974 * Migrate a process or task denoted by @leader to @cgrp. If migrating a 1978 * Migrate a process or task denoted by @leader to @cgrp. If migrating a
1975 * process, the caller must be holding threadgroup_lock of @leader. The 1979 * process, the caller must be holding threadgroup_lock of @leader. The
1976 * caller is also responsible for invoking cgroup_migrate_add_src() and 1980 * caller is also responsible for invoking cgroup_migrate_add_src() and
1977 * cgroup_migrate_prepare_dst() on the targets before invoking this 1981 * cgroup_migrate_prepare_dst() on the targets before invoking this
1978 * function and following up with cgroup_migrate_finish(). 1982 * function and following up with cgroup_migrate_finish().
1979 * 1983 *
1980 * As long as a controller's ->can_attach() doesn't fail, this function is 1984 * As long as a controller's ->can_attach() doesn't fail, this function is
1981 * guaranteed to succeed. This means that, excluding ->can_attach() 1985 * guaranteed to succeed. This means that, excluding ->can_attach()
1982 * failure, when migrating multiple targets, the success or failure can be 1986 * failure, when migrating multiple targets, the success or failure can be
1983 * decided for all targets by invoking cgroup_migrate_prepare_dst() before 1987 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
1984 * actually starting the migration. 1988 * actually starting the migration.
1985 */ 1989 */
1986 static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, 1990 static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
1987 bool threadgroup) 1991 bool threadgroup)
1988 { 1992 {
1989 struct cgroup_taskset tset = { 1993 struct cgroup_taskset tset = {
1990 .src_csets = LIST_HEAD_INIT(tset.src_csets), 1994 .src_csets = LIST_HEAD_INIT(tset.src_csets),
1991 .dst_csets = LIST_HEAD_INIT(tset.dst_csets), 1995 .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
1992 .csets = &tset.src_csets, 1996 .csets = &tset.src_csets,
1993 }; 1997 };
1994 struct cgroup_subsys_state *css, *failed_css = NULL; 1998 struct cgroup_subsys_state *css, *failed_css = NULL;
1995 struct css_set *cset, *tmp_cset; 1999 struct css_set *cset, *tmp_cset;
1996 struct task_struct *task, *tmp_task; 2000 struct task_struct *task, *tmp_task;
1997 int i, ret; 2001 int i, ret;
1998 2002
1999 /* 2003 /*
2000 * Prevent freeing of tasks while we take a snapshot. Tasks that are 2004 * Prevent freeing of tasks while we take a snapshot. Tasks that are
2001 * already PF_EXITING could be freed from underneath us unless we 2005 * already PF_EXITING could be freed from underneath us unless we
2002 * take an rcu_read_lock. 2006 * take an rcu_read_lock.
2003 */ 2007 */
2004 down_write(&css_set_rwsem); 2008 down_write(&css_set_rwsem);
2005 rcu_read_lock(); 2009 rcu_read_lock();
2006 task = leader; 2010 task = leader;
2007 do { 2011 do {
2008 /* @task either already exited or can't exit until the end */ 2012 /* @task either already exited or can't exit until the end */
2009 if (task->flags & PF_EXITING) 2013 if (task->flags & PF_EXITING)
2010 goto next; 2014 goto next;
2011 2015
2012 /* leave @task alone if post_fork() hasn't linked it yet */ 2016 /* leave @task alone if post_fork() hasn't linked it yet */
2013 if (list_empty(&task->cg_list)) 2017 if (list_empty(&task->cg_list))
2014 goto next; 2018 goto next;
2015 2019
2016 cset = task_css_set(task); 2020 cset = task_css_set(task);
2017 if (!cset->mg_src_cgrp) 2021 if (!cset->mg_src_cgrp)
2018 goto next; 2022 goto next;
2019 2023
2020 /* 2024 /*
2021 * cgroup_taskset_first() must always return the leader. 2025 * cgroup_taskset_first() must always return the leader.
2022 * Take care to avoid disturbing the ordering. 2026 * Take care to avoid disturbing the ordering.
2023 */ 2027 */
2024 list_move_tail(&task->cg_list, &cset->mg_tasks); 2028 list_move_tail(&task->cg_list, &cset->mg_tasks);
2025 if (list_empty(&cset->mg_node)) 2029 if (list_empty(&cset->mg_node))
2026 list_add_tail(&cset->mg_node, &tset.src_csets); 2030 list_add_tail(&cset->mg_node, &tset.src_csets);
2027 if (list_empty(&cset->mg_dst_cset->mg_node)) 2031 if (list_empty(&cset->mg_dst_cset->mg_node))
2028 list_move_tail(&cset->mg_dst_cset->mg_node, 2032 list_move_tail(&cset->mg_dst_cset->mg_node,
2029 &tset.dst_csets); 2033 &tset.dst_csets);
2030 next: 2034 next:
2031 if (!threadgroup) 2035 if (!threadgroup)
2032 break; 2036 break;
2033 } while_each_thread(leader, task); 2037 } while_each_thread(leader, task);
2034 rcu_read_unlock(); 2038 rcu_read_unlock();
2035 up_write(&css_set_rwsem); 2039 up_write(&css_set_rwsem);
2036 2040
2037 /* methods shouldn't be called if no task is actually migrating */ 2041 /* methods shouldn't be called if no task is actually migrating */
2038 if (list_empty(&tset.src_csets)) 2042 if (list_empty(&tset.src_csets))
2039 return 0; 2043 return 0;
2040 2044
2041 /* check that we can legitimately attach to the cgroup */ 2045 /* check that we can legitimately attach to the cgroup */
2042 for_each_e_css(css, i, cgrp) { 2046 for_each_e_css(css, i, cgrp) {
2043 if (css->ss->can_attach) { 2047 if (css->ss->can_attach) {
2044 ret = css->ss->can_attach(css, &tset); 2048 ret = css->ss->can_attach(css, &tset);
2045 if (ret) { 2049 if (ret) {
2046 failed_css = css; 2050 failed_css = css;
2047 goto out_cancel_attach; 2051 goto out_cancel_attach;
2048 } 2052 }
2049 } 2053 }
2050 } 2054 }
2051 2055
2052 /* 2056 /*
2053 * Now that we're guaranteed success, proceed to move all tasks to 2057 * Now that we're guaranteed success, proceed to move all tasks to
2054 * the new cgroup. There are no failure cases after here, so this 2058 * the new cgroup. There are no failure cases after here, so this
2055 * is the commit point. 2059 * is the commit point.
2056 */ 2060 */
2057 down_write(&css_set_rwsem); 2061 down_write(&css_set_rwsem);
2058 list_for_each_entry(cset, &tset.src_csets, mg_node) { 2062 list_for_each_entry(cset, &tset.src_csets, mg_node) {
2059 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) 2063 list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
2060 cgroup_task_migrate(cset->mg_src_cgrp, task, 2064 cgroup_task_migrate(cset->mg_src_cgrp, task,
2061 cset->mg_dst_cset); 2065 cset->mg_dst_cset);
2062 } 2066 }
2063 up_write(&css_set_rwsem); 2067 up_write(&css_set_rwsem);
2064 2068
2065 /* 2069 /*
2066 * Migration is committed, all target tasks are now on dst_csets. 2070 * Migration is committed, all target tasks are now on dst_csets.
2067 * Nothing is sensitive to fork() after this point. Notify 2071 * Nothing is sensitive to fork() after this point. Notify
2068 * controllers that migration is complete. 2072 * controllers that migration is complete.
2069 */ 2073 */
2070 tset.csets = &tset.dst_csets; 2074 tset.csets = &tset.dst_csets;
2071 2075
2072 for_each_e_css(css, i, cgrp) 2076 for_each_e_css(css, i, cgrp)
2073 if (css->ss->attach) 2077 if (css->ss->attach)
2074 css->ss->attach(css, &tset); 2078 css->ss->attach(css, &tset);
2075 2079
2076 ret = 0; 2080 ret = 0;
2077 goto out_release_tset; 2081 goto out_release_tset;
2078 2082
2079 out_cancel_attach: 2083 out_cancel_attach:
2080 for_each_e_css(css, i, cgrp) { 2084 for_each_e_css(css, i, cgrp) {
2081 if (css == failed_css) 2085 if (css == failed_css)
2082 break; 2086 break;
2083 if (css->ss->cancel_attach) 2087 if (css->ss->cancel_attach)
2084 css->ss->cancel_attach(css, &tset); 2088 css->ss->cancel_attach(css, &tset);
2085 } 2089 }
2086 out_release_tset: 2090 out_release_tset:
2087 down_write(&css_set_rwsem); 2091 down_write(&css_set_rwsem);
2088 list_splice_init(&tset.dst_csets, &tset.src_csets); 2092 list_splice_init(&tset.dst_csets, &tset.src_csets);
2089 list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) { 2093 list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
2090 list_splice_tail_init(&cset->mg_tasks, &cset->tasks); 2094 list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
2091 list_del_init(&cset->mg_node); 2095 list_del_init(&cset->mg_node);
2092 } 2096 }
2093 up_write(&css_set_rwsem); 2097 up_write(&css_set_rwsem);
2094 return ret; 2098 return ret;
2095 } 2099 }
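The helpers above form a four-step protocol; cgroup_attach_task() below is the real in-tree user. As a hedged summary of the call order for a single task (demo_migrate_one() is hypothetical; cgroup_mutex is held, plus threadgroup_lock when moving a whole process):

/* Sketch only: restates the add_src -> prepare_dst -> migrate -> finish
 * sequence that cgroup_attach_task() performs. */
static int demo_migrate_one(struct cgroup *dst_cgrp, struct task_struct *task)
{
	LIST_HEAD(preloaded_csets);
	int ret;

	/* 1. pin the source css_set */
	down_read(&css_set_rwsem);
	cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &preloaded_csets);
	up_read(&css_set_rwsem);

	/* 2. look up and pin the matching destination css_set */
	ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);

	/* 3. commit the migration (single task, hence "false") */
	if (!ret)
		ret = cgroup_migrate(dst_cgrp, task, false);

	/* 4. drop the references taken in steps 1 and 2 */
	cgroup_migrate_finish(&preloaded_csets);
	return ret;
}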
2096 2100
2097 /** 2101 /**
2098 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup 2102 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
2099 * @dst_cgrp: the cgroup to attach to 2103 * @dst_cgrp: the cgroup to attach to
2100 * @leader: the task or the leader of the threadgroup to be attached 2104 * @leader: the task or the leader of the threadgroup to be attached
2101 * @threadgroup: attach the whole threadgroup? 2105 * @threadgroup: attach the whole threadgroup?
2102 * 2106 *
2103 * Call holding cgroup_mutex and threadgroup_lock of @leader. 2107 * Call holding cgroup_mutex and threadgroup_lock of @leader.
2104 */ 2108 */
2105 static int cgroup_attach_task(struct cgroup *dst_cgrp, 2109 static int cgroup_attach_task(struct cgroup *dst_cgrp,
2106 struct task_struct *leader, bool threadgroup) 2110 struct task_struct *leader, bool threadgroup)
2107 { 2111 {
2108 LIST_HEAD(preloaded_csets); 2112 LIST_HEAD(preloaded_csets);
2109 struct task_struct *task; 2113 struct task_struct *task;
2110 int ret; 2114 int ret;
2111 2115
2112 /* look up all src csets */ 2116 /* look up all src csets */
2113 down_read(&css_set_rwsem); 2117 down_read(&css_set_rwsem);
2114 rcu_read_lock(); 2118 rcu_read_lock();
2115 task = leader; 2119 task = leader;
2116 do { 2120 do {
2117 cgroup_migrate_add_src(task_css_set(task), dst_cgrp, 2121 cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
2118 &preloaded_csets); 2122 &preloaded_csets);
2119 if (!threadgroup) 2123 if (!threadgroup)
2120 break; 2124 break;
2121 } while_each_thread(leader, task); 2125 } while_each_thread(leader, task);
2122 rcu_read_unlock(); 2126 rcu_read_unlock();
2123 up_read(&css_set_rwsem); 2127 up_read(&css_set_rwsem);
2124 2128
2125 /* prepare dst csets and commit */ 2129 /* prepare dst csets and commit */
2126 ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); 2130 ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
2127 if (!ret) 2131 if (!ret)
2128 ret = cgroup_migrate(dst_cgrp, leader, threadgroup); 2132 ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
2129 2133
2130 cgroup_migrate_finish(&preloaded_csets); 2134 cgroup_migrate_finish(&preloaded_csets);
2131 return ret; 2135 return ret;
2132 } 2136 }
2133 2137
2134 /* 2138 /*
2135 * Find the task_struct of the task to attach by vpid and pass it along to the 2139 * Find the task_struct of the task to attach by vpid and pass it along to the
2136 * function to attach either it or all tasks in its threadgroup. Will lock 2140 * function to attach either it or all tasks in its threadgroup. Will lock
2137 * cgroup_mutex and threadgroup. 2141 * cgroup_mutex and threadgroup.
2138 */ 2142 */
2139 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2143 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2140 { 2144 {
2141 struct task_struct *tsk; 2145 struct task_struct *tsk;
2142 const struct cred *cred = current_cred(), *tcred; 2146 const struct cred *cred = current_cred(), *tcred;
2143 int ret; 2147 int ret;
2144 2148
2145 if (!cgroup_lock_live_group(cgrp)) 2149 if (!cgroup_lock_live_group(cgrp))
2146 return -ENODEV; 2150 return -ENODEV;
2147 2151
2148 retry_find_task: 2152 retry_find_task:
2149 rcu_read_lock(); 2153 rcu_read_lock();
2150 if (pid) { 2154 if (pid) {
2151 tsk = find_task_by_vpid(pid); 2155 tsk = find_task_by_vpid(pid);
2152 if (!tsk) { 2156 if (!tsk) {
2153 rcu_read_unlock(); 2157 rcu_read_unlock();
2154 ret = -ESRCH; 2158 ret = -ESRCH;
2155 goto out_unlock_cgroup; 2159 goto out_unlock_cgroup;
2156 } 2160 }
2157 /* 2161 /*
2158 * even if we're attaching all tasks in the thread group, we 2162 * even if we're attaching all tasks in the thread group, we
2159 * only need to check permissions on one of them. 2163 * only need to check permissions on one of them.
2160 */ 2164 */
2161 tcred = __task_cred(tsk); 2165 tcred = __task_cred(tsk);
2162 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && 2166 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
2163 !uid_eq(cred->euid, tcred->uid) && 2167 !uid_eq(cred->euid, tcred->uid) &&
2164 !uid_eq(cred->euid, tcred->suid)) { 2168 !uid_eq(cred->euid, tcred->suid)) {
2165 rcu_read_unlock(); 2169 rcu_read_unlock();
2166 ret = -EACCES; 2170 ret = -EACCES;
2167 goto out_unlock_cgroup; 2171 goto out_unlock_cgroup;
2168 } 2172 }
2169 } else 2173 } else
2170 tsk = current; 2174 tsk = current;
2171 2175
2172 if (threadgroup) 2176 if (threadgroup)
2173 tsk = tsk->group_leader; 2177 tsk = tsk->group_leader;
2174 2178
2175 /* 2179 /*
2176 * Workqueue threads may acquire PF_NO_SETAFFINITY and become 2180 * Workqueue threads may acquire PF_NO_SETAFFINITY and become
2177 * trapped in a cpuset, or an RT worker may be born in a cgroup 2181 * trapped in a cpuset, or an RT worker may be born in a cgroup
2178 * with no rt_runtime allocated. Just say no. 2182 * with no rt_runtime allocated. Just say no.
2179 */ 2183 */
2180 if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { 2184 if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
2181 ret = -EINVAL; 2185 ret = -EINVAL;
2182 rcu_read_unlock(); 2186 rcu_read_unlock();
2183 goto out_unlock_cgroup; 2187 goto out_unlock_cgroup;
2184 } 2188 }
2185 2189
2186 get_task_struct(tsk); 2190 get_task_struct(tsk);
2187 rcu_read_unlock(); 2191 rcu_read_unlock();
2188 2192
2189 threadgroup_lock(tsk); 2193 threadgroup_lock(tsk);
2190 if (threadgroup) { 2194 if (threadgroup) {
2191 if (!thread_group_leader(tsk)) { 2195 if (!thread_group_leader(tsk)) {
2192 /* 2196 /*
2193 * a race with de_thread from another thread's exec() 2197 * a race with de_thread from another thread's exec()
2194 * may strip us of our leadership; if this happens, 2198 * may strip us of our leadership; if this happens,
2195 * there is no choice but to throw this task away and 2199 * there is no choice but to throw this task away and
2196 * try again; this is 2200 * try again; this is
2197 * "double-double-toil-and-trouble-check locking". 2201 * "double-double-toil-and-trouble-check locking".
2198 */ 2202 */
2199 threadgroup_unlock(tsk); 2203 threadgroup_unlock(tsk);
2200 put_task_struct(tsk); 2204 put_task_struct(tsk);
2201 goto retry_find_task; 2205 goto retry_find_task;
2202 } 2206 }
2203 } 2207 }
2204 2208
2205 ret = cgroup_attach_task(cgrp, tsk, threadgroup); 2209 ret = cgroup_attach_task(cgrp, tsk, threadgroup);
2206 2210
2207 threadgroup_unlock(tsk); 2211 threadgroup_unlock(tsk);
2208 2212
2209 put_task_struct(tsk); 2213 put_task_struct(tsk);
2210 out_unlock_cgroup: 2214 out_unlock_cgroup:
2211 mutex_unlock(&cgroup_mutex); 2215 mutex_unlock(&cgroup_mutex);
2212 return ret; 2216 return ret;
2213 } 2217 }
2214 2218
2215 /** 2219 /**
2216 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' 2220 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
2217 * @from: attach to all cgroups of a given task 2221 * @from: attach to all cgroups of a given task
2218 * @tsk: the task to be attached 2222 * @tsk: the task to be attached
2219 */ 2223 */
2220 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) 2224 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
2221 { 2225 {
2222 struct cgroup_root *root; 2226 struct cgroup_root *root;
2223 int retval = 0; 2227 int retval = 0;
2224 2228
2225 mutex_lock(&cgroup_mutex); 2229 mutex_lock(&cgroup_mutex);
2226 for_each_root(root) { 2230 for_each_root(root) {
2227 struct cgroup *from_cgrp; 2231 struct cgroup *from_cgrp;
2228 2232
2229 if (root == &cgrp_dfl_root) 2233 if (root == &cgrp_dfl_root)
2230 continue; 2234 continue;
2231 2235
2232 down_read(&css_set_rwsem); 2236 down_read(&css_set_rwsem);
2233 from_cgrp = task_cgroup_from_root(from, root); 2237 from_cgrp = task_cgroup_from_root(from, root);
2234 up_read(&css_set_rwsem); 2238 up_read(&css_set_rwsem);
2235 2239
2236 retval = cgroup_attach_task(from_cgrp, tsk, false); 2240 retval = cgroup_attach_task(from_cgrp, tsk, false);
2237 if (retval) 2241 if (retval)
2238 break; 2242 break;
2239 } 2243 }
2240 mutex_unlock(&cgroup_mutex); 2244 mutex_unlock(&cgroup_mutex);
2241 2245
2242 return retval; 2246 return retval;
2243 } 2247 }
2244 EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 2248 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
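cgroup_attach_task_all() is exported for use by other subsystems. A hedged usage sketch (demo_adopt_owner_cgroups() is hypothetical): a kernel helper thread created on behalf of @owner can be placed into all of @owner's cgroups.

#include <linux/cgroup.h>
#include <linux/printk.h>

static void demo_adopt_owner_cgroups(struct task_struct *owner,
				     struct task_struct *worker)
{
	int err = cgroup_attach_task_all(owner, worker);

	if (err)
		pr_warn("failed to attach worker to owner's cgroups: %d\n",
			err);
}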
2245 2249
2246 static int cgroup_tasks_write(struct cgroup_subsys_state *css, 2250 static int cgroup_tasks_write(struct cgroup_subsys_state *css,
2247 struct cftype *cft, u64 pid) 2251 struct cftype *cft, u64 pid)
2248 { 2252 {
2249 return attach_task_by_pid(css->cgroup, pid, false); 2253 return attach_task_by_pid(css->cgroup, pid, false);
2250 } 2254 }
2251 2255
2252 static int cgroup_procs_write(struct cgroup_subsys_state *css, 2256 static int cgroup_procs_write(struct cgroup_subsys_state *css,
2253 struct cftype *cft, u64 tgid) 2257 struct cftype *cft, u64 tgid)
2254 { 2258 {
2255 return attach_task_by_pid(css->cgroup, tgid, true); 2259 return attach_task_by_pid(css->cgroup, tgid, true);
2256 } 2260 }
2257 2261
2258 static int cgroup_release_agent_write(struct cgroup_subsys_state *css, 2262 static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
2259 struct cftype *cft, char *buffer) 2263 struct cftype *cft, char *buffer)
2260 { 2264 {
2261 struct cgroup_root *root = css->cgroup->root; 2265 struct cgroup_root *root = css->cgroup->root;
2262 2266
2263 BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); 2267 BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX);
2264 if (!cgroup_lock_live_group(css->cgroup)) 2268 if (!cgroup_lock_live_group(css->cgroup))
2265 return -ENODEV; 2269 return -ENODEV;
2266 spin_lock(&release_agent_path_lock); 2270 spin_lock(&release_agent_path_lock);
2267 strlcpy(root->release_agent_path, buffer, 2271 strlcpy(root->release_agent_path, buffer,
2268 sizeof(root->release_agent_path)); 2272 sizeof(root->release_agent_path));
2269 spin_unlock(&release_agent_path_lock); 2273 spin_unlock(&release_agent_path_lock);
2270 mutex_unlock(&cgroup_mutex); 2274 mutex_unlock(&cgroup_mutex);
2271 return 0; 2275 return 0;
2272 } 2276 }
2273 2277
2274 static int cgroup_release_agent_show(struct seq_file *seq, void *v) 2278 static int cgroup_release_agent_show(struct seq_file *seq, void *v)
2275 { 2279 {
2276 struct cgroup *cgrp = seq_css(seq)->cgroup; 2280 struct cgroup *cgrp = seq_css(seq)->cgroup;
2277 2281
2278 if (!cgroup_lock_live_group(cgrp)) 2282 if (!cgroup_lock_live_group(cgrp))
2279 return -ENODEV; 2283 return -ENODEV;
2280 seq_puts(seq, cgrp->root->release_agent_path); 2284 seq_puts(seq, cgrp->root->release_agent_path);
2281 seq_putc(seq, '\n'); 2285 seq_putc(seq, '\n');
2282 mutex_unlock(&cgroup_mutex); 2286 mutex_unlock(&cgroup_mutex);
2283 return 0; 2287 return 0;
2284 } 2288 }
2285 2289
2286 static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) 2290 static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
2287 { 2291 {
2288 struct cgroup *cgrp = seq_css(seq)->cgroup; 2292 struct cgroup *cgrp = seq_css(seq)->cgroup;
2289 2293
2290 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); 2294 seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
2291 return 0; 2295 return 0;
2292 } 2296 }
2293 2297
2294 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, 2298 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
2295 size_t nbytes, loff_t off) 2299 size_t nbytes, loff_t off)
2296 { 2300 {
2297 struct cgroup *cgrp = of->kn->parent->priv; 2301 struct cgroup *cgrp = of->kn->parent->priv;
2298 struct cftype *cft = of->kn->priv; 2302 struct cftype *cft = of->kn->priv;
2299 struct cgroup_subsys_state *css; 2303 struct cgroup_subsys_state *css;
2300 int ret; 2304 int ret;
2301 2305
2302 /* 2306 /*
2303 * kernfs guarantees that a file isn't deleted with operations in 2307 * kernfs guarantees that a file isn't deleted with operations in
2304 * flight, which means that the matching css is and stays alive and 2308 * flight, which means that the matching css is and stays alive and
2305 * doesn't need to be pinned. The RCU locking is not necessary 2309 * doesn't need to be pinned. The RCU locking is not necessary
2306 * either. It's just for the convenience of using cgroup_css(). 2310 * either. It's just for the convenience of using cgroup_css().
2307 */ 2311 */
2308 rcu_read_lock(); 2312 rcu_read_lock();
2309 css = cgroup_css(cgrp, cft->ss); 2313 css = cgroup_css(cgrp, cft->ss);
2310 rcu_read_unlock(); 2314 rcu_read_unlock();
2311 2315
2312 if (cft->write_string) { 2316 if (cft->write_string) {
2313 ret = cft->write_string(css, cft, strstrip(buf)); 2317 ret = cft->write_string(css, cft, strstrip(buf));
2314 } else if (cft->write_u64) { 2318 } else if (cft->write_u64) {
2315 unsigned long long v; 2319 unsigned long long v;
2316 ret = kstrtoull(buf, 0, &v); 2320 ret = kstrtoull(buf, 0, &v);
2317 if (!ret) 2321 if (!ret)
2318 ret = cft->write_u64(css, cft, v); 2322 ret = cft->write_u64(css, cft, v);
2319 } else if (cft->write_s64) { 2323 } else if (cft->write_s64) {
2320 long long v; 2324 long long v;
2321 ret = kstrtoll(buf, 0, &v); 2325 ret = kstrtoll(buf, 0, &v);
2322 if (!ret) 2326 if (!ret)
2323 ret = cft->write_s64(css, cft, v); 2327 ret = cft->write_s64(css, cft, v);
2324 } else if (cft->trigger) { 2328 } else if (cft->trigger) {
2325 ret = cft->trigger(css, (unsigned int)cft->private); 2329 ret = cft->trigger(css, (unsigned int)cft->private);
2326 } else { 2330 } else {
2327 ret = -EINVAL; 2331 ret = -EINVAL;
2328 } 2332 }
2329 2333
2330 return ret ?: nbytes; 2334 return ret ?: nbytes;
2331 } 2335 }
2332 2336
2333 static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) 2337 static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
2334 { 2338 {
2335 return seq_cft(seq)->seq_start(seq, ppos); 2339 return seq_cft(seq)->seq_start(seq, ppos);
2336 } 2340 }
2337 2341
2338 static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) 2342 static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
2339 { 2343 {
2340 return seq_cft(seq)->seq_next(seq, v, ppos); 2344 return seq_cft(seq)->seq_next(seq, v, ppos);
2341 } 2345 }
2342 2346
2343 static void cgroup_seqfile_stop(struct seq_file *seq, void *v) 2347 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
2344 { 2348 {
2345 seq_cft(seq)->seq_stop(seq, v); 2349 seq_cft(seq)->seq_stop(seq, v);
2346 } 2350 }
2347 2351
2348 static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2352 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
2349 { 2353 {
2350 struct cftype *cft = seq_cft(m); 2354 struct cftype *cft = seq_cft(m);
2351 struct cgroup_subsys_state *css = seq_css(m); 2355 struct cgroup_subsys_state *css = seq_css(m);
2352 2356
2353 if (cft->seq_show) 2357 if (cft->seq_show)
2354 return cft->seq_show(m, arg); 2358 return cft->seq_show(m, arg);
2355 2359
2356 if (cft->read_u64) 2360 if (cft->read_u64)
2357 seq_printf(m, "%llu\n", cft->read_u64(css, cft)); 2361 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
2358 else if (cft->read_s64) 2362 else if (cft->read_s64)
2359 seq_printf(m, "%lld\n", cft->read_s64(css, cft)); 2363 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
2360 else 2364 else
2361 return -EINVAL; 2365 return -EINVAL;
2362 return 0; 2366 return 0;
2363 } 2367 }
2364 2368
2365 static struct kernfs_ops cgroup_kf_single_ops = { 2369 static struct kernfs_ops cgroup_kf_single_ops = {
2366 .atomic_write_len = PAGE_SIZE, 2370 .atomic_write_len = PAGE_SIZE,
2367 .write = cgroup_file_write, 2371 .write = cgroup_file_write,
2368 .seq_show = cgroup_seqfile_show, 2372 .seq_show = cgroup_seqfile_show,
2369 }; 2373 };
2370 2374
2371 static struct kernfs_ops cgroup_kf_ops = { 2375 static struct kernfs_ops cgroup_kf_ops = {
2372 .atomic_write_len = PAGE_SIZE, 2376 .atomic_write_len = PAGE_SIZE,
2373 .write = cgroup_file_write, 2377 .write = cgroup_file_write,
2374 .seq_start = cgroup_seqfile_start, 2378 .seq_start = cgroup_seqfile_start,
2375 .seq_next = cgroup_seqfile_next, 2379 .seq_next = cgroup_seqfile_next,
2376 .seq_stop = cgroup_seqfile_stop, 2380 .seq_stop = cgroup_seqfile_stop,
2377 .seq_show = cgroup_seqfile_show, 2381 .seq_show = cgroup_seqfile_show,
2378 }; 2382 };
2379 2383
2380 /* 2384 /*
2381 * cgroup_rename - Only allow simple rename of directories in place. 2385 * cgroup_rename - Only allow simple rename of directories in place.
2382 */ 2386 */
2383 static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, 2387 static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
2384 const char *new_name_str) 2388 const char *new_name_str)
2385 { 2389 {
2386 struct cgroup *cgrp = kn->priv; 2390 struct cgroup *cgrp = kn->priv;
2387 int ret; 2391 int ret;
2388 2392
2389 if (kernfs_type(kn) != KERNFS_DIR) 2393 if (kernfs_type(kn) != KERNFS_DIR)
2390 return -ENOTDIR; 2394 return -ENOTDIR;
2391 if (kn->parent != new_parent) 2395 if (kn->parent != new_parent)
2392 return -EIO; 2396 return -EIO;
2393 2397
2394 /* 2398 /*
2395 * This isn't a proper migration and its usefulness is very 2399 * This isn't a proper migration and its usefulness is very
2396 * limited. Disallow if sane_behavior. 2400 * limited. Disallow if sane_behavior.
2397 */ 2401 */
2398 if (cgroup_sane_behavior(cgrp)) 2402 if (cgroup_sane_behavior(cgrp))
2399 return -EPERM; 2403 return -EPERM;
2400 2404
2401 /* 2405 /*
2402 * We're gonna grab cgroup_tree_mutex which nests outside kernfs 2406 * We're gonna grab cgroup_tree_mutex which nests outside kernfs
2403 * active_ref. kernfs_rename() doesn't require active_ref 2407 * active_ref. kernfs_rename() doesn't require active_ref
2404 * protection. Break them before grabbing cgroup_tree_mutex. 2408 * protection. Break them before grabbing cgroup_tree_mutex.
2405 */ 2409 */
2406 kernfs_break_active_protection(new_parent); 2410 kernfs_break_active_protection(new_parent);
2407 kernfs_break_active_protection(kn); 2411 kernfs_break_active_protection(kn);
2408 2412
2409 mutex_lock(&cgroup_tree_mutex); 2413 mutex_lock(&cgroup_tree_mutex);
2410 mutex_lock(&cgroup_mutex); 2414 mutex_lock(&cgroup_mutex);
2411 2415
2412 ret = kernfs_rename(kn, new_parent, new_name_str); 2416 ret = kernfs_rename(kn, new_parent, new_name_str);
2413 2417
2414 mutex_unlock(&cgroup_mutex); 2418 mutex_unlock(&cgroup_mutex);
2415 mutex_unlock(&cgroup_tree_mutex); 2419 mutex_unlock(&cgroup_tree_mutex);
2416 2420
2417 kernfs_unbreak_active_protection(kn); 2421 kernfs_unbreak_active_protection(kn);
2418 kernfs_unbreak_active_protection(new_parent); 2422 kernfs_unbreak_active_protection(new_parent);
2419 return ret; 2423 return ret;
2420 } 2424 }
2421 2425
2422 /* set uid and gid of cgroup dirs and files to that of the creator */ 2426 /* set uid and gid of cgroup dirs and files to that of the creator */
2423 static int cgroup_kn_set_ugid(struct kernfs_node *kn) 2427 static int cgroup_kn_set_ugid(struct kernfs_node *kn)
2424 { 2428 {
2425 struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, 2429 struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
2426 .ia_uid = current_fsuid(), 2430 .ia_uid = current_fsuid(),
2427 .ia_gid = current_fsgid(), }; 2431 .ia_gid = current_fsgid(), };
2428 2432
2429 if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && 2433 if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
2430 gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) 2434 gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
2431 return 0; 2435 return 0;
2432 2436
2433 return kernfs_setattr(kn, &iattr); 2437 return kernfs_setattr(kn, &iattr);
2434 } 2438 }
2435 2439
2436 static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) 2440 static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
2437 { 2441 {
2438 char name[CGROUP_FILE_NAME_MAX]; 2442 char name[CGROUP_FILE_NAME_MAX];
2439 struct kernfs_node *kn; 2443 struct kernfs_node *kn;
2440 struct lock_class_key *key = NULL; 2444 struct lock_class_key *key = NULL;
2441 int ret; 2445 int ret;
2442 2446
2443 #ifdef CONFIG_DEBUG_LOCK_ALLOC 2447 #ifdef CONFIG_DEBUG_LOCK_ALLOC
2444 key = &cft->lockdep_key; 2448 key = &cft->lockdep_key;
2445 #endif 2449 #endif
2446 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), 2450 kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
2447 cgroup_file_mode(cft), 0, cft->kf_ops, cft, 2451 cgroup_file_mode(cft), 0, cft->kf_ops, cft,
2448 NULL, false, key); 2452 NULL, false, key);
2449 if (IS_ERR(kn)) 2453 if (IS_ERR(kn))
2450 return PTR_ERR(kn); 2454 return PTR_ERR(kn);
2451 2455
2452 ret = cgroup_kn_set_ugid(kn); 2456 ret = cgroup_kn_set_ugid(kn);
2453 if (ret) 2457 if (ret)
2454 kernfs_remove(kn); 2458 kernfs_remove(kn);
2455 return ret; 2459 return ret;
2456 } 2460 }
2457 2461
2458 /** 2462 /**
2459 * cgroup_addrm_files - add or remove files in a cgroup directory 2463 * cgroup_addrm_files - add or remove files in a cgroup directory
2460 * @cgrp: the target cgroup 2464 * @cgrp: the target cgroup
2461 * @cfts: array of cftypes to be added 2465 * @cfts: array of cftypes to be added
2462 * @is_add: whether to add or remove 2466 * @is_add: whether to add or remove
2463 * 2467 *
2464 * Depending on @is_add, add or remove files defined by @cfts on @cgrp. 2468 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
2465 * For removals, this function never fails. If addition fails, this 2469 * For removals, this function never fails. If addition fails, this
2466 * function doesn't remove files already added. The caller is responsible 2470 * function doesn't remove files already added. The caller is responsible
2467 * for cleaning up. 2471 * for cleaning up.
2468 */ 2472 */
2469 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 2473 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
2470 bool is_add) 2474 bool is_add)
2471 { 2475 {
2472 struct cftype *cft; 2476 struct cftype *cft;
2473 int ret; 2477 int ret;
2474 2478
2475 lockdep_assert_held(&cgroup_tree_mutex); 2479 lockdep_assert_held(&cgroup_tree_mutex);
2476 2480
2477 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2481 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2478 /* does cft->flags tell us to skip this file on @cgrp? */ 2482 /* does cft->flags tell us to skip this file on @cgrp? */
2479 if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) 2483 if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
2480 continue; 2484 continue;
2481 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) 2485 if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
2482 continue; 2486 continue;
2483 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2487 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
2484 continue; 2488 continue;
2485 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) 2489 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
2486 continue; 2490 continue;
2487 2491
2488 if (is_add) { 2492 if (is_add) {
2489 ret = cgroup_add_file(cgrp, cft); 2493 ret = cgroup_add_file(cgrp, cft);
2490 if (ret) { 2494 if (ret) {
2491 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", 2495 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
2492 cft->name, ret); 2496 cft->name, ret);
2493 return ret; 2497 return ret;
2494 } 2498 }
2495 } else { 2499 } else {
2496 cgroup_rm_file(cgrp, cft); 2500 cgroup_rm_file(cgrp, cft);
2497 } 2501 }
2498 } 2502 }
2499 return 0; 2503 return 0;
2500 } 2504 }
2501 2505
2502 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) 2506 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
2503 { 2507 {
2504 LIST_HEAD(pending); 2508 LIST_HEAD(pending);
2505 struct cgroup_subsys *ss = cfts[0].ss; 2509 struct cgroup_subsys *ss = cfts[0].ss;
2506 struct cgroup *root = &ss->root->cgrp; 2510 struct cgroup *root = &ss->root->cgrp;
2507 struct cgroup_subsys_state *css; 2511 struct cgroup_subsys_state *css;
2508 int ret = 0; 2512 int ret = 0;
2509 2513
2510 lockdep_assert_held(&cgroup_tree_mutex); 2514 lockdep_assert_held(&cgroup_tree_mutex);
2511 2515
2512 /* add/rm files for all cgroups created before */ 2516 /* add/rm files for all cgroups created before */
2513 css_for_each_descendant_pre(css, cgroup_css(root, ss)) { 2517 css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
2514 struct cgroup *cgrp = css->cgroup; 2518 struct cgroup *cgrp = css->cgroup;
2515 2519
2516 if (cgroup_is_dead(cgrp)) 2520 if (cgroup_is_dead(cgrp))
2517 continue; 2521 continue;
2518 2522
2519 ret = cgroup_addrm_files(cgrp, cfts, is_add); 2523 ret = cgroup_addrm_files(cgrp, cfts, is_add);
2520 if (ret) 2524 if (ret)
2521 break; 2525 break;
2522 } 2526 }
2523 2527
2524 if (is_add && !ret) 2528 if (is_add && !ret)
2525 kernfs_activate(root->kn); 2529 kernfs_activate(root->kn);
2526 return ret; 2530 return ret;
2527 } 2531 }
2528 2532
2529 static void cgroup_exit_cftypes(struct cftype *cfts) 2533 static void cgroup_exit_cftypes(struct cftype *cfts)
2530 { 2534 {
2531 struct cftype *cft; 2535 struct cftype *cft;
2532 2536
2533 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2537 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2534 /* free copy for custom atomic_write_len, see init_cftypes() */ 2538 /* free copy for custom atomic_write_len, see init_cftypes() */
2535 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) 2539 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
2536 kfree(cft->kf_ops); 2540 kfree(cft->kf_ops);
2537 cft->kf_ops = NULL; 2541 cft->kf_ops = NULL;
2538 cft->ss = NULL; 2542 cft->ss = NULL;
2539 } 2543 }
2540 } 2544 }
2541 2545
2542 static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2546 static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2543 { 2547 {
2544 struct cftype *cft; 2548 struct cftype *cft;
2545 2549
2546 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2550 for (cft = cfts; cft->name[0] != '\0'; cft++) {
2547 struct kernfs_ops *kf_ops; 2551 struct kernfs_ops *kf_ops;
2548 2552
2549 WARN_ON(cft->ss || cft->kf_ops); 2553 WARN_ON(cft->ss || cft->kf_ops);
2550 2554
2551 if (cft->seq_start) 2555 if (cft->seq_start)
2552 kf_ops = &cgroup_kf_ops; 2556 kf_ops = &cgroup_kf_ops;
2553 else 2557 else
2554 kf_ops = &cgroup_kf_single_ops; 2558 kf_ops = &cgroup_kf_single_ops;
2555 2559
2556 /* 2560 /*
2557 * Ugh... if @cft wants a custom max_write_len, we need to 2561 * Ugh... if @cft wants a custom max_write_len, we need to
2558 * make a copy of kf_ops to set its atomic_write_len. 2562 * make a copy of kf_ops to set its atomic_write_len.
2559 */ 2563 */
2560 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { 2564 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
2561 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); 2565 kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
2562 if (!kf_ops) { 2566 if (!kf_ops) {
2563 cgroup_exit_cftypes(cfts); 2567 cgroup_exit_cftypes(cfts);
2564 return -ENOMEM; 2568 return -ENOMEM;
2565 } 2569 }
2566 kf_ops->atomic_write_len = cft->max_write_len; 2570 kf_ops->atomic_write_len = cft->max_write_len;
2567 } 2571 }
2568 2572
2569 cft->kf_ops = kf_ops; 2573 cft->kf_ops = kf_ops;
2570 cft->ss = ss; 2574 cft->ss = ss;
2571 } 2575 }
2572 2576
2573 return 0; 2577 return 0;
2574 } 2578 }
2575 2579
2576 static int cgroup_rm_cftypes_locked(struct cftype *cfts) 2580 static int cgroup_rm_cftypes_locked(struct cftype *cfts)
2577 { 2581 {
2578 lockdep_assert_held(&cgroup_tree_mutex); 2582 lockdep_assert_held(&cgroup_tree_mutex);
2579 2583
2580 if (!cfts || !cfts[0].ss) 2584 if (!cfts || !cfts[0].ss)
2581 return -ENOENT; 2585 return -ENOENT;
2582 2586
2583 list_del(&cfts->node); 2587 list_del(&cfts->node);
2584 cgroup_apply_cftypes(cfts, false); 2588 cgroup_apply_cftypes(cfts, false);
2585 cgroup_exit_cftypes(cfts); 2589 cgroup_exit_cftypes(cfts);
2586 return 0; 2590 return 0;
2587 } 2591 }
2588 2592
2589 /** 2593 /**
2590 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem 2594 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
2591 * @cfts: zero-length name terminated array of cftypes 2595 * @cfts: zero-length name terminated array of cftypes
2592 * 2596 *
2593 * Unregister @cfts. Files described by @cfts are removed from all 2597 * Unregister @cfts. Files described by @cfts are removed from all
2594 * existing cgroups and all future cgroups won't have them either. This 2598 * existing cgroups and all future cgroups won't have them either. This
2595 * function can be called anytime whether @cfts' subsys is attached or not. 2599 * function can be called anytime whether @cfts' subsys is attached or not.
2596 * 2600 *
2597 * Returns 0 on successful unregistration, -ENOENT if @cfts is not 2601 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
2598 * registered. 2602 * registered.
2599 */ 2603 */
2600 int cgroup_rm_cftypes(struct cftype *cfts) 2604 int cgroup_rm_cftypes(struct cftype *cfts)
2601 { 2605 {
2602 int ret; 2606 int ret;
2603 2607
2604 mutex_lock(&cgroup_tree_mutex); 2608 mutex_lock(&cgroup_tree_mutex);
2605 ret = cgroup_rm_cftypes_locked(cfts); 2609 ret = cgroup_rm_cftypes_locked(cfts);
2606 mutex_unlock(&cgroup_tree_mutex); 2610 mutex_unlock(&cgroup_tree_mutex);
2607 return ret; 2611 return ret;
2608 } 2612 }
2609 2613
2610 /** 2614 /**
2611 * cgroup_add_cftypes - add an array of cftypes to a subsystem 2615 * cgroup_add_cftypes - add an array of cftypes to a subsystem
2612 * @ss: target cgroup subsystem 2616 * @ss: target cgroup subsystem
2613 * @cfts: zero-length name terminated array of cftypes 2617 * @cfts: zero-length name terminated array of cftypes
2614 * 2618 *
2615 * Register @cfts to @ss. Files described by @cfts are created for all 2619 * Register @cfts to @ss. Files described by @cfts are created for all
2616 * existing cgroups to which @ss is attached and all future cgroups will 2620 * existing cgroups to which @ss is attached and all future cgroups will
2617 * have them too. This function can be called anytime whether @ss is 2621 * have them too. This function can be called anytime whether @ss is
2618 * attached or not. 2622 * attached or not.
2619 * 2623 *
2620 * Returns 0 on successful registration, -errno on failure. Note that this 2624 * Returns 0 on successful registration, -errno on failure. Note that this
2621 * function currently returns 0 as long as @cfts registration is successful 2625 * function currently returns 0 as long as @cfts registration is successful
2622 * even if some file creation attempts on existing cgroups fail. 2626 * even if some file creation attempts on existing cgroups fail.
2623 */ 2627 */
2624 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2628 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
2625 { 2629 {
2626 int ret; 2630 int ret;
2627 2631
2628 if (!cfts || cfts[0].name[0] == '\0') 2632 if (!cfts || cfts[0].name[0] == '\0')
2629 return 0; 2633 return 0;
2630 2634
2631 ret = cgroup_init_cftypes(ss, cfts); 2635 ret = cgroup_init_cftypes(ss, cfts);
2632 if (ret) 2636 if (ret)
2633 return ret; 2637 return ret;
2634 2638
2635 mutex_lock(&cgroup_tree_mutex); 2639 mutex_lock(&cgroup_tree_mutex);
2636 2640
2637 list_add_tail(&cfts->node, &ss->cfts); 2641 list_add_tail(&cfts->node, &ss->cfts);
2638 ret = cgroup_apply_cftypes(cfts, true); 2642 ret = cgroup_apply_cftypes(cfts, true);
2639 if (ret) 2643 if (ret)
2640 cgroup_rm_cftypes_locked(cfts); 2644 cgroup_rm_cftypes_locked(cfts);
2641 2645
2642 mutex_unlock(&cgroup_tree_mutex); 2646 mutex_unlock(&cgroup_tree_mutex);
2643 return ret; 2647 return ret;
2644 } 2648 }
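To illustrate the cftype interface that cgroup_add_cftypes() registers, here is a hedged sketch of a controller describing a single u64 knob; every demo_* identifier is hypothetical, and the stored value is a global placeholder rather than real per-cgroup state.

static u64 demo_limit;

/* read back via cgroup_seqfile_show() -> cft->read_u64() */
static u64 demo_limit_read(struct cgroup_subsys_state *css,
			   struct cftype *cft)
{
	return demo_limit;
}

/* updated via cgroup_file_write() -> cft->write_u64() */
static int demo_limit_write(struct cgroup_subsys_state *css,
			    struct cftype *cft, u64 val)
{
	demo_limit = val;
	return 0;
}

static struct cftype demo_files[] = {
	{
		.name = "demo.limit",
		.read_u64 = demo_limit_read,
		.write_u64 = demo_limit_write,
	},
	{ }	/* terminator: entry with a zero-length name */
};

/* typically invoked from the controller's init path:
 *	cgroup_add_cftypes(&demo_cgrp_subsys, demo_files);
 */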
2645 2649
2646 /** 2650 /**
2647 * cgroup_task_count - count the number of tasks in a cgroup. 2651 * cgroup_task_count - count the number of tasks in a cgroup.
2648 * @cgrp: the cgroup in question 2652 * @cgrp: the cgroup in question
2649 * 2653 *
2650 * Return the number of tasks in the cgroup. 2654 * Return the number of tasks in the cgroup.
2651 */ 2655 */
2652 static int cgroup_task_count(const struct cgroup *cgrp) 2656 static int cgroup_task_count(const struct cgroup *cgrp)
2653 { 2657 {
2654 int count = 0; 2658 int count = 0;
2655 struct cgrp_cset_link *link; 2659 struct cgrp_cset_link *link;
2656 2660
2657 down_read(&css_set_rwsem); 2661 down_read(&css_set_rwsem);
2658 list_for_each_entry(link, &cgrp->cset_links, cset_link) 2662 list_for_each_entry(link, &cgrp->cset_links, cset_link)
2659 count += atomic_read(&link->cset->refcount); 2663 count += atomic_read(&link->cset->refcount);
2660 up_read(&css_set_rwsem); 2664 up_read(&css_set_rwsem);
2661 return count; 2665 return count;
2662 } 2666 }
2663 2667
2664 /** 2668 /**
2665 * css_next_child - find the next child of a given css 2669 * css_next_child - find the next child of a given css
2666 * @pos_css: the current position (%NULL to initiate traversal) 2670 * @pos_css: the current position (%NULL to initiate traversal)
2667 * @parent_css: css whose children to walk 2671 * @parent_css: css whose children to walk
2668 * 2672 *
2669 * This function returns the next child of @parent_css and should be called 2673 * This function returns the next child of @parent_css and should be called
2670 * under either cgroup_mutex or RCU read lock. The only requirement is 2674 * under either cgroup_mutex or RCU read lock. The only requirement is
2671 * that @parent_css and @pos_css are accessible. The next sibling is 2675 * that @parent_css and @pos_css are accessible. The next sibling is
2672 * guaranteed to be returned regardless of their states. 2676 * guaranteed to be returned regardless of their states.
2673 */ 2677 */
2674 struct cgroup_subsys_state * 2678 struct cgroup_subsys_state *
2675 css_next_child(struct cgroup_subsys_state *pos_css, 2679 css_next_child(struct cgroup_subsys_state *pos_css,
2676 struct cgroup_subsys_state *parent_css) 2680 struct cgroup_subsys_state *parent_css)
2677 { 2681 {
2678 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; 2682 struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
2679 struct cgroup *cgrp = parent_css->cgroup; 2683 struct cgroup *cgrp = parent_css->cgroup;
2680 struct cgroup *next; 2684 struct cgroup *next;
2681 2685
2682 cgroup_assert_mutexes_or_rcu_locked(); 2686 cgroup_assert_mutexes_or_rcu_locked();
2683 2687
2684 /* 2688 /*
2685 * @pos could already have been removed. Once a cgroup is removed, 2689 * @pos could already have been removed. Once a cgroup is removed,
2686 * its ->sibling.next is no longer updated when its next sibling 2690 * its ->sibling.next is no longer updated when its next sibling
2687 * changes. As CGRP_DEAD assertion is serialized and happens 2691 * changes. As CGRP_DEAD assertion is serialized and happens
2688 * before the cgroup is taken off the ->sibling list, if we see it 2692 * before the cgroup is taken off the ->sibling list, if we see it
2689 * unasserted, it's guaranteed that the next sibling hasn't 2693 * unasserted, it's guaranteed that the next sibling hasn't
2690 * finished its grace period even if it's already removed, and thus 2694 * finished its grace period even if it's already removed, and thus
2691 * safe to dereference from this RCU critical section. If 2695 * safe to dereference from this RCU critical section. If
2692 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed 2696 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
2693 * to be visible as %true here. 2697 * to be visible as %true here.
2694 * 2698 *
2695 * If @pos is dead, its next pointer can't be dereferenced; 2699 * If @pos is dead, its next pointer can't be dereferenced;
2696 * however, as each cgroup is given a monotonically increasing 2700 * however, as each cgroup is given a monotonically increasing
2697 * unique serial number and always appended to the sibling list, 2701 * unique serial number and always appended to the sibling list,
2698 * the next one can be found by walking the parent's children until 2702 * the next one can be found by walking the parent's children until
2699 * we see a cgroup with higher serial number than @pos's. While 2703 * we see a cgroup with higher serial number than @pos's. While
2700 * this path can be slower, it's taken only when either the current 2704 * this path can be slower, it's taken only when either the current
2701 * cgroup is removed or iteration and removal race. 2705 * cgroup is removed or iteration and removal race.
2702 */ 2706 */
2703 if (!pos) { 2707 if (!pos) {
2704 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); 2708 next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
2705 } else if (likely(!cgroup_is_dead(pos))) { 2709 } else if (likely(!cgroup_is_dead(pos))) {
2706 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 2710 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
2707 } else { 2711 } else {
2708 list_for_each_entry_rcu(next, &cgrp->children, sibling) 2712 list_for_each_entry_rcu(next, &cgrp->children, sibling)
2709 if (next->serial_nr > pos->serial_nr) 2713 if (next->serial_nr > pos->serial_nr)
2710 break; 2714 break;
2711 } 2715 }
2712 2716
2713 /* 2717 /*
2714 * @next, if not pointing to the head, can be dereferenced and is 2718 * @next, if not pointing to the head, can be dereferenced and is
2715 * the next sibling; however, it might have @ss disabled. If so, 2719 * the next sibling; however, it might have @ss disabled. If so,
2716 * fast-forward to the next enabled one. 2720 * fast-forward to the next enabled one.
2717 */ 2721 */
2718 while (&next->sibling != &cgrp->children) { 2722 while (&next->sibling != &cgrp->children) {
2719 struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss); 2723 struct cgroup_subsys_state *next_css = cgroup_css(next, parent_css->ss);
2720 2724
2721 if (next_css) 2725 if (next_css)
2722 return next_css; 2726 return next_css;
2723 next = list_entry_rcu(next->sibling.next, struct cgroup, sibling); 2727 next = list_entry_rcu(next->sibling.next, struct cgroup, sibling);
2724 } 2728 }
2725 return NULL; 2729 return NULL;
2726 } 2730 }
2727 2731
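As a usage note (an illustrative sketch, not part of this commit): css_next_child() is normally consumed through the css_for_each_child() iterator declared in include/linux/cgroup.h. A minimal example, assuming the surrounding file already includes linux/cgroup.h and the caller already holds @parent_css, needing only RCU protection for the walk:

/*
 * Hypothetical helper, for illustration only: count the direct
 * children of @parent_css.  The walk needs only the RCU read lock;
 * a returned child may not be online yet (or may already be dying),
 * so real users typically pin it with css_tryget() before doing
 * anything more involved with it.
 */
static int count_children(struct cgroup_subsys_state *parent_css)
{
	struct cgroup_subsys_state *child;
	int n = 0;

	rcu_read_lock();
	css_for_each_child(child, parent_css)
		n++;
	rcu_read_unlock();

	return n;
}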
2728 /** 2732 /**
2729 * css_next_descendant_pre - find the next descendant for pre-order walk 2733 * css_next_descendant_pre - find the next descendant for pre-order walk
2730 * @pos: the current position (%NULL to initiate traversal) 2734 * @pos: the current position (%NULL to initiate traversal)
2731 * @root: css whose descendants to walk 2735 * @root: css whose descendants to walk
2732 * 2736 *
2733 * To be used by css_for_each_descendant_pre(). Find the next descendant 2737 * To be used by css_for_each_descendant_pre(). Find the next descendant
2734 * to visit for pre-order traversal of @root's descendants. @root is 2738 * to visit for pre-order traversal of @root's descendants. @root is
2735 * included in the iteration and the first node to be visited. 2739 * included in the iteration and the first node to be visited.
2736 * 2740 *
2737 * While this function requires cgroup_mutex or RCU read locking, it 2741 * While this function requires cgroup_mutex or RCU read locking, it
2738 * doesn't require the whole traversal to be contained in a single critical 2742 * doesn't require the whole traversal to be contained in a single critical
2739 * section. This function will return the correct next descendant as long 2743 * section. This function will return the correct next descendant as long
2740 * as both @pos and @root are accessible and @pos is a descendant of @root. 2744 * as both @pos and @root are accessible and @pos is a descendant of @root.
2741 */ 2745 */
2742 struct cgroup_subsys_state * 2746 struct cgroup_subsys_state *
2743 css_next_descendant_pre(struct cgroup_subsys_state *pos, 2747 css_next_descendant_pre(struct cgroup_subsys_state *pos,
2744 struct cgroup_subsys_state *root) 2748 struct cgroup_subsys_state *root)
2745 { 2749 {
2746 struct cgroup_subsys_state *next; 2750 struct cgroup_subsys_state *next;
2747 2751
2748 cgroup_assert_mutexes_or_rcu_locked(); 2752 cgroup_assert_mutexes_or_rcu_locked();
2749 2753
2750 /* if first iteration, visit @root */ 2754 /* if first iteration, visit @root */
2751 if (!pos) 2755 if (!pos)
2752 return root; 2756 return root;
2753 2757
2754 /* visit the first child if exists */ 2758 /* visit the first child if exists */
2755 next = css_next_child(NULL, pos); 2759 next = css_next_child(NULL, pos);
2756 if (next) 2760 if (next)
2757 return next; 2761 return next;
2758 2762
2759 /* no child, visit my or the closest ancestor's next sibling */ 2763 /* no child, visit my or the closest ancestor's next sibling */
2760 while (pos != root) { 2764 while (pos != root) {
2761 next = css_next_child(pos, css_parent(pos)); 2765 next = css_next_child(pos, css_parent(pos));
2762 if (next) 2766 if (next)
2763 return next; 2767 return next;
2764 pos = css_parent(pos); 2768 pos = css_parent(pos);
2765 } 2769 }
2766 2770
2767 return NULL; 2771 return NULL;
2768 } 2772 }
2769 2773
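For reference (an illustrative sketch, not from this commit): pre-order walks are normally written with the css_for_each_descendant_pre() wrapper, which is built on css_next_descendant_pre() above and therefore visits @root first and every parent before its children. propagate_config() is a made-up stand-in for whatever per-css work the caller does:

	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root)
		propagate_config(pos);	/* hypothetical top-down update */
	rcu_read_unlock();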
2770 /** 2774 /**
2771 * css_rightmost_descendant - return the rightmost descendant of a css 2775 * css_rightmost_descendant - return the rightmost descendant of a css
2772 * @pos: css of interest 2776 * @pos: css of interest
2773 * 2777 *
2774 * Return the rightmost descendant of @pos. If there's no descendant, @pos 2778 * Return the rightmost descendant of @pos. If there's no descendant, @pos
2775 * is returned. This can be used during pre-order traversal to skip 2779 * is returned. This can be used during pre-order traversal to skip
2776 * subtree of @pos. 2780 * subtree of @pos.
2777 * 2781 *
2778 * While this function requires cgroup_mutex or RCU read locking, it 2782 * While this function requires cgroup_mutex or RCU read locking, it
2779 * doesn't require the whole traversal to be contained in a single critical 2783 * doesn't require the whole traversal to be contained in a single critical
2780 * section. This function will return the correct rightmost descendant as 2784 * section. This function will return the correct rightmost descendant as
2781 * long as @pos is accessible. 2785 * long as @pos is accessible.
2782 */ 2786 */
2783 struct cgroup_subsys_state * 2787 struct cgroup_subsys_state *
2784 css_rightmost_descendant(struct cgroup_subsys_state *pos) 2788 css_rightmost_descendant(struct cgroup_subsys_state *pos)
2785 { 2789 {
2786 struct cgroup_subsys_state *last, *tmp; 2790 struct cgroup_subsys_state *last, *tmp;
2787 2791
2788 cgroup_assert_mutexes_or_rcu_locked(); 2792 cgroup_assert_mutexes_or_rcu_locked();
2789 2793
2790 do { 2794 do {
2791 last = pos; 2795 last = pos;
2792 /* ->prev isn't RCU safe, walk ->next till the end */ 2796 /* ->prev isn't RCU safe, walk ->next till the end */
2793 pos = NULL; 2797 pos = NULL;
2794 css_for_each_child(tmp, last) 2798 css_for_each_child(tmp, last)
2795 pos = tmp; 2799 pos = tmp;
2796 } while (pos); 2800 } while (pos);
2797 2801
2798 return last; 2802 return last;
2799 } 2803 }
2800 2804
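The comment above notes that css_rightmost_descendant() can be used to prune a subtree during a pre-order walk; a hedged sketch of that pattern follows, where subtree_is_interesting() and handle_css() are made up for illustration:

	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root) {
		if (!subtree_is_interesting(pos)) {
			/*
			 * Jump @pos to its rightmost descendant so the
			 * next step of the pre-order walk continues with
			 * @pos's next sibling (or an ancestor's sibling)
			 * instead of descending into this subtree.
			 */
			pos = css_rightmost_descendant(pos);
			continue;
		}
		handle_css(pos);
	}
	rcu_read_unlock();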
2801 static struct cgroup_subsys_state * 2805 static struct cgroup_subsys_state *
2802 css_leftmost_descendant(struct cgroup_subsys_state *pos) 2806 css_leftmost_descendant(struct cgroup_subsys_state *pos)
2803 { 2807 {
2804 struct cgroup_subsys_state *last; 2808 struct cgroup_subsys_state *last;
2805 2809
2806 do { 2810 do {
2807 last = pos; 2811 last = pos;
2808 pos = css_next_child(NULL, pos); 2812 pos = css_next_child(NULL, pos);
2809 } while (pos); 2813 } while (pos);
2810 2814
2811 return last; 2815 return last;
2812 } 2816 }
2813 2817
2814 /** 2818 /**
2815 * css_next_descendant_post - find the next descendant for post-order walk 2819 * css_next_descendant_post - find the next descendant for post-order walk
2816 * @pos: the current position (%NULL to initiate traversal) 2820 * @pos: the current position (%NULL to initiate traversal)
2817 * @root: css whose descendants to walk 2821 * @root: css whose descendants to walk
2818 * 2822 *
2819 * To be used by css_for_each_descendant_post(). Find the next descendant 2823 * To be used by css_for_each_descendant_post(). Find the next descendant
2820 * to visit for post-order traversal of @root's descendants. @root is 2824 * to visit for post-order traversal of @root's descendants. @root is
2821 * included in the iteration and the last node to be visited. 2825 * included in the iteration and the last node to be visited.
2822 * 2826 *
2823 * While this function requires cgroup_mutex or RCU read locking, it 2827 * While this function requires cgroup_mutex or RCU read locking, it
2824 * doesn't require the whole traversal to be contained in a single critical 2828 * doesn't require the whole traversal to be contained in a single critical
2825 * section. This function will return the correct next descendant as long 2829 * section. This function will return the correct next descendant as long
2826 * as both @pos and @root are accessible and @pos is a descendant of 2830 * as both @pos and @root are accessible and @pos is a descendant of
2827 * @root. 2831 * @root.
2828 */ 2832 */
2829 struct cgroup_subsys_state * 2833 struct cgroup_subsys_state *
2830 css_next_descendant_post(struct cgroup_subsys_state *pos, 2834 css_next_descendant_post(struct cgroup_subsys_state *pos,
2831 struct cgroup_subsys_state *root) 2835 struct cgroup_subsys_state *root)
2832 { 2836 {
2833 struct cgroup_subsys_state *next; 2837 struct cgroup_subsys_state *next;
2834 2838
2835 cgroup_assert_mutexes_or_rcu_locked(); 2839 cgroup_assert_mutexes_or_rcu_locked();
2836 2840
2837 /* if first iteration, visit leftmost descendant which may be @root */ 2841 /* if first iteration, visit leftmost descendant which may be @root */
2838 if (!pos) 2842 if (!pos)
2839 return css_leftmost_descendant(root); 2843 return css_leftmost_descendant(root);
2840 2844
2841 /* if we visited @root, we're done */ 2845 /* if we visited @root, we're done */
2842 if (pos == root) 2846 if (pos == root)
2843 return NULL; 2847 return NULL;
2844 2848
2845 /* if there's an unvisited sibling, visit its leftmost descendant */ 2849 /* if there's an unvisited sibling, visit its leftmost descendant */
2846 next = css_next_child(pos, css_parent(pos)); 2850 next = css_next_child(pos, css_parent(pos));
2847 if (next) 2851 if (next)
2848 return css_leftmost_descendant(next); 2852 return css_leftmost_descendant(next);
2849 2853
2850 /* no sibling left, visit parent */ 2854 /* no sibling left, visit parent */
2851 return css_parent(pos); 2855 return css_parent(pos);
2852 } 2856 }
2853 2857
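Correspondingly for post-order (again only an illustrative sketch): css_for_each_descendant_post() visits every descendant before its parent and visits @root last, which suits bottom-up aggregation or teardown. read_local_stat() is a hypothetical per-css counter:

	struct cgroup_subsys_state *pos;
	u64 total = 0;

	rcu_read_lock();
	css_for_each_descendant_post(pos, root)
		total += read_local_stat(pos);
	rcu_read_unlock();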
2854 /** 2858 /**
2855 * css_advance_task_iter - advance a task iterator to the next css_set 2859 * css_advance_task_iter - advance a task iterator to the next css_set
2856 * @it: the iterator to advance 2860 * @it: the iterator to advance
2857 * 2861 *
2858 * Advance @it to the next css_set to walk. 2862 * Advance @it to the next css_set to walk.
2859 */ 2863 */
2860 static void css_advance_task_iter(struct css_task_iter *it) 2864 static void css_advance_task_iter(struct css_task_iter *it)
2861 { 2865 {
2862 struct list_head *l = it->cset_pos; 2866 struct list_head *l = it->cset_pos;
2863 struct cgrp_cset_link *link; 2867 struct cgrp_cset_link *link;
2864 struct css_set *cset; 2868 struct css_set *cset;
2865 2869
2866 /* Advance to the next non-empty css_set */ 2870 /* Advance to the next non-empty css_set */
2867 do { 2871 do {
2868 l = l->next; 2872 l = l->next;
2869 if (l == it->cset_head) { 2873 if (l == it->cset_head) {
2870 it->cset_pos = NULL; 2874 it->cset_pos = NULL;
2871 return; 2875 return;
2872 } 2876 }
2873 2877
2874 if (it->ss) { 2878 if (it->ss) {
2875 cset = container_of(l, struct css_set, 2879 cset = container_of(l, struct css_set,
2876 e_cset_node[it->ss->id]); 2880 e_cset_node[it->ss->id]);
2877 } else { 2881 } else {
2878 link = list_entry(l, struct cgrp_cset_link, cset_link); 2882 link = list_entry(l, struct cgrp_cset_link, cset_link);
2879 cset = link->cset; 2883 cset = link->cset;
2880 } 2884 }
2881 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); 2885 } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
2882 2886
2883 it->cset_pos = l; 2887 it->cset_pos = l;
2884 2888
2885 if (!list_empty(&cset->tasks)) 2889 if (!list_empty(&cset->tasks))
2886 it->task_pos = cset->tasks.next; 2890 it->task_pos = cset->tasks.next;
2887 else 2891 else
2888 it->task_pos = cset->mg_tasks.next; 2892 it->task_pos = cset->mg_tasks.next;
2889 2893
2890 it->tasks_head = &cset->tasks; 2894 it->tasks_head = &cset->tasks;
2891 it->mg_tasks_head = &cset->mg_tasks; 2895 it->mg_tasks_head = &cset->mg_tasks;
2892 } 2896 }
2893 2897
2894 /** 2898 /**
2895 * css_task_iter_start - initiate task iteration 2899 * css_task_iter_start - initiate task iteration
2896 * @css: the css to walk tasks of 2900 * @css: the css to walk tasks of
2897 * @it: the task iterator to use 2901 * @it: the task iterator to use
2898 * 2902 *
2899 * Initiate iteration through the tasks of @css. The caller can call 2903 * Initiate iteration through the tasks of @css. The caller can call
2900 * css_task_iter_next() to walk through the tasks until the function 2904 * css_task_iter_next() to walk through the tasks until the function
2901 * returns NULL. On completion of iteration, css_task_iter_end() must be 2905 * returns NULL. On completion of iteration, css_task_iter_end() must be
2902 * called. 2906 * called.
2903 * 2907 *
2904 * Note that this function acquires a lock which is released when the 2908 * Note that this function acquires a lock which is released when the
2905 * iteration finishes. The caller can't sleep while iteration is in 2909 * iteration finishes. The caller can't sleep while iteration is in
2906 * progress. 2910 * progress.
2907 */ 2911 */
2908 void css_task_iter_start(struct cgroup_subsys_state *css, 2912 void css_task_iter_start(struct cgroup_subsys_state *css,
2909 struct css_task_iter *it) 2913 struct css_task_iter *it)
2910 __acquires(css_set_rwsem) 2914 __acquires(css_set_rwsem)
2911 { 2915 {
2912 /* no one should try to iterate before mounting cgroups */ 2916 /* no one should try to iterate before mounting cgroups */
2913 WARN_ON_ONCE(!use_task_css_set_links); 2917 WARN_ON_ONCE(!use_task_css_set_links);
2914 2918
2915 down_read(&css_set_rwsem); 2919 down_read(&css_set_rwsem);
2916 2920
2917 it->ss = css->ss; 2921 it->ss = css->ss;
2918 2922
2919 if (it->ss) 2923 if (it->ss)
2920 it->cset_pos = &css->cgroup->e_csets[css->ss->id]; 2924 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
2921 else 2925 else
2922 it->cset_pos = &css->cgroup->cset_links; 2926 it->cset_pos = &css->cgroup->cset_links;
2923 2927
2924 it->cset_head = it->cset_pos; 2928 it->cset_head = it->cset_pos;
2925 2929
2926 css_advance_task_iter(it); 2930 css_advance_task_iter(it);
2927 } 2931 }
2928 2932
2929 /** 2933 /**
2930 * css_task_iter_next - return the next task for the iterator 2934 * css_task_iter_next - return the next task for the iterator
2931 * @it: the task iterator being iterated 2935 * @it: the task iterator being iterated
2932 * 2936 *
2933 * The "next" function for task iteration. @it should have been 2937 * The "next" function for task iteration. @it should have been
2934 * initialized via css_task_iter_start(). Returns NULL when the iteration 2938 * initialized via css_task_iter_start(). Returns NULL when the iteration
2935 * reaches the end. 2939 * reaches the end.
2936 */ 2940 */
2937 struct task_struct *css_task_iter_next(struct css_task_iter *it) 2941 struct task_struct *css_task_iter_next(struct css_task_iter *it)
2938 { 2942 {
2939 struct task_struct *res; 2943 struct task_struct *res;
2940 struct list_head *l = it->task_pos; 2944 struct list_head *l = it->task_pos;
2941 2945
2942 /* If the iterator's cset position is NULL, we have no tasks */ 2946 /* If the iterator's cset position is NULL, we have no tasks */
2943 if (!it->cset_pos) 2947 if (!it->cset_pos)
2944 return NULL; 2948 return NULL;
2945 res = list_entry(l, struct task_struct, cg_list); 2949 res = list_entry(l, struct task_struct, cg_list);
2946 2950
2947 /* 2951 /*
2948 * Advance iterator to find next entry. cset->tasks is consumed 2952 * Advance iterator to find next entry. cset->tasks is consumed
2949 * first and then ->mg_tasks. After ->mg_tasks, we move onto the 2953 * first and then ->mg_tasks. After ->mg_tasks, we move onto the
2950 * next cset. 2954 * next cset.
2951 */ 2955 */
2952 l = l->next; 2956 l = l->next;
2953 2957
2954 if (l == it->tasks_head) 2958 if (l == it->tasks_head)
2955 l = it->mg_tasks_head->next; 2959 l = it->mg_tasks_head->next;
2956 2960
2957 if (l == it->mg_tasks_head) 2961 if (l == it->mg_tasks_head)
2958 css_advance_task_iter(it); 2962 css_advance_task_iter(it);
2959 else 2963 else
2960 it->task_pos = l; 2964 it->task_pos = l;
2961 2965
2962 return res; 2966 return res;
2963 } 2967 }
2964 2968
2965 /** 2969 /**
2966 * css_task_iter_end - finish task iteration 2970 * css_task_iter_end - finish task iteration
2967 * @it: the task iterator to finish 2971 * @it: the task iterator to finish
2968 * 2972 *
2969 * Finish task iteration started by css_task_iter_start(). 2973 * Finish task iteration started by css_task_iter_start().
2970 */ 2974 */
2971 void css_task_iter_end(struct css_task_iter *it) 2975 void css_task_iter_end(struct css_task_iter *it)
2972 __releases(css_set_rwsem) 2976 __releases(css_set_rwsem)
2973 { 2977 {
2974 up_read(&css_set_rwsem); 2978 up_read(&css_set_rwsem);
2975 } 2979 }
2976 2980
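The three functions above are always used as a start/next/end triple; a minimal sketch (not from this commit), where @css is whatever cgroup_subsys_state the caller wants to scan. As the comment above css_task_iter_start() says, the caller must not sleep while the iteration is in progress:

	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		pr_debug("pid %d is in this css\n", task_pid_nr(task));
	css_task_iter_end(&it);

cgroup_transfer_tasks() and pidlist_array_load() below follow the same pattern.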
2977 /** 2981 /**
2978 * cgroup_transfer_tasks - move tasks from one cgroup to another 2982 * cgroup_transfer_tasks - move tasks from one cgroup to another
2979 * @to: cgroup to which the tasks will be moved 2983 * @to: cgroup to which the tasks will be moved
2980 * @from: cgroup in which the tasks currently reside 2984 * @from: cgroup in which the tasks currently reside
2981 * 2985 *
2982 * Locking rules between cgroup_post_fork() and the migration path 2986 * Locking rules between cgroup_post_fork() and the migration path
2983 * guarantee that, if a task is forking while being migrated, the new child 2987 * guarantee that, if a task is forking while being migrated, the new child
2984 * is guaranteed to be either visible in the source cgroup after the 2988 * is guaranteed to be either visible in the source cgroup after the
2985 * parent's migration is complete or put into the target cgroup. No task 2989 * parent's migration is complete or put into the target cgroup. No task
2986 * can slip out of migration through forking. 2990 * can slip out of migration through forking.
2987 */ 2991 */
2988 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) 2992 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
2989 { 2993 {
2990 LIST_HEAD(preloaded_csets); 2994 LIST_HEAD(preloaded_csets);
2991 struct cgrp_cset_link *link; 2995 struct cgrp_cset_link *link;
2992 struct css_task_iter it; 2996 struct css_task_iter it;
2993 struct task_struct *task; 2997 struct task_struct *task;
2994 int ret; 2998 int ret;
2995 2999
2996 mutex_lock(&cgroup_mutex); 3000 mutex_lock(&cgroup_mutex);
2997 3001
2998 /* all tasks in @from are being moved, all csets are source */ 3002 /* all tasks in @from are being moved, all csets are source */
2999 down_read(&css_set_rwsem); 3003 down_read(&css_set_rwsem);
3000 list_for_each_entry(link, &from->cset_links, cset_link) 3004 list_for_each_entry(link, &from->cset_links, cset_link)
3001 cgroup_migrate_add_src(link->cset, to, &preloaded_csets); 3005 cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
3002 up_read(&css_set_rwsem); 3006 up_read(&css_set_rwsem);
3003 3007
3004 ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); 3008 ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
3005 if (ret) 3009 if (ret)
3006 goto out_err; 3010 goto out_err;
3007 3011
3008 /* 3012 /*
3009 * Migrate tasks one-by-one until @from is empty. This fails iff 3013 * Migrate tasks one-by-one until @from is empty. This fails iff
3010 * ->can_attach() fails. 3014 * ->can_attach() fails.
3011 */ 3015 */
3012 do { 3016 do {
3013 css_task_iter_start(&from->dummy_css, &it); 3017 css_task_iter_start(&from->dummy_css, &it);
3014 task = css_task_iter_next(&it); 3018 task = css_task_iter_next(&it);
3015 if (task) 3019 if (task)
3016 get_task_struct(task); 3020 get_task_struct(task);
3017 css_task_iter_end(&it); 3021 css_task_iter_end(&it);
3018 3022
3019 if (task) { 3023 if (task) {
3020 ret = cgroup_migrate(to, task, false); 3024 ret = cgroup_migrate(to, task, false);
3021 put_task_struct(task); 3025 put_task_struct(task);
3022 } 3026 }
3023 } while (task && !ret); 3027 } while (task && !ret);
3024 out_err: 3028 out_err:
3025 cgroup_migrate_finish(&preloaded_csets); 3029 cgroup_migrate_finish(&preloaded_csets);
3026 mutex_unlock(&cgroup_mutex); 3030 mutex_unlock(&cgroup_mutex);
3027 return ret; 3031 return ret;
3028 } 3032 }
3029 3033
3030 /* 3034 /*
3031 * Stuff for reading the 'tasks'/'procs' files. 3035 * Stuff for reading the 'tasks'/'procs' files.
3032 * 3036 *
3033 * Reading this file can return large amounts of data if a cgroup has 3037 * Reading this file can return large amounts of data if a cgroup has
3034 * *lots* of attached tasks. So it may need several calls to read(), 3038 * *lots* of attached tasks. So it may need several calls to read(),
3035 * but we cannot guarantee that the information we produce is correct 3039 * but we cannot guarantee that the information we produce is correct
3036 * unless we produce it entirely atomically. 3040 * unless we produce it entirely atomically.
3037 * 3041 *
3038 */ 3042 */
3039 3043
3040 /* which pidlist file are we talking about? */ 3044 /* which pidlist file are we talking about? */
3041 enum cgroup_filetype { 3045 enum cgroup_filetype {
3042 CGROUP_FILE_PROCS, 3046 CGROUP_FILE_PROCS,
3043 CGROUP_FILE_TASKS, 3047 CGROUP_FILE_TASKS,
3044 }; 3048 };
3045 3049
3046 /* 3050 /*
3047 * A pidlist is a list of pids that virtually represents the contents of one 3051 * A pidlist is a list of pids that virtually represents the contents of one
3048 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, 3052 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
3049 * a pair (one each for procs, tasks) for each pid namespace that's relevant 3053 * a pair (one each for procs, tasks) for each pid namespace that's relevant
3050 * to the cgroup. 3054 * to the cgroup.
3051 */ 3055 */
3052 struct cgroup_pidlist { 3056 struct cgroup_pidlist {
3053 /* 3057 /*
3054 * used to find which pidlist is wanted. doesn't change as long as 3058 * used to find which pidlist is wanted. doesn't change as long as
3055 * this particular list stays in the list. 3059 * this particular list stays in the list.
3056 */ 3060 */
3057 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; 3061 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
3058 /* array of xids */ 3062 /* array of xids */
3059 pid_t *list; 3063 pid_t *list;
3060 /* how many elements the above list has */ 3064 /* how many elements the above list has */
3061 int length; 3065 int length;
3062 /* each of these stored in a list by its cgroup */ 3066 /* each of these stored in a list by its cgroup */
3063 struct list_head links; 3067 struct list_head links;
3064 /* pointer to the cgroup we belong to, for list removal purposes */ 3068 /* pointer to the cgroup we belong to, for list removal purposes */
3065 struct cgroup *owner; 3069 struct cgroup *owner;
3066 /* for delayed destruction */ 3070 /* for delayed destruction */
3067 struct delayed_work destroy_dwork; 3071 struct delayed_work destroy_dwork;
3068 }; 3072 };
3069 3073
3070 /* 3074 /*
3071 * The following two functions "fix" the issue where there are more pids 3075 * The following two functions "fix" the issue where there are more pids
3072 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. 3076 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
3073 * TODO: replace with a kernel-wide solution to this problem 3077 * TODO: replace with a kernel-wide solution to this problem
3074 */ 3078 */
3075 #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) 3079 #define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
3076 static void *pidlist_allocate(int count) 3080 static void *pidlist_allocate(int count)
3077 { 3081 {
3078 if (PIDLIST_TOO_LARGE(count)) 3082 if (PIDLIST_TOO_LARGE(count))
3079 return vmalloc(count * sizeof(pid_t)); 3083 return vmalloc(count * sizeof(pid_t));
3080 else 3084 else
3081 return kmalloc(count * sizeof(pid_t), GFP_KERNEL); 3085 return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
3082 } 3086 }
3083 3087
3084 static void pidlist_free(void *p) 3088 static void pidlist_free(void *p)
3085 { 3089 {
3086 if (is_vmalloc_addr(p)) 3090 if (is_vmalloc_addr(p))
3087 vfree(p); 3091 vfree(p);
3088 else 3092 else
3089 kfree(p); 3093 kfree(p);
3090 } 3094 }
3091 3095
3092 /* 3096 /*
3093 * Used to destroy all pidlists lingering while waiting for the destroy timer. None 3097 * Used to destroy all pidlists lingering while waiting for the destroy timer. None
3094 * should be left afterwards. 3098 * should be left afterwards.
3095 */ 3099 */
3096 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp) 3100 static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
3097 { 3101 {
3098 struct cgroup_pidlist *l, *tmp_l; 3102 struct cgroup_pidlist *l, *tmp_l;
3099 3103
3100 mutex_lock(&cgrp->pidlist_mutex); 3104 mutex_lock(&cgrp->pidlist_mutex);
3101 list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) 3105 list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
3102 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); 3106 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
3103 mutex_unlock(&cgrp->pidlist_mutex); 3107 mutex_unlock(&cgrp->pidlist_mutex);
3104 3108
3105 flush_workqueue(cgroup_pidlist_destroy_wq); 3109 flush_workqueue(cgroup_pidlist_destroy_wq);
3106 BUG_ON(!list_empty(&cgrp->pidlists)); 3110 BUG_ON(!list_empty(&cgrp->pidlists));
3107 } 3111 }
3108 3112
3109 static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) 3113 static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
3110 { 3114 {
3111 struct delayed_work *dwork = to_delayed_work(work); 3115 struct delayed_work *dwork = to_delayed_work(work);
3112 struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, 3116 struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
3113 destroy_dwork); 3117 destroy_dwork);
3114 struct cgroup_pidlist *tofree = NULL; 3118 struct cgroup_pidlist *tofree = NULL;
3115 3119
3116 mutex_lock(&l->owner->pidlist_mutex); 3120 mutex_lock(&l->owner->pidlist_mutex);
3117 3121
3118 /* 3122 /*
3119 * Destroy iff we didn't get queued again. The state won't change 3123 * Destroy iff we didn't get queued again. The state won't change
3120 * as destroy_dwork can only be queued while locked. 3124 * as destroy_dwork can only be queued while locked.
3121 */ 3125 */
3122 if (!delayed_work_pending(dwork)) { 3126 if (!delayed_work_pending(dwork)) {
3123 list_del(&l->links); 3127 list_del(&l->links);
3124 pidlist_free(l->list); 3128 pidlist_free(l->list);
3125 put_pid_ns(l->key.ns); 3129 put_pid_ns(l->key.ns);
3126 tofree = l; 3130 tofree = l;
3127 } 3131 }
3128 3132
3129 mutex_unlock(&l->owner->pidlist_mutex); 3133 mutex_unlock(&l->owner->pidlist_mutex);
3130 kfree(tofree); 3134 kfree(tofree);
3131 } 3135 }
3132 3136
3133 /* 3137 /*
3134 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries 3138 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
3135 * Returns the number of unique elements. 3139 * Returns the number of unique elements.
3136 */ 3140 */
3137 static int pidlist_uniq(pid_t *list, int length) 3141 static int pidlist_uniq(pid_t *list, int length)
3138 { 3142 {
3139 int src, dest = 1; 3143 int src, dest = 1;
3140 3144
3141 /* 3145 /*
3142 * we presume the 0th element is unique, so src starts at 1. trivial 3146 * we presume the 0th element is unique, so src starts at 1. trivial
3143 * edge cases first; no work needs to be done for either 3147 * edge cases first; no work needs to be done for either
3144 */ 3148 */
3145 if (length == 0 || length == 1) 3149 if (length == 0 || length == 1)
3146 return length; 3150 return length;
3147 /* src and dest walk down the list; dest counts unique elements */ 3151 /* src and dest walk down the list; dest counts unique elements */
3148 for (src = 1; src < length; src++) { 3152 for (src = 1; src < length; src++) {
3149 /* find next unique element */ 3153 /* find next unique element */
3150 while (list[src] == list[src-1]) { 3154 while (list[src] == list[src-1]) {
3151 src++; 3155 src++;
3152 if (src == length) 3156 if (src == length)
3153 goto after; 3157 goto after;
3154 } 3158 }
3155 /* dest always points to where the next unique element goes */ 3159 /* dest always points to where the next unique element goes */
3156 list[dest] = list[src]; 3160 list[dest] = list[src];
3157 dest++; 3161 dest++;
3158 } 3162 }
3159 after: 3163 after:
3160 return dest; 3164 return dest;
3161 } 3165 }
3162 3166
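As a quick worked example with made-up values: for the already-sorted input {3, 3, 5, 7, 7, 7} of length 6, the loop copies 5 into slot 1 and 7 into slot 2 and the function returns 3, so the caller treats {3, 5, 7} as the deduplicated list and simply ignores the remaining slots.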
3163 /* 3167 /*
3164 * The two pid files - tasks and cgroup.procs - guaranteed that the result 3168 * The two pid files - tasks and cgroup.procs - guaranteed that the result
3165 * is sorted, which forced this whole pidlist fiasco. As pid order is 3169 * is sorted, which forced this whole pidlist fiasco. As pid order is
3166 * different per namespace, each namespace needs a differently sorted list, 3170 * different per namespace, each namespace needs a differently sorted list,
3167 * making it impossible to use, for example, a single rbtree of member tasks 3171 * making it impossible to use, for example, a single rbtree of member tasks
3168 * sorted by task pointer. As pidlists can be fairly large, allocating one 3172 * sorted by task pointer. As pidlists can be fairly large, allocating one
3169 * per open file is dangerous, so cgroup had to implement shared pool of 3173 * per open file is dangerous, so cgroup had to implement shared pool of
3170 * pidlists keyed by cgroup and namespace. 3174 * pidlists keyed by cgroup and namespace.
3171 * 3175 *
3172 * All this extra complexity was caused by the original implementation 3176 * All this extra complexity was caused by the original implementation
3173 * committing to an entirely unnecessary property. In the long term, we 3177 * committing to an entirely unnecessary property. In the long term, we
3174 * want to do away with it. Explicitly scramble sort order if 3178 * want to do away with it. Explicitly scramble sort order if
3175 * sane_behavior so that no such expectation exists in the new interface. 3179 * sane_behavior so that no such expectation exists in the new interface.
3176 * 3180 *
3177 * Scrambling is done by swapping every two consecutive bits, which is 3181 * Scrambling is done by swapping every two consecutive bits, which is
3178 * a non-identity one-to-one mapping that disturbs sort order sufficiently. 3182 * a non-identity one-to-one mapping that disturbs sort order sufficiently.
3179 */ 3183 */
3180 static pid_t pid_fry(pid_t pid) 3184 static pid_t pid_fry(pid_t pid)
3181 { 3185 {
3182 unsigned a = pid & 0x55555555; 3186 unsigned a = pid & 0x55555555;
3183 unsigned b = pid & 0xAAAAAAAA; 3187 unsigned b = pid & 0xAAAAAAAA;
3184 3188
3185 return (a << 1) | (b >> 1); 3189 return (a << 1) | (b >> 1);
3186 } 3190 }
3187 3191
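To make the scrambling concrete with an arbitrary example value: pid 6 is 0b0110, so the even-position bits give a = 0b0100 and the odd-position bits give b = 0b0010, and (a << 1) | (b >> 1) = 0b1001 = 9. Feeding 9 back through pid_fry() yields 6 again: swapping adjacent bit pairs is its own inverse, so the mapping never collides while still breaking the numeric ordering userspace might otherwise come to rely on.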
3188 static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) 3192 static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
3189 { 3193 {
3190 if (cgroup_sane_behavior(cgrp)) 3194 if (cgroup_sane_behavior(cgrp))
3191 return pid_fry(pid); 3195 return pid_fry(pid);
3192 else 3196 else
3193 return pid; 3197 return pid;
3194 } 3198 }
3195 3199
3196 static int cmppid(const void *a, const void *b) 3200 static int cmppid(const void *a, const void *b)
3197 { 3201 {
3198 return *(pid_t *)a - *(pid_t *)b; 3202 return *(pid_t *)a - *(pid_t *)b;
3199 } 3203 }
3200 3204
3201 static int fried_cmppid(const void *a, const void *b) 3205 static int fried_cmppid(const void *a, const void *b)
3202 { 3206 {
3203 return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b); 3207 return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
3204 } 3208 }
3205 3209
3206 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, 3210 static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
3207 enum cgroup_filetype type) 3211 enum cgroup_filetype type)
3208 { 3212 {
3209 struct cgroup_pidlist *l; 3213 struct cgroup_pidlist *l;
3210 /* don't need task_nsproxy() if we're looking at ourself */ 3214 /* don't need task_nsproxy() if we're looking at ourself */
3211 struct pid_namespace *ns = task_active_pid_ns(current); 3215 struct pid_namespace *ns = task_active_pid_ns(current);
3212 3216
3213 lockdep_assert_held(&cgrp->pidlist_mutex); 3217 lockdep_assert_held(&cgrp->pidlist_mutex);
3214 3218
3215 list_for_each_entry(l, &cgrp->pidlists, links) 3219 list_for_each_entry(l, &cgrp->pidlists, links)
3216 if (l->key.type == type && l->key.ns == ns) 3220 if (l->key.type == type && l->key.ns == ns)
3217 return l; 3221 return l;
3218 return NULL; 3222 return NULL;
3219 } 3223 }
3220 3224
3221 /* 3225 /*
3222 * find the appropriate pidlist for our purpose (given procs vs tasks) 3226 * find the appropriate pidlist for our purpose (given procs vs tasks)
3223 * the caller must hold cgrp->pidlist_mutex. Returns the matching 3227 * the caller must hold cgrp->pidlist_mutex. Returns the matching
3224 * pidlist, creating a new one if necessary, or NULL if we're out of 3228 * pidlist, creating a new one if necessary, or NULL if we're out of
3225 * memory. 3229 * memory.
3226 */ 3230 */
3227 static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, 3231 static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
3228 enum cgroup_filetype type) 3232 enum cgroup_filetype type)
3229 { 3233 {
3230 struct cgroup_pidlist *l; 3234 struct cgroup_pidlist *l;
3231 3235
3232 lockdep_assert_held(&cgrp->pidlist_mutex); 3236 lockdep_assert_held(&cgrp->pidlist_mutex);
3233 3237
3234 l = cgroup_pidlist_find(cgrp, type); 3238 l = cgroup_pidlist_find(cgrp, type);
3235 if (l) 3239 if (l)
3236 return l; 3240 return l;
3237 3241
3238 /* entry not found; create a new one */ 3242 /* entry not found; create a new one */
3239 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 3243 l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
3240 if (!l) 3244 if (!l)
3241 return l; 3245 return l;
3242 3246
3243 INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); 3247 INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
3244 l->key.type = type; 3248 l->key.type = type;
3245 /* don't need task_nsproxy() if we're looking at ourself */ 3249 /* don't need task_nsproxy() if we're looking at ourself */
3246 l->key.ns = get_pid_ns(task_active_pid_ns(current)); 3250 l->key.ns = get_pid_ns(task_active_pid_ns(current));
3247 l->owner = cgrp; 3251 l->owner = cgrp;
3248 list_add(&l->links, &cgrp->pidlists); 3252 list_add(&l->links, &cgrp->pidlists);
3249 return l; 3253 return l;
3250 } 3254 }
3251 3255
3252 /* 3256 /*
3253 * Load a cgroup's pidarray with either procs' tgids or tasks' pids 3257 * Load a cgroup's pidarray with either procs' tgids or tasks' pids
3254 */ 3258 */
3255 static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, 3259 static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3256 struct cgroup_pidlist **lp) 3260 struct cgroup_pidlist **lp)
3257 { 3261 {
3258 pid_t *array; 3262 pid_t *array;
3259 int length; 3263 int length;
3260 int pid, n = 0; /* used for populating the array */ 3264 int pid, n = 0; /* used for populating the array */
3261 struct css_task_iter it; 3265 struct css_task_iter it;
3262 struct task_struct *tsk; 3266 struct task_struct *tsk;
3263 struct cgroup_pidlist *l; 3267 struct cgroup_pidlist *l;
3264 3268
3265 lockdep_assert_held(&cgrp->pidlist_mutex); 3269 lockdep_assert_held(&cgrp->pidlist_mutex);
3266 3270
3267 /* 3271 /*
3268 * If cgroup gets more users after we read count, we won't have 3272 * If cgroup gets more users after we read count, we won't have
3269 * enough space - tough. This race is indistinguishable to the 3273 * enough space - tough. This race is indistinguishable to the
3270 * caller from the case that the additional cgroup users didn't 3274 * caller from the case that the additional cgroup users didn't
3271 * show up until sometime later on. 3275 * show up until sometime later on.
3272 */ 3276 */
3273 length = cgroup_task_count(cgrp); 3277 length = cgroup_task_count(cgrp);
3274 array = pidlist_allocate(length); 3278 array = pidlist_allocate(length);
3275 if (!array) 3279 if (!array)
3276 return -ENOMEM; 3280 return -ENOMEM;
3277 /* now, populate the array */ 3281 /* now, populate the array */
3278 css_task_iter_start(&cgrp->dummy_css, &it); 3282 css_task_iter_start(&cgrp->dummy_css, &it);
3279 while ((tsk = css_task_iter_next(&it))) { 3283 while ((tsk = css_task_iter_next(&it))) {
3280 if (unlikely(n == length)) 3284 if (unlikely(n == length))
3281 break; 3285 break;
3282 /* get tgid or pid for procs or tasks file respectively */ 3286 /* get tgid or pid for procs or tasks file respectively */
3283 if (type == CGROUP_FILE_PROCS) 3287 if (type == CGROUP_FILE_PROCS)
3284 pid = task_tgid_vnr(tsk); 3288 pid = task_tgid_vnr(tsk);
3285 else 3289 else
3286 pid = task_pid_vnr(tsk); 3290 pid = task_pid_vnr(tsk);
3287 if (pid > 0) /* make sure to only use valid results */ 3291 if (pid > 0) /* make sure to only use valid results */
3288 array[n++] = pid; 3292 array[n++] = pid;
3289 } 3293 }
3290 css_task_iter_end(&it); 3294 css_task_iter_end(&it);
3291 length = n; 3295 length = n;
3292 /* now sort & (if procs) strip out duplicates */ 3296 /* now sort & (if procs) strip out duplicates */
3293 if (cgroup_sane_behavior(cgrp)) 3297 if (cgroup_sane_behavior(cgrp))
3294 sort(array, length, sizeof(pid_t), fried_cmppid, NULL); 3298 sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
3295 else 3299 else
3296 sort(array, length, sizeof(pid_t), cmppid, NULL); 3300 sort(array, length, sizeof(pid_t), cmppid, NULL);
3297 if (type == CGROUP_FILE_PROCS) 3301 if (type == CGROUP_FILE_PROCS)
3298 length = pidlist_uniq(array, length); 3302 length = pidlist_uniq(array, length);
3299 3303
3300 l = cgroup_pidlist_find_create(cgrp, type); 3304 l = cgroup_pidlist_find_create(cgrp, type);
3301 if (!l) { 3305 if (!l) {
3302 mutex_unlock(&cgrp->pidlist_mutex); 3306 mutex_unlock(&cgrp->pidlist_mutex);
3303 pidlist_free(array); 3307 pidlist_free(array);
3304 return -ENOMEM; 3308 return -ENOMEM;
3305 } 3309 }
3306 3310
3307 /* store array, freeing old if necessary */ 3311 /* store array, freeing old if necessary */
3308 pidlist_free(l->list); 3312 pidlist_free(l->list);
3309 l->list = array; 3313 l->list = array;
3310 l->length = length; 3314 l->length = length;
3311 *lp = l; 3315 *lp = l;
3312 return 0; 3316 return 0;
3313 } 3317 }
3314 3318
3315 /** 3319 /**
3316 * cgroupstats_build - build and fill cgroupstats 3320 * cgroupstats_build - build and fill cgroupstats
3317 * @stats: cgroupstats to fill information into 3321 * @stats: cgroupstats to fill information into
3318 * @dentry: A dentry entry belonging to the cgroup for which stats have 3322 * @dentry: A dentry entry belonging to the cgroup for which stats have
3319 * been requested. 3323 * been requested.
3320 * 3324 *
3321 * Build and fill cgroupstats so that taskstats can export it to user 3325 * Build and fill cgroupstats so that taskstats can export it to user
3322 * space. 3326 * space.
3323 */ 3327 */
3324 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) 3328 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
3325 { 3329 {
3326 struct kernfs_node *kn = kernfs_node_from_dentry(dentry); 3330 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
3327 struct cgroup *cgrp; 3331 struct cgroup *cgrp;
3328 struct css_task_iter it; 3332 struct css_task_iter it;
3329 struct task_struct *tsk; 3333 struct task_struct *tsk;
3330 3334
3331 /* it should be kernfs_node belonging to cgroupfs and is a directory */ 3335 /* it should be kernfs_node belonging to cgroupfs and is a directory */
3332 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || 3336 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
3333 kernfs_type(kn) != KERNFS_DIR) 3337 kernfs_type(kn) != KERNFS_DIR)
3334 return -EINVAL; 3338 return -EINVAL;
3335 3339
3336 mutex_lock(&cgroup_mutex); 3340 mutex_lock(&cgroup_mutex);
3337 3341
3338 /* 3342 /*
3339 * We aren't being called from kernfs and there's no guarantee on 3343 * We aren't being called from kernfs and there's no guarantee on
3340 * @kn->priv's validity. For this and css_tryget_from_dir(), 3344 * @kn->priv's validity. For this and css_tryget_from_dir(),
3341 * @kn->priv is RCU safe. Let's do the RCU dancing. 3345 * @kn->priv is RCU safe. Let's do the RCU dancing.
3342 */ 3346 */
3343 rcu_read_lock(); 3347 rcu_read_lock();
3344 cgrp = rcu_dereference(kn->priv); 3348 cgrp = rcu_dereference(kn->priv);
3345 if (!cgrp || cgroup_is_dead(cgrp)) { 3349 if (!cgrp || cgroup_is_dead(cgrp)) {
3346 rcu_read_unlock(); 3350 rcu_read_unlock();
3347 mutex_unlock(&cgroup_mutex); 3351 mutex_unlock(&cgroup_mutex);
3348 return -ENOENT; 3352 return -ENOENT;
3349 } 3353 }
3350 rcu_read_unlock(); 3354 rcu_read_unlock();
3351 3355
3352 css_task_iter_start(&cgrp->dummy_css, &it); 3356 css_task_iter_start(&cgrp->dummy_css, &it);
3353 while ((tsk = css_task_iter_next(&it))) { 3357 while ((tsk = css_task_iter_next(&it))) {
3354 switch (tsk->state) { 3358 switch (tsk->state) {
3355 case TASK_RUNNING: 3359 case TASK_RUNNING:
3356 stats->nr_running++; 3360 stats->nr_running++;
3357 break; 3361 break;
3358 case TASK_INTERRUPTIBLE: 3362 case TASK_INTERRUPTIBLE:
3359 stats->nr_sleeping++; 3363 stats->nr_sleeping++;
3360 break; 3364 break;
3361 case TASK_UNINTERRUPTIBLE: 3365 case TASK_UNINTERRUPTIBLE:
3362 stats->nr_uninterruptible++; 3366 stats->nr_uninterruptible++;
3363 break; 3367 break;
3364 case TASK_STOPPED: 3368 case TASK_STOPPED:
3365 stats->nr_stopped++; 3369 stats->nr_stopped++;
3366 break; 3370 break;
3367 default: 3371 default:
3368 if (delayacct_is_task_waiting_on_io(tsk)) 3372 if (delayacct_is_task_waiting_on_io(tsk))
3369 stats->nr_io_wait++; 3373 stats->nr_io_wait++;
3370 break; 3374 break;
3371 } 3375 }
3372 } 3376 }
3373 css_task_iter_end(&it); 3377 css_task_iter_end(&it);
3374 3378
3375 mutex_unlock(&cgroup_mutex); 3379 mutex_unlock(&cgroup_mutex);
3376 return 0; 3380 return 0;
3377 } 3381 }
3378 3382
3379 3383
3380 /* 3384 /*
3381 * seq_file methods for the tasks/procs files. The seq_file position is the 3385 * seq_file methods for the tasks/procs files. The seq_file position is the
3382 * next pid to display; the seq_file iterator is a pointer to the pid 3386 * next pid to display; the seq_file iterator is a pointer to the pid
3383 * in the pidlist's ->list array. 3387 * in the pidlist's ->list array.
3384 */ 3388 */
3385 3389
3386 static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) 3390 static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
3387 { 3391 {
3388 /* 3392 /*
3389 * Initially we receive a position value that corresponds to 3393 * Initially we receive a position value that corresponds to
3390 * one more than the last pid shown (or 0 on the first call or 3394 * one more than the last pid shown (or 0 on the first call or
3391 * after a seek to the start). Use a binary-search to find the 3395 * after a seek to the start). Use a binary-search to find the
3392 * next pid to display, if any 3396 * next pid to display, if any
3393 */ 3397 */
3394 struct kernfs_open_file *of = s->private; 3398 struct kernfs_open_file *of = s->private;
3395 struct cgroup *cgrp = seq_css(s)->cgroup; 3399 struct cgroup *cgrp = seq_css(s)->cgroup;
3396 struct cgroup_pidlist *l; 3400 struct cgroup_pidlist *l;
3397 enum cgroup_filetype type = seq_cft(s)->private; 3401 enum cgroup_filetype type = seq_cft(s)->private;
3398 int index = 0, pid = *pos; 3402 int index = 0, pid = *pos;
3399 int *iter, ret; 3403 int *iter, ret;
3400 3404
3401 mutex_lock(&cgrp->pidlist_mutex); 3405 mutex_lock(&cgrp->pidlist_mutex);
3402 3406
3403 /* 3407 /*
3404 * !NULL @of->priv indicates that this isn't the first start() 3408 * !NULL @of->priv indicates that this isn't the first start()
3405 * after open. If the matching pidlist is around, we can use that. 3409 * after open. If the matching pidlist is around, we can use that.
3406 * Look for it. Note that @of->priv can't be used directly. It 3410 * Look for it. Note that @of->priv can't be used directly. It
3407 * could already have been destroyed. 3411 * could already have been destroyed.
3408 */ 3412 */
3409 if (of->priv) 3413 if (of->priv)
3410 of->priv = cgroup_pidlist_find(cgrp, type); 3414 of->priv = cgroup_pidlist_find(cgrp, type);
3411 3415
3412 /* 3416 /*
3413 * Either this is the first start() after open or the matching 3417 * Either this is the first start() after open or the matching
3414 * pidlist has been destroyed in between. Create a new one. 3418 * pidlist has been destroyed in between. Create a new one.
3415 */ 3419 */
3416 if (!of->priv) { 3420 if (!of->priv) {
3417 ret = pidlist_array_load(cgrp, type, 3421 ret = pidlist_array_load(cgrp, type,
3418 (struct cgroup_pidlist **)&of->priv); 3422 (struct cgroup_pidlist **)&of->priv);
3419 if (ret) 3423 if (ret)
3420 return ERR_PTR(ret); 3424 return ERR_PTR(ret);
3421 } 3425 }
3422 l = of->priv; 3426 l = of->priv;
3423 3427
3424 if (pid) { 3428 if (pid) {
3425 int end = l->length; 3429 int end = l->length;
3426 3430
3427 while (index < end) { 3431 while (index < end) {
3428 int mid = (index + end) / 2; 3432 int mid = (index + end) / 2;
3429 if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) { 3433 if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
3430 index = mid; 3434 index = mid;
3431 break; 3435 break;
3432 } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid) 3436 } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
3433 index = mid + 1; 3437 index = mid + 1;
3434 else 3438 else
3435 end = mid; 3439 end = mid;
3436 } 3440 }
3437 } 3441 }
3438 /* If we're off the end of the array, we're done */ 3442 /* If we're off the end of the array, we're done */
3439 if (index >= l->length) 3443 if (index >= l->length)
3440 return NULL; 3444 return NULL;
3441 /* Update the abstract position to be the actual pid that we found */ 3445 /* Update the abstract position to be the actual pid that we found */
3442 iter = l->list + index; 3446 iter = l->list + index;
3443 *pos = cgroup_pid_fry(cgrp, *iter); 3447 *pos = cgroup_pid_fry(cgrp, *iter);
3444 return iter; 3448 return iter;
3445 } 3449 }
3446 3450
3447 static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3451 static void cgroup_pidlist_stop(struct seq_file *s, void *v)
3448 { 3452 {
3449 struct kernfs_open_file *of = s->private; 3453 struct kernfs_open_file *of = s->private;
3450 struct cgroup_pidlist *l = of->priv; 3454 struct cgroup_pidlist *l = of->priv;
3451 3455
3452 if (l) 3456 if (l)
3453 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 3457 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
3454 CGROUP_PIDLIST_DESTROY_DELAY); 3458 CGROUP_PIDLIST_DESTROY_DELAY);
3455 mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); 3459 mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
3456 } 3460 }
3457 3461
3458 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3462 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
3459 { 3463 {
3460 struct kernfs_open_file *of = s->private; 3464 struct kernfs_open_file *of = s->private;
3461 struct cgroup_pidlist *l = of->priv; 3465 struct cgroup_pidlist *l = of->priv;
3462 pid_t *p = v; 3466 pid_t *p = v;
3463 pid_t *end = l->list + l->length; 3467 pid_t *end = l->list + l->length;
3464 /* 3468 /*
3465 * Advance to the next pid in the array. If this goes off the 3469 * Advance to the next pid in the array. If this goes off the
3466 * end, we're done 3470 * end, we're done
3467 */ 3471 */
3468 p++; 3472 p++;
3469 if (p >= end) { 3473 if (p >= end) {
3470 return NULL; 3474 return NULL;
3471 } else { 3475 } else {
3472 *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p); 3476 *pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
3473 return p; 3477 return p;
3474 } 3478 }
3475 } 3479 }
3476 3480
3477 static int cgroup_pidlist_show(struct seq_file *s, void *v) 3481 static int cgroup_pidlist_show(struct seq_file *s, void *v)
3478 { 3482 {
3479 return seq_printf(s, "%d\n", *(int *)v); 3483 return seq_printf(s, "%d\n", *(int *)v);
3480 } 3484 }
3481 3485
3482 /* 3486 /*
3483 * seq_operations functions for iterating on pidlists through seq_file - 3487 * seq_operations functions for iterating on pidlists through seq_file -
3484 * independent of whether it's tasks or procs 3488 * independent of whether it's tasks or procs
3485 */ 3489 */
3486 static const struct seq_operations cgroup_pidlist_seq_operations = { 3490 static const struct seq_operations cgroup_pidlist_seq_operations = {
3487 .start = cgroup_pidlist_start, 3491 .start = cgroup_pidlist_start,
3488 .stop = cgroup_pidlist_stop, 3492 .stop = cgroup_pidlist_stop,
3489 .next = cgroup_pidlist_next, 3493 .next = cgroup_pidlist_next,
3490 .show = cgroup_pidlist_show, 3494 .show = cgroup_pidlist_show,
3491 }; 3495 };
3492 3496
3493 static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, 3497 static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
3494 struct cftype *cft) 3498 struct cftype *cft)
3495 { 3499 {
3496 return notify_on_release(css->cgroup); 3500 return notify_on_release(css->cgroup);
3497 } 3501 }
3498 3502
3499 static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, 3503 static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
3500 struct cftype *cft, u64 val) 3504 struct cftype *cft, u64 val)
3501 { 3505 {
3502 clear_bit(CGRP_RELEASABLE, &css->cgroup->flags); 3506 clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
3503 if (val) 3507 if (val)
3504 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); 3508 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3505 else 3509 else
3506 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); 3510 clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
3507 return 0; 3511 return 0;
3508 } 3512 }
3509 3513
3510 static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, 3514 static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
3511 struct cftype *cft) 3515 struct cftype *cft)
3512 { 3516 {
3513 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); 3517 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3514 } 3518 }
3515 3519
3516 static int cgroup_clone_children_write(struct cgroup_subsys_state *css, 3520 static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
3517 struct cftype *cft, u64 val) 3521 struct cftype *cft, u64 val)
3518 { 3522 {
3519 if (val) 3523 if (val)
3520 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); 3524 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3521 else 3525 else
3522 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); 3526 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
3523 return 0; 3527 return 0;
3524 } 3528 }
3525 3529
3526 static struct cftype cgroup_base_files[] = { 3530 static struct cftype cgroup_base_files[] = {
3527 { 3531 {
3528 .name = "cgroup.procs", 3532 .name = "cgroup.procs",
3529 .seq_start = cgroup_pidlist_start, 3533 .seq_start = cgroup_pidlist_start,
3530 .seq_next = cgroup_pidlist_next, 3534 .seq_next = cgroup_pidlist_next,
3531 .seq_stop = cgroup_pidlist_stop, 3535 .seq_stop = cgroup_pidlist_stop,
3532 .seq_show = cgroup_pidlist_show, 3536 .seq_show = cgroup_pidlist_show,
3533 .private = CGROUP_FILE_PROCS, 3537 .private = CGROUP_FILE_PROCS,
3534 .write_u64 = cgroup_procs_write, 3538 .write_u64 = cgroup_procs_write,
3535 .mode = S_IRUGO | S_IWUSR, 3539 .mode = S_IRUGO | S_IWUSR,
3536 }, 3540 },
3537 { 3541 {
3538 .name = "cgroup.clone_children", 3542 .name = "cgroup.clone_children",
3539 .flags = CFTYPE_INSANE, 3543 .flags = CFTYPE_INSANE,
3540 .read_u64 = cgroup_clone_children_read, 3544 .read_u64 = cgroup_clone_children_read,
3541 .write_u64 = cgroup_clone_children_write, 3545 .write_u64 = cgroup_clone_children_write,
3542 }, 3546 },
3543 { 3547 {
3544 .name = "cgroup.sane_behavior", 3548 .name = "cgroup.sane_behavior",
3545 .flags = CFTYPE_ONLY_ON_ROOT, 3549 .flags = CFTYPE_ONLY_ON_ROOT,
3546 .seq_show = cgroup_sane_behavior_show, 3550 .seq_show = cgroup_sane_behavior_show,
3547 }, 3551 },
3548 3552
3549 /* 3553 /*
3550 * Historical crazy stuff. These don't have "cgroup." prefix and 3554 * Historical crazy stuff. These don't have "cgroup." prefix and
3551 * don't exist if sane_behavior. If you're depending on these, be 3555 * don't exist if sane_behavior. If you're depending on these, be
3552 * prepared to be burned. 3556 * prepared to be burned.
3553 */ 3557 */
3554 { 3558 {
3555 .name = "tasks", 3559 .name = "tasks",
3556 .flags = CFTYPE_INSANE, /* use "procs" instead */ 3560 .flags = CFTYPE_INSANE, /* use "procs" instead */
3557 .seq_start = cgroup_pidlist_start, 3561 .seq_start = cgroup_pidlist_start,
3558 .seq_next = cgroup_pidlist_next, 3562 .seq_next = cgroup_pidlist_next,
3559 .seq_stop = cgroup_pidlist_stop, 3563 .seq_stop = cgroup_pidlist_stop,
3560 .seq_show = cgroup_pidlist_show, 3564 .seq_show = cgroup_pidlist_show,
3561 .private = CGROUP_FILE_TASKS, 3565 .private = CGROUP_FILE_TASKS,
3562 .write_u64 = cgroup_tasks_write, 3566 .write_u64 = cgroup_tasks_write,
3563 .mode = S_IRUGO | S_IWUSR, 3567 .mode = S_IRUGO | S_IWUSR,
3564 }, 3568 },
3565 { 3569 {
3566 .name = "notify_on_release", 3570 .name = "notify_on_release",
3567 .flags = CFTYPE_INSANE, 3571 .flags = CFTYPE_INSANE,
3568 .read_u64 = cgroup_read_notify_on_release, 3572 .read_u64 = cgroup_read_notify_on_release,
3569 .write_u64 = cgroup_write_notify_on_release, 3573 .write_u64 = cgroup_write_notify_on_release,
3570 }, 3574 },
3571 { 3575 {
3572 .name = "release_agent", 3576 .name = "release_agent",
3573 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 3577 .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
3574 .seq_show = cgroup_release_agent_show, 3578 .seq_show = cgroup_release_agent_show,
3575 .write_string = cgroup_release_agent_write, 3579 .write_string = cgroup_release_agent_write,
3576 .max_write_len = PATH_MAX - 1, 3580 .max_write_len = PATH_MAX - 1,
3577 }, 3581 },
3578 { } /* terminate */ 3582 { } /* terminate */
3579 }; 3583 };
3580 3584
3581 /** 3585 /**
3582 * cgroup_populate_dir - create subsys files in a cgroup directory 3586 * cgroup_populate_dir - create subsys files in a cgroup directory
3583 * @cgrp: target cgroup 3587 * @cgrp: target cgroup
3584 * @subsys_mask: mask of the subsystem ids whose files should be added 3588 * @subsys_mask: mask of the subsystem ids whose files should be added
3585 * 3589 *
3586 * On failure, no file is added. 3590 * On failure, no file is added.
3587 */ 3591 */
3588 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) 3592 static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
3589 { 3593 {
3590 struct cgroup_subsys *ss; 3594 struct cgroup_subsys *ss;
3591 int i, ret = 0; 3595 int i, ret = 0;
3592 3596
3593 /* process cftsets of each subsystem */ 3597 /* process cftsets of each subsystem */
3594 for_each_subsys(ss, i) { 3598 for_each_subsys(ss, i) {
3595 struct cftype *cfts; 3599 struct cftype *cfts;
3596 3600
3597 if (!test_bit(i, &subsys_mask)) 3601 if (!test_bit(i, &subsys_mask))
3598 continue; 3602 continue;
3599 3603
3600 list_for_each_entry(cfts, &ss->cfts, node) { 3604 list_for_each_entry(cfts, &ss->cfts, node) {
3601 ret = cgroup_addrm_files(cgrp, cfts, true); 3605 ret = cgroup_addrm_files(cgrp, cfts, true);
3602 if (ret < 0) 3606 if (ret < 0)
3603 goto err; 3607 goto err;
3604 } 3608 }
3605 } 3609 }
3606 return 0; 3610 return 0;
3607 err: 3611 err:
3608 cgroup_clear_dir(cgrp, subsys_mask); 3612 cgroup_clear_dir(cgrp, subsys_mask);
3609 return ret; 3613 return ret;
3610 } 3614 }
3611 3615
3612 /* 3616 /*
3613 * css destruction is four-stage process. 3617 * css destruction is four-stage process.
3614 * 3618 *
3615 * 1. Destruction starts. Killing of the percpu_ref is initiated. 3619 * 1. Destruction starts. Killing of the percpu_ref is initiated.
3616 * Implemented in kill_css(). 3620 * Implemented in kill_css().
3617 * 3621 *
3618 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs 3622 * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
3619 * and thus css_tryget() is guaranteed to fail, the css can be offlined 3623 * and thus css_tryget() is guaranteed to fail, the css can be offlined
3620 * by invoking offline_css(). After offlining, the base ref is put. 3624 * by invoking offline_css(). After offlining, the base ref is put.
3621 * Implemented in css_killed_work_fn(). 3625 * Implemented in css_killed_work_fn().
3622 * 3626 *
3623 * 3. When the percpu_ref reaches zero, the only possible remaining 3627 * 3. When the percpu_ref reaches zero, the only possible remaining
3624 * accessors are inside RCU read sections. css_release() schedules the 3628 * accessors are inside RCU read sections. css_release() schedules the
3625 * RCU callback. 3629 * RCU callback.
3626 * 3630 *
3627 * 4. After the grace period, the css can be freed. Implemented in 3631 * 4. After the grace period, the css can be freed. Implemented in
3628 * css_free_work_fn(). 3632 * css_free_work_fn().
3629 * 3633 *
3630 * It is actually hairier because both step 2 and 4 require process context 3634 * It is actually hairier because both step 2 and 4 require process context
3631 * and thus involve punting to css->destroy_work adding two additional 3635 * and thus involve punting to css->destroy_work adding two additional
3632 * steps to the already complex sequence. 3636 * steps to the already complex sequence.
3633 */ 3637 */
3634 static void css_free_work_fn(struct work_struct *work) 3638 static void css_free_work_fn(struct work_struct *work)
3635 { 3639 {
3636 struct cgroup_subsys_state *css = 3640 struct cgroup_subsys_state *css =
3637 container_of(work, struct cgroup_subsys_state, destroy_work); 3641 container_of(work, struct cgroup_subsys_state, destroy_work);
3638 struct cgroup *cgrp = css->cgroup; 3642 struct cgroup *cgrp = css->cgroup;
3639 3643
3640 if (css->parent) 3644 if (css->parent)
3641 css_put(css->parent); 3645 css_put(css->parent);
3642 3646
3643 css->ss->css_free(css); 3647 css->ss->css_free(css);
3644 cgroup_put(cgrp); 3648 cgroup_put(cgrp);
3645 } 3649 }
3646 3650
3647 static void css_free_rcu_fn(struct rcu_head *rcu_head) 3651 static void css_free_rcu_fn(struct rcu_head *rcu_head)
3648 { 3652 {
3649 struct cgroup_subsys_state *css = 3653 struct cgroup_subsys_state *css =
3650 container_of(rcu_head, struct cgroup_subsys_state, rcu_head); 3654 container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
3651 3655
3652 INIT_WORK(&css->destroy_work, css_free_work_fn); 3656 INIT_WORK(&css->destroy_work, css_free_work_fn);
3653 queue_work(cgroup_destroy_wq, &css->destroy_work); 3657 queue_work(cgroup_destroy_wq, &css->destroy_work);
3654 } 3658 }
3655 3659
3656 static void css_release(struct percpu_ref *ref) 3660 static void css_release(struct percpu_ref *ref)
3657 { 3661 {
3658 struct cgroup_subsys_state *css = 3662 struct cgroup_subsys_state *css =
3659 container_of(ref, struct cgroup_subsys_state, refcnt); 3663 container_of(ref, struct cgroup_subsys_state, refcnt);
3660 3664
3661 RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); 3665 RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL);
3662 call_rcu(&css->rcu_head, css_free_rcu_fn); 3666 call_rcu(&css->rcu_head, css_free_rcu_fn);
3663 } 3667 }
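css_release() and css_free_rcu_fn() follow the standard kernel idiom for RCU-deferred freeing: unpublish the object, then hand its rcu_head to call_rcu() so the actual free happens only after a grace period, when no reader can still hold a pointer to it. A minimal sketch of the bare pattern, with a hypothetical struct foo standing in for the css:

    /* kernel-side sketch of the generic call_rcu() deferred-free pattern */
    struct foo {
        struct rcu_head rcu_head;
        /* ... payload read under rcu_read_lock() ... */
    };

    static void foo_free_rcu(struct rcu_head *head)
    {
        struct foo *f = container_of(head, struct foo, rcu_head);

        kfree(f);
    }

    static void foo_release(struct foo *f)
    {
        /* clear RCU-visible pointers to f first (cf. RCU_INIT_POINTER in
         * css_release() above), then defer the actual free */
        call_rcu(&f->rcu_head, foo_free_rcu);
    }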
3664 3668
3665 static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, 3669 static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
3666 struct cgroup *cgrp) 3670 struct cgroup *cgrp)
3667 { 3671 {
3668 css->cgroup = cgrp; 3672 css->cgroup = cgrp;
3669 css->ss = ss; 3673 css->ss = ss;
3670 css->flags = 0; 3674 css->flags = 0;
3671 3675
3672 if (cgrp->parent) 3676 if (cgrp->parent)
3673 css->parent = cgroup_css(cgrp->parent, ss); 3677 css->parent = cgroup_css(cgrp->parent, ss);
3674 else 3678 else
3675 css->flags |= CSS_ROOT; 3679 css->flags |= CSS_ROOT;
3676 3680
3677 BUG_ON(cgroup_css(cgrp, ss)); 3681 BUG_ON(cgroup_css(cgrp, ss));
3678 } 3682 }
3679 3683
3680 /* invoke ->css_online() on a new CSS and mark it online if successful */ 3684 /* invoke ->css_online() on a new CSS and mark it online if successful */
3681 static int online_css(struct cgroup_subsys_state *css) 3685 static int online_css(struct cgroup_subsys_state *css)
3682 { 3686 {
3683 struct cgroup_subsys *ss = css->ss; 3687 struct cgroup_subsys *ss = css->ss;
3684 int ret = 0; 3688 int ret = 0;
3685 3689
3686 lockdep_assert_held(&cgroup_tree_mutex); 3690 lockdep_assert_held(&cgroup_tree_mutex);
3687 lockdep_assert_held(&cgroup_mutex); 3691 lockdep_assert_held(&cgroup_mutex);
3688 3692
3689 if (ss->css_online) 3693 if (ss->css_online)
3690 ret = ss->css_online(css); 3694 ret = ss->css_online(css);
3691 if (!ret) { 3695 if (!ret) {
3692 css->flags |= CSS_ONLINE; 3696 css->flags |= CSS_ONLINE;
3693 css->cgroup->nr_css++; 3697 css->cgroup->nr_css++;
3694 rcu_assign_pointer(css->cgroup->subsys[ss->id], css); 3698 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
3695 } 3699 }
3696 return ret; 3700 return ret;
3697 } 3701 }
3698 3702
3699 /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ 3703 /* if the CSS is online, invoke ->css_offline() on it and mark it offline */
3700 static void offline_css(struct cgroup_subsys_state *css) 3704 static void offline_css(struct cgroup_subsys_state *css)
3701 { 3705 {
3702 struct cgroup_subsys *ss = css->ss; 3706 struct cgroup_subsys *ss = css->ss;
3703 3707
3704 lockdep_assert_held(&cgroup_tree_mutex); 3708 lockdep_assert_held(&cgroup_tree_mutex);
3705 lockdep_assert_held(&cgroup_mutex); 3709 lockdep_assert_held(&cgroup_mutex);
3706 3710
3707 if (!(css->flags & CSS_ONLINE)) 3711 if (!(css->flags & CSS_ONLINE))
3708 return; 3712 return;
3709 3713
3710 if (ss->css_offline) 3714 if (ss->css_offline)
3711 ss->css_offline(css); 3715 ss->css_offline(css);
3712 3716
3713 css->flags &= ~CSS_ONLINE; 3717 css->flags &= ~CSS_ONLINE;
3714 css->cgroup->nr_css--; 3718 css->cgroup->nr_css--;
3715 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); 3719 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
3716 } 3720 }
3717 3721
3718 /** 3722 /**
3719 * create_css - create a cgroup_subsys_state 3723 * create_css - create a cgroup_subsys_state
3720 * @cgrp: the cgroup new css will be associated with 3724 * @cgrp: the cgroup new css will be associated with
3721 * @ss: the subsys of new css 3725 * @ss: the subsys of new css
3722 * 3726 *
3723 * Create a new css associated with @cgrp - @ss pair. On success, the new 3727 * Create a new css associated with @cgrp - @ss pair. On success, the new
3724 * css is online and installed in @cgrp with all interface files created. 3728 * css is online and installed in @cgrp with all interface files created.
3725 * Returns 0 on success, -errno on failure. 3729 * Returns 0 on success, -errno on failure.
3726 */ 3730 */
3727 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) 3731 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
3728 { 3732 {
3729 struct cgroup *parent = cgrp->parent; 3733 struct cgroup *parent = cgrp->parent;
3730 struct cgroup_subsys_state *css; 3734 struct cgroup_subsys_state *css;
3731 int err; 3735 int err;
3732 3736
3733 lockdep_assert_held(&cgroup_mutex); 3737 lockdep_assert_held(&cgroup_mutex);
3734 3738
3735 css = ss->css_alloc(cgroup_css(parent, ss)); 3739 css = ss->css_alloc(cgroup_css(parent, ss));
3736 if (IS_ERR(css)) 3740 if (IS_ERR(css))
3737 return PTR_ERR(css); 3741 return PTR_ERR(css);
3738 3742
3739 err = percpu_ref_init(&css->refcnt, css_release); 3743 err = percpu_ref_init(&css->refcnt, css_release);
3740 if (err) 3744 if (err)
3741 goto err_free_css; 3745 goto err_free_css;
3742 3746
3743 init_css(css, ss, cgrp); 3747 init_css(css, ss, cgrp);
3744 3748
3745 err = cgroup_populate_dir(cgrp, 1 << ss->id); 3749 err = cgroup_populate_dir(cgrp, 1 << ss->id);
3746 if (err) 3750 if (err)
3747 goto err_free_percpu_ref; 3751 goto err_free_percpu_ref;
3748 3752
3749 err = online_css(css); 3753 err = online_css(css);
3750 if (err) 3754 if (err)
3751 goto err_clear_dir; 3755 goto err_clear_dir;
3752 3756
3753 cgroup_get(cgrp); 3757 cgroup_get(cgrp);
3754 css_get(css->parent); 3758 css_get(css->parent);
3755 3759
3756 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 3760 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
3757 parent->parent) { 3761 parent->parent) {
3758 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", 3762 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
3759 current->comm, current->pid, ss->name); 3763 current->comm, current->pid, ss->name);
3760 if (!strcmp(ss->name, "memory")) 3764 if (!strcmp(ss->name, "memory"))
3761 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); 3765 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
3762 ss->warned_broken_hierarchy = true; 3766 ss->warned_broken_hierarchy = true;
3763 } 3767 }
3764 3768
3765 return 0; 3769 return 0;
3766 3770
3767 err_clear_dir: 3771 err_clear_dir:
3768 cgroup_clear_dir(css->cgroup, 1 << css->ss->id); 3772 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
3769 err_free_percpu_ref: 3773 err_free_percpu_ref:
3770 percpu_ref_cancel_init(&css->refcnt); 3774 percpu_ref_cancel_init(&css->refcnt);
3771 err_free_css: 3775 err_free_css:
3772 ss->css_free(css); 3776 ss->css_free(css);
3773 return err; 3777 return err;
3774 } 3778 }
3775 3779
3776 /** 3780 /**
3777 * cgroup_create - create a cgroup 3781 * cgroup_create - create a cgroup
3778 * @parent: cgroup that will be parent of the new cgroup 3782 * @parent: cgroup that will be parent of the new cgroup
3779 * @name: name of the new cgroup 3783 * @name: name of the new cgroup
3780 * @mode: mode to set on new cgroup 3784 * @mode: mode to set on new cgroup
3781 */ 3785 */
3782 static long cgroup_create(struct cgroup *parent, const char *name, 3786 static long cgroup_create(struct cgroup *parent, const char *name,
3783 umode_t mode) 3787 umode_t mode)
3784 { 3788 {
3785 struct cgroup *cgrp; 3789 struct cgroup *cgrp;
3786 struct cgroup_root *root = parent->root; 3790 struct cgroup_root *root = parent->root;
3787 int ssid, err; 3791 int ssid, err;
3788 struct cgroup_subsys *ss; 3792 struct cgroup_subsys *ss;
3789 struct kernfs_node *kn; 3793 struct kernfs_node *kn;
3790 3794
3791 /* allocate the cgroup and its ID, 0 is reserved for the root */ 3795 /* allocate the cgroup and its ID, 0 is reserved for the root */
3792 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 3796 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
3793 if (!cgrp) 3797 if (!cgrp)
3794 return -ENOMEM; 3798 return -ENOMEM;
3795 3799
3796 mutex_lock(&cgroup_tree_mutex); 3800 mutex_lock(&cgroup_tree_mutex);
3797 3801
3798 /* 3802 /*
3799 * Only live parents can have children. Note that the liveliness 3803 * Only live parents can have children. Note that the liveliness
3800 * check isn't strictly necessary because cgroup_mkdir() and 3804 * check isn't strictly necessary because cgroup_mkdir() and
3801 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it 3805 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
3802 * anyway so that locking is contained inside cgroup proper and we 3806 * anyway so that locking is contained inside cgroup proper and we
3803 * don't get nasty surprises if we ever grow another caller. 3807 * don't get nasty surprises if we ever grow another caller.
3804 */ 3808 */
3805 if (!cgroup_lock_live_group(parent)) { 3809 if (!cgroup_lock_live_group(parent)) {
3806 err = -ENODEV; 3810 err = -ENODEV;
3807 goto err_unlock_tree; 3811 goto err_unlock_tree;
3808 } 3812 }
3809 3813
3810 /* 3814 /*
3811 * Temporarily set the pointer to NULL, so idr_find() won't return 3815 * Temporarily set the pointer to NULL, so idr_find() won't return
3812 * a half-baked cgroup. 3816 * a half-baked cgroup.
3813 */ 3817 */
3814 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); 3818 cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
3815 if (cgrp->id < 0) { 3819 if (cgrp->id < 0) {
3816 err = -ENOMEM; 3820 err = -ENOMEM;
3817 goto err_unlock; 3821 goto err_unlock;
3818 } 3822 }
3819 3823
3820 init_cgroup_housekeeping(cgrp); 3824 init_cgroup_housekeeping(cgrp);
3821 3825
3822 cgrp->parent = parent; 3826 cgrp->parent = parent;
3823 cgrp->dummy_css.parent = &parent->dummy_css; 3827 cgrp->dummy_css.parent = &parent->dummy_css;
3824 cgrp->root = parent->root; 3828 cgrp->root = parent->root;
3825 3829
3826 if (notify_on_release(parent)) 3830 if (notify_on_release(parent))
3827 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3831 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
3828 3832
3829 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 3833 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
3830 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 3834 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
3831 3835
3832 /* create the directory */ 3836 /* create the directory */
3833 kn = kernfs_create_dir(parent->kn, name, mode, cgrp); 3837 kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
3834 if (IS_ERR(kn)) { 3838 if (IS_ERR(kn)) {
3835 err = PTR_ERR(kn); 3839 err = PTR_ERR(kn);
3836 goto err_free_id; 3840 goto err_free_id;
3837 } 3841 }
3838 cgrp->kn = kn; 3842 cgrp->kn = kn;
3839 3843
3840 /* 3844 /*
3841 * This extra ref will be put in cgroup_free_fn() and guarantees 3845 * This extra ref will be put in cgroup_free_fn() and guarantees
3842 * that @cgrp->kn is always accessible. 3846 * that @cgrp->kn is always accessible.
3843 */ 3847 */
3844 kernfs_get(kn); 3848 kernfs_get(kn);
3845 3849
3846 cgrp->serial_nr = cgroup_serial_nr_next++; 3850 cgrp->serial_nr = cgroup_serial_nr_next++;
3847 3851
3848 /* allocation complete, commit to creation */ 3852 /* allocation complete, commit to creation */
3849 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 3853 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
3850 atomic_inc(&root->nr_cgrps); 3854 atomic_inc(&root->nr_cgrps);
3851 cgroup_get(parent); 3855 cgroup_get(parent);
3852 3856
3853 /* 3857 /*
3854 * @cgrp is now fully operational. If something fails after this 3858 * @cgrp is now fully operational. If something fails after this
3855 * point, it'll be released via the normal destruction path. 3859 * point, it'll be released via the normal destruction path.
3856 */ 3860 */
3857 idr_replace(&root->cgroup_idr, cgrp, cgrp->id); 3861 idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
3858 3862
3859 err = cgroup_kn_set_ugid(kn); 3863 err = cgroup_kn_set_ugid(kn);
3860 if (err) 3864 if (err)
3861 goto err_destroy; 3865 goto err_destroy;
3862 3866
3863 err = cgroup_addrm_files(cgrp, cgroup_base_files, true); 3867 err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
3864 if (err) 3868 if (err)
3865 goto err_destroy; 3869 goto err_destroy;
3866 3870
3867 /* let's create and online css's */ 3871 /* let's create and online css's */
3868 for_each_subsys(ss, ssid) { 3872 for_each_subsys(ss, ssid) {
3869 if (parent->child_subsys_mask & (1 << ssid)) { 3873 if (parent->child_subsys_mask & (1 << ssid)) {
3870 err = create_css(cgrp, ss); 3874 err = create_css(cgrp, ss);
3871 if (err) 3875 if (err)
3872 goto err_destroy; 3876 goto err_destroy;
3873 } 3877 }
3874 } 3878 }
3875 3879
3876 /* 3880 /*
3877 * On the default hierarchy, a child doesn't automatically inherit 3881 * On the default hierarchy, a child doesn't automatically inherit
3878 * child_subsys_mask from the parent. Each is configured manually. 3882 * child_subsys_mask from the parent. Each is configured manually.
3879 */ 3883 */
3880 if (!cgroup_on_dfl(cgrp)) 3884 if (!cgroup_on_dfl(cgrp))
3881 cgrp->child_subsys_mask = parent->child_subsys_mask; 3885 cgrp->child_subsys_mask = parent->child_subsys_mask;
3882 3886
3883 kernfs_activate(kn); 3887 kernfs_activate(kn);
3884 3888
3885 mutex_unlock(&cgroup_mutex); 3889 mutex_unlock(&cgroup_mutex);
3886 mutex_unlock(&cgroup_tree_mutex); 3890 mutex_unlock(&cgroup_tree_mutex);
3887 3891
3888 return 0; 3892 return 0;
3889 3893
3890 err_free_id: 3894 err_free_id:
3891 idr_remove(&root->cgroup_idr, cgrp->id); 3895 idr_remove(&root->cgroup_idr, cgrp->id);
3892 err_unlock: 3896 err_unlock:
3893 mutex_unlock(&cgroup_mutex); 3897 mutex_unlock(&cgroup_mutex);
3894 err_unlock_tree: 3898 err_unlock_tree:
3895 mutex_unlock(&cgroup_tree_mutex); 3899 mutex_unlock(&cgroup_tree_mutex);
3896 kfree(cgrp); 3900 kfree(cgrp);
3897 return err; 3901 return err;
3898 3902
3899 err_destroy: 3903 err_destroy:
3900 cgroup_destroy_locked(cgrp); 3904 cgroup_destroy_locked(cgrp);
3901 mutex_unlock(&cgroup_mutex); 3905 mutex_unlock(&cgroup_mutex);
3902 mutex_unlock(&cgroup_tree_mutex); 3906 mutex_unlock(&cgroup_tree_mutex);
3903 return err; 3907 return err;
3904 } 3908 }
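All of the above is driven from userland by nothing more than mkdir(2) on a mounted hierarchy (see cgroup_mkdir() below). A runnable sketch, with an assumed mount point and cgroup name:

    #include <stdio.h>
    #include <sys/stat.h>
    #include <sys/types.h>

    int main(void)
    {
        /* hypothetical path; mkdir(2) lands in cgroup_mkdir() ->
         * cgroup_create(), which allocates the cgroup and its ID, creates
         * the kernfs directory and base files, and onlines enabled css's */
        if (mkdir("/sys/fs/cgroup/cpu/demo", 0755) != 0) {
            perror("mkdir");
            return 1;
        }
        return 0;
    }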
3905 3909
3906 static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, 3910 static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
3907 umode_t mode) 3911 umode_t mode)
3908 { 3912 {
3909 struct cgroup *parent = parent_kn->priv; 3913 struct cgroup *parent = parent_kn->priv;
3910 int ret; 3914 int ret;
3911 3915
3912 /* 3916 /*
3913 * cgroup_create() grabs cgroup_tree_mutex which nests outside 3917 * cgroup_create() grabs cgroup_tree_mutex which nests outside
3914 * kernfs active_ref and cgroup_create() already synchronizes 3918 * kernfs active_ref and cgroup_create() already synchronizes
3915 * properly against removal through cgroup_lock_live_group(). 3919 * properly against removal through cgroup_lock_live_group().
3916 * Break it before calling cgroup_create(). 3920 * Break it before calling cgroup_create().
3917 */ 3921 */
3918 cgroup_get(parent); 3922 cgroup_get(parent);
3919 kernfs_break_active_protection(parent_kn); 3923 kernfs_break_active_protection(parent_kn);
3920 3924
3921 ret = cgroup_create(parent, name, mode); 3925 ret = cgroup_create(parent, name, mode);
3922 3926
3923 kernfs_unbreak_active_protection(parent_kn); 3927 kernfs_unbreak_active_protection(parent_kn);
3924 cgroup_put(parent); 3928 cgroup_put(parent);
3925 return ret; 3929 return ret;
3926 } 3930 }
3927 3931
3928 /* 3932 /*
3929 * This is called when the refcnt of a css is confirmed to be killed. 3933 * This is called when the refcnt of a css is confirmed to be killed.
3930 * css_tryget() is now guaranteed to fail. 3934 * css_tryget() is now guaranteed to fail.
3931 */ 3935 */
3932 static void css_killed_work_fn(struct work_struct *work) 3936 static void css_killed_work_fn(struct work_struct *work)
3933 { 3937 {
3934 struct cgroup_subsys_state *css = 3938 struct cgroup_subsys_state *css =
3935 container_of(work, struct cgroup_subsys_state, destroy_work); 3939 container_of(work, struct cgroup_subsys_state, destroy_work);
3936 struct cgroup *cgrp = css->cgroup; 3940 struct cgroup *cgrp = css->cgroup;
3937 3941
3938 mutex_lock(&cgroup_tree_mutex); 3942 mutex_lock(&cgroup_tree_mutex);
3939 mutex_lock(&cgroup_mutex); 3943 mutex_lock(&cgroup_mutex);
3940 3944
3941 /* 3945 /*
3942 * css_tryget() is guaranteed to fail now. Tell subsystems to 3946 * css_tryget() is guaranteed to fail now. Tell subsystems to
3943 * initiate destruction. 3947 * initiate destruction.
3944 */ 3948 */
3945 offline_css(css); 3949 offline_css(css);
3946 3950
3947 /* 3951 /*
3948 * If @cgrp is marked dead, it's waiting for refs of all css's to 3952 * If @cgrp is marked dead, it's waiting for refs of all css's to
3949 * be disabled before proceeding to the second phase of cgroup 3953 * be disabled before proceeding to the second phase of cgroup
3950 * destruction. If we are the last one, kick it off. 3954 * destruction. If we are the last one, kick it off.
3951 */ 3955 */
3952 if (!cgrp->nr_css && cgroup_is_dead(cgrp)) 3956 if (!cgrp->nr_css && cgroup_is_dead(cgrp))
3953 cgroup_destroy_css_killed(cgrp); 3957 cgroup_destroy_css_killed(cgrp);
3954 3958
3955 mutex_unlock(&cgroup_mutex); 3959 mutex_unlock(&cgroup_mutex);
3956 mutex_unlock(&cgroup_tree_mutex); 3960 mutex_unlock(&cgroup_tree_mutex);
3957 3961
3958 /* 3962 /*
3959 * Put the css refs from kill_css(). Each css holds an extra 3963 * Put the css refs from kill_css(). Each css holds an extra
3960 * reference to the cgroup's dentry and cgroup removal proceeds 3964 * reference to the cgroup's dentry and cgroup removal proceeds
3961 * regardless of css refs. On the last put of each css, whenever 3965 * regardless of css refs. On the last put of each css, whenever
3962 * that may be, the extra dentry ref is put so that dentry 3966 * that may be, the extra dentry ref is put so that dentry
3963 * destruction happens only after all css's are released. 3967 * destruction happens only after all css's are released.
3964 */ 3968 */
3965 css_put(css); 3969 css_put(css);
3966 } 3970 }
3967 3971
3968 /* css kill confirmation processing requires process context, bounce */ 3972 /* css kill confirmation processing requires process context, bounce */
3969 static void css_killed_ref_fn(struct percpu_ref *ref) 3973 static void css_killed_ref_fn(struct percpu_ref *ref)
3970 { 3974 {
3971 struct cgroup_subsys_state *css = 3975 struct cgroup_subsys_state *css =
3972 container_of(ref, struct cgroup_subsys_state, refcnt); 3976 container_of(ref, struct cgroup_subsys_state, refcnt);
3973 3977
3974 INIT_WORK(&css->destroy_work, css_killed_work_fn); 3978 INIT_WORK(&css->destroy_work, css_killed_work_fn);
3975 queue_work(cgroup_destroy_wq, &css->destroy_work); 3979 queue_work(cgroup_destroy_wq, &css->destroy_work);
3976 } 3980 }
3977 3981
3978 /** 3982 /**
3979 * kill_css - destroy a css 3983 * kill_css - destroy a css
3980 * @css: css to destroy 3984 * @css: css to destroy
3981 * 3985 *
3982 * This function initiates destruction of @css by removing cgroup interface 3986 * This function initiates destruction of @css by removing cgroup interface
3983 * files and putting its base reference. ->css_offline() will be invoked 3987 * files and putting its base reference. ->css_offline() will be invoked
3984 * asynchronously once css_tryget() is guaranteed to fail and when the 3988 * asynchronously once css_tryget() is guaranteed to fail and when the
3985 * reference count reaches zero, @css will be released. 3989 * reference count reaches zero, @css will be released.
3986 */ 3990 */
3987 static void kill_css(struct cgroup_subsys_state *css) 3991 static void kill_css(struct cgroup_subsys_state *css)
3988 { 3992 {
3989 lockdep_assert_held(&cgroup_tree_mutex); 3993 lockdep_assert_held(&cgroup_tree_mutex);
3990 3994
3991 /* 3995 /*
3992 * This must happen before css is disassociated with its cgroup. 3996 * This must happen before css is disassociated with its cgroup.
3993 * See seq_css() for details. 3997 * See seq_css() for details.
3994 */ 3998 */
3995 cgroup_clear_dir(css->cgroup, 1 << css->ss->id); 3999 cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
3996 4000
3997 /* 4001 /*
3998 * Killing would put the base ref, but we need to keep it alive 4002 * Killing would put the base ref, but we need to keep it alive
3999 * until after ->css_offline(). 4003 * until after ->css_offline().
4000 */ 4004 */
4001 css_get(css); 4005 css_get(css);
4002 4006
4003 /* 4007 /*
4004 * cgroup core guarantees that, by the time ->css_offline() is 4008 * cgroup core guarantees that, by the time ->css_offline() is
4005 * invoked, no new css reference will be given out via 4009 * invoked, no new css reference will be given out via
4006 * css_tryget(). We can't simply call percpu_ref_kill() and 4010 * css_tryget(). We can't simply call percpu_ref_kill() and
4007 * proceed to offlining css's because percpu_ref_kill() doesn't 4011 * proceed to offlining css's because percpu_ref_kill() doesn't
4008 * guarantee that the ref is seen as killed on all CPUs on return. 4012 * guarantee that the ref is seen as killed on all CPUs on return.
4009 * 4013 *
4010 * Use percpu_ref_kill_and_confirm() to get notifications as each 4014 * Use percpu_ref_kill_and_confirm() to get notifications as each
4011 * css is confirmed to be seen as killed on all CPUs. 4015 * css is confirmed to be seen as killed on all CPUs.
4012 */ 4016 */
4013 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); 4017 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
4014 } 4018 }
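kill_css() leans on the generic percpu_ref kill-and-confirm pattern: after percpu_ref_kill_and_confirm(), the confirmation callback runs once every CPU is guaranteed to see the ref as killed (so tryget fails), and the release callback runs when the count finally reaches zero. A hedged sketch of that pattern in isolation, with a hypothetical struct bar in place of the css:

    /* kernel-side sketch of the percpu_ref lifecycle kill_css() builds on */
    struct bar {
        struct percpu_ref ref;
    };

    static void bar_killed(struct percpu_ref *ref)
    {
        /* analogous to css_killed_ref_fn(): new references can no longer
         * be taken anywhere, so it is safe to start offlining */
    }

    static void bar_release(struct percpu_ref *ref)
    {
        /* analogous to css_release(): the last reference is gone */
    }

    static int bar_init(struct bar *b)
    {
        return percpu_ref_init(&b->ref, bar_release);
    }

    static void bar_kill(struct bar *b)
    {
        percpu_ref_kill_and_confirm(&b->ref, bar_killed);
    }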
4015 4019
4016 /** 4020 /**
4017 * cgroup_destroy_locked - the first stage of cgroup destruction 4021 * cgroup_destroy_locked - the first stage of cgroup destruction
4018 * @cgrp: cgroup to be destroyed 4022 * @cgrp: cgroup to be destroyed
4019 * 4023 *
4020 * css's make use of percpu refcnts whose killing latency shouldn't be 4024 * css's make use of percpu refcnts whose killing latency shouldn't be
4021 * exposed to userland and are RCU protected. Also, cgroup core needs to 4025 * exposed to userland and are RCU protected. Also, cgroup core needs to
4022 * guarantee that css_tryget() won't succeed by the time ->css_offline() is 4026 * guarantee that css_tryget() won't succeed by the time ->css_offline() is
4023 * invoked. To satisfy all the requirements, destruction is implemented in 4027 * invoked. To satisfy all the requirements, destruction is implemented in
4024 * the following two steps. 4028 * the following two steps.
4025 * 4029 *
4026 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all 4030 * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
4027 * userland visible parts and start killing the percpu refcnts of 4031 * userland visible parts and start killing the percpu refcnts of
4028 * css's. Set up so that the next stage will be kicked off once all 4032 * css's. Set up so that the next stage will be kicked off once all
4029 * the percpu refcnts are confirmed to be killed. 4033 * the percpu refcnts are confirmed to be killed.
4030 * 4034 *
4031 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the 4035 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
4032 * rest of destruction. Once all cgroup references are gone, the 4036 * rest of destruction. Once all cgroup references are gone, the
4033 * cgroup is RCU-freed. 4037 * cgroup is RCU-freed.
4034 * 4038 *
4035 * This function implements s1. After this step, @cgrp is gone as far as 4039 * This function implements s1. After this step, @cgrp is gone as far as
4036 * the userland is concerned and a new cgroup with the same name may be 4040 * the userland is concerned and a new cgroup with the same name may be
4037 * created. As cgroup doesn't care about the names internally, this 4041 * created. As cgroup doesn't care about the names internally, this
4038 * doesn't cause any problem. 4042 * doesn't cause any problem.
4039 */ 4043 */
4040 static int cgroup_destroy_locked(struct cgroup *cgrp) 4044 static int cgroup_destroy_locked(struct cgroup *cgrp)
4041 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4045 __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
4042 { 4046 {
4043 struct cgroup *child; 4047 struct cgroup *child;
4044 struct cgroup_subsys_state *css; 4048 struct cgroup_subsys_state *css;
4045 bool empty; 4049 bool empty;
4046 int ssid; 4050 int ssid;
4047 4051
4048 lockdep_assert_held(&cgroup_tree_mutex); 4052 lockdep_assert_held(&cgroup_tree_mutex);
4049 lockdep_assert_held(&cgroup_mutex); 4053 lockdep_assert_held(&cgroup_mutex);
4050 4054
4051 /* 4055 /*
4052 * css_set_rwsem synchronizes access to ->cset_links and prevents 4056 * css_set_rwsem synchronizes access to ->cset_links and prevents
4053 * @cgrp from being removed while put_css_set() is in progress. 4057 * @cgrp from being removed while put_css_set() is in progress.
4054 */ 4058 */
4055 down_read(&css_set_rwsem); 4059 down_read(&css_set_rwsem);
4056 empty = list_empty(&cgrp->cset_links); 4060 empty = list_empty(&cgrp->cset_links);
4057 up_read(&css_set_rwsem); 4061 up_read(&css_set_rwsem);
4058 if (!empty) 4062 if (!empty)
4059 return -EBUSY; 4063 return -EBUSY;
4060 4064
4061 /* 4065 /*
4062 * Make sure there are no live children. We can't test ->children 4066 * Make sure there are no live children. We can't test ->children
4063 * emptiness as dead children linger on it while being destroyed; 4067 * emptiness as dead children linger on it while being destroyed;
4064 * otherwise, "rmdir parent/child parent" may fail with -EBUSY. 4068 * otherwise, "rmdir parent/child parent" may fail with -EBUSY.
4065 */ 4069 */
4066 empty = true; 4070 empty = true;
4067 rcu_read_lock(); 4071 rcu_read_lock();
4068 list_for_each_entry_rcu(child, &cgrp->children, sibling) { 4072 list_for_each_entry_rcu(child, &cgrp->children, sibling) {
4069 empty = cgroup_is_dead(child); 4073 empty = cgroup_is_dead(child);
4070 if (!empty) 4074 if (!empty)
4071 break; 4075 break;
4072 } 4076 }
4073 rcu_read_unlock(); 4077 rcu_read_unlock();
4074 if (!empty) 4078 if (!empty)
4075 return -EBUSY; 4079 return -EBUSY;
4076 4080
4077 /* 4081 /*
4078 * Mark @cgrp dead. This prevents further task migration and child 4082 * Mark @cgrp dead. This prevents further task migration and child
4079 * creation by disabling cgroup_lock_live_group(). Note that 4083 * creation by disabling cgroup_lock_live_group(). Note that
4080 * CGRP_DEAD assertion is depended upon by css_next_child() to 4084 * CGRP_DEAD assertion is depended upon by css_next_child() to
4081 * resume iteration after dropping RCU read lock. See 4085 * resume iteration after dropping RCU read lock. See
4082 * css_next_child() for details. 4086 * css_next_child() for details.
4083 */ 4087 */
4084 set_bit(CGRP_DEAD, &cgrp->flags); 4088 set_bit(CGRP_DEAD, &cgrp->flags);
4085 4089
4086 /* 4090 /*
4087 * Initiate massacre of all css's. cgroup_destroy_css_killed() 4091 * Initiate massacre of all css's. cgroup_destroy_css_killed()
4088 * will be invoked to perform the rest of destruction once the 4092 * will be invoked to perform the rest of destruction once the
4089 * percpu refs of all css's are confirmed to be killed. This 4093 * percpu refs of all css's are confirmed to be killed. This
4090 * involves removing the subsystem's files, so cgroup_mutex is dropped. 4094 * involves removing the subsystem's files, so cgroup_mutex is dropped.
4091 */ 4095 */
4092 mutex_unlock(&cgroup_mutex); 4096 mutex_unlock(&cgroup_mutex);
4093 for_each_css(css, ssid, cgrp) 4097 for_each_css(css, ssid, cgrp)
4094 kill_css(css); 4098 kill_css(css);
4095 mutex_lock(&cgroup_mutex); 4099 mutex_lock(&cgroup_mutex);
4096 4100
4097 /* CGRP_DEAD is set, remove from ->release_list for the last time */ 4101 /* CGRP_DEAD is set, remove from ->release_list for the last time */
4098 raw_spin_lock(&release_list_lock); 4102 raw_spin_lock(&release_list_lock);
4099 if (!list_empty(&cgrp->release_list)) 4103 if (!list_empty(&cgrp->release_list))
4100 list_del_init(&cgrp->release_list); 4104 list_del_init(&cgrp->release_list);
4101 raw_spin_unlock(&release_list_lock); 4105 raw_spin_unlock(&release_list_lock);
4102 4106
4103 /* 4107 /*
4104 * If @cgrp has css's attached, the second stage of cgroup 4108 * If @cgrp has css's attached, the second stage of cgroup
4105 * destruction is kicked off from css_killed_work_fn() after the 4109 * destruction is kicked off from css_killed_work_fn() after the
4106 * refs of all attached css's are killed. If @cgrp doesn't have 4110 * refs of all attached css's are killed. If @cgrp doesn't have
4107 * any css, we kick it off here. 4111 * any css, we kick it off here.
4108 */ 4112 */
4109 if (!cgrp->nr_css) 4113 if (!cgrp->nr_css)
4110 cgroup_destroy_css_killed(cgrp); 4114 cgroup_destroy_css_killed(cgrp);
4111 4115
4112 /* remove @cgrp directory along with the base files */ 4116 /* remove @cgrp directory along with the base files */
4113 mutex_unlock(&cgroup_mutex); 4117 mutex_unlock(&cgroup_mutex);
4114 4118
4115 /* 4119 /*
4116 * There are two control paths which try to determine cgroup from 4120 * There are two control paths which try to determine cgroup from
4117 * dentry without going through kernfs - cgroupstats_build() and 4121 * dentry without going through kernfs - cgroupstats_build() and
4118 * css_tryget_from_dir(). Those are supported by RCU protecting 4122 * css_tryget_from_dir(). Those are supported by RCU protecting
4119 * clearing of cgrp->kn->priv backpointer, which should happen 4123 * clearing of cgrp->kn->priv backpointer, which should happen
4120 * after all files under it have been removed. 4124 * after all files under it have been removed.
4121 */ 4125 */
4122 kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ 4126 kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */
4123 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); 4127 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
4124 4128
4125 mutex_lock(&cgroup_mutex); 4129 mutex_lock(&cgroup_mutex);
4126 4130
4127 return 0; 4131 return 0;
4128 }; 4132 };
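From userland this first stage is triggered by rmdir(2); a cgroup that still contains tasks or live children is refused with -EBUSY, exactly as the checks above implement. A runnable sketch (the path is an assumption):

    #include <errno.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        if (rmdir("/sys/fs/cgroup/cpu/demo") != 0) {
            if (errno == EBUSY)
                fprintf(stderr, "cgroup still has tasks or children\n");
            else
                perror("rmdir");
            return 1;
        }
        return 0;
    }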
4129 4133
4130 /** 4134 /**
4131 * cgroup_destroy_css_killed - the second step of cgroup destruction 4135 * cgroup_destroy_css_killed - the second step of cgroup destruction
4132 * @work: cgroup->destroy_free_work 4136 * @work: cgroup->destroy_free_work
4133 * 4137 *
4134 * This function is invoked from a work item for a cgroup which is being 4138 * This function is invoked from a work item for a cgroup which is being
4135 * destroyed after all css's are offlined and performs the rest of 4139 * destroyed after all css's are offlined and performs the rest of
4136 * destruction. This is the second step of destruction described in the 4140 * destruction. This is the second step of destruction described in the
4137 * comment above cgroup_destroy_locked(). 4141 * comment above cgroup_destroy_locked().
4138 */ 4142 */
4139 static void cgroup_destroy_css_killed(struct cgroup *cgrp) 4143 static void cgroup_destroy_css_killed(struct cgroup *cgrp)
4140 { 4144 {
4141 struct cgroup *parent = cgrp->parent; 4145 struct cgroup *parent = cgrp->parent;
4142 4146
4143 lockdep_assert_held(&cgroup_tree_mutex); 4147 lockdep_assert_held(&cgroup_tree_mutex);
4144 lockdep_assert_held(&cgroup_mutex); 4148 lockdep_assert_held(&cgroup_mutex);
4145 4149
4146 /* delete this cgroup from parent->children */ 4150 /* delete this cgroup from parent->children */
4147 list_del_rcu(&cgrp->sibling); 4151 list_del_rcu(&cgrp->sibling);
4148 4152
4149 cgroup_put(cgrp); 4153 cgroup_put(cgrp);
4150 4154
4151 set_bit(CGRP_RELEASABLE, &parent->flags); 4155 set_bit(CGRP_RELEASABLE, &parent->flags);
4152 check_for_release(parent); 4156 check_for_release(parent);
4153 } 4157 }
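check_for_release() is what eventually invokes the hierarchy's release_agent for an emptied cgroup with notify_on_release set (the agent is run with the path of the emptied cgroup as its argument). A minimal userspace sketch that opts a cgroup into that behavior; the mount point, cgroup name and agent path are assumptions:

    #include <stdio.h>

    /* write a short string to a cgroup control file */
    static int write_str(const char *path, const char *val)
    {
        FILE *f = fopen(path, "w");

        if (!f)
            return -1;
        fputs(val, f);
        return fclose(f);
    }

    int main(void)
    {
        if (write_str("/sys/fs/cgroup/cpu/release_agent",
                      "/usr/local/bin/cgroup-reaper") ||
            write_str("/sys/fs/cgroup/cpu/demo/notify_on_release", "1")) {
            perror("write");
            return 1;
        }
        return 0;
    }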
4154 4158
4155 static int cgroup_rmdir(struct kernfs_node *kn) 4159 static int cgroup_rmdir(struct kernfs_node *kn)
4156 { 4160 {
4157 struct cgroup *cgrp = kn->priv; 4161 struct cgroup *cgrp = kn->priv;
4158 int ret = 0; 4162 int ret = 0;
4159 4163
4160 /* 4164 /*
4161 * This is self-destruction but @kn can't be removed while this 4165 * This is self-destruction but @kn can't be removed while this
4162 * callback is in progress. Let's break active protection. Once 4166 * callback is in progress. Let's break active protection. Once
4163 * the protection is broken, @cgrp can be destroyed at any point. 4167 * the protection is broken, @cgrp can be destroyed at any point.
4164 * Pin it so that it stays accessible. 4168 * Pin it so that it stays accessible.
4165 */ 4169 */
4166 cgroup_get(cgrp); 4170 cgroup_get(cgrp);
4167 kernfs_break_active_protection(kn); 4171 kernfs_break_active_protection(kn);
4168 4172
4169 mutex_lock(&cgroup_tree_mutex); 4173 mutex_lock(&cgroup_tree_mutex);
4170 mutex_lock(&cgroup_mutex); 4174 mutex_lock(&cgroup_mutex);
4171 4175
4172 /* 4176 /*
4173 * @cgrp might already have been destroyed while we're trying to 4177 * @cgrp might already have been destroyed while we're trying to
4174 * grab the mutexes. 4178 * grab the mutexes.
4175 */ 4179 */
4176 if (!cgroup_is_dead(cgrp)) 4180 if (!cgroup_is_dead(cgrp))
4177 ret = cgroup_destroy_locked(cgrp); 4181 ret = cgroup_destroy_locked(cgrp);
4178 4182
4179 mutex_unlock(&cgroup_mutex); 4183 mutex_unlock(&cgroup_mutex);
4180 mutex_unlock(&cgroup_tree_mutex); 4184 mutex_unlock(&cgroup_tree_mutex);
4181 4185
4182 kernfs_unbreak_active_protection(kn); 4186 kernfs_unbreak_active_protection(kn);
4183 cgroup_put(cgrp); 4187 cgroup_put(cgrp);
4184 return ret; 4188 return ret;
4185 } 4189 }
4186 4190
4187 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { 4191 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
4188 .remount_fs = cgroup_remount, 4192 .remount_fs = cgroup_remount,
4189 .show_options = cgroup_show_options, 4193 .show_options = cgroup_show_options,
4190 .mkdir = cgroup_mkdir, 4194 .mkdir = cgroup_mkdir,
4191 .rmdir = cgroup_rmdir, 4195 .rmdir = cgroup_rmdir,
4192 .rename = cgroup_rename, 4196 .rename = cgroup_rename,
4193 }; 4197 };
4194 4198
4195 static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4199 static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4196 { 4200 {
4197 struct cgroup_subsys_state *css; 4201 struct cgroup_subsys_state *css;
4198 4202
4199 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4203 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
4200 4204
4201 mutex_lock(&cgroup_tree_mutex); 4205 mutex_lock(&cgroup_tree_mutex);
4202 mutex_lock(&cgroup_mutex); 4206 mutex_lock(&cgroup_mutex);
4203 4207
4204 INIT_LIST_HEAD(&ss->cfts); 4208 INIT_LIST_HEAD(&ss->cfts);
4205 4209
4206 /* Create the root cgroup state for this subsystem */ 4210 /* Create the root cgroup state for this subsystem */
4207 ss->root = &cgrp_dfl_root; 4211 ss->root = &cgrp_dfl_root;
4208 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); 4212 css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
4209 /* We don't handle early failures gracefully */ 4213 /* We don't handle early failures gracefully */
4210 BUG_ON(IS_ERR(css)); 4214 BUG_ON(IS_ERR(css));
4211 init_css(css, ss, &cgrp_dfl_root.cgrp); 4215 init_css(css, ss, &cgrp_dfl_root.cgrp);
4212 4216
4213 /* Update the init_css_set to contain a subsys 4217 /* Update the init_css_set to contain a subsys
4214 * pointer to this state - since the subsystem is 4218 * pointer to this state - since the subsystem is
4215 * newly registered, all tasks and hence the 4219 * newly registered, all tasks and hence the
4216 * init_css_set is in the subsystem's root cgroup. */ 4220 * init_css_set is in the subsystem's root cgroup. */
4217 init_css_set.subsys[ss->id] = css; 4221 init_css_set.subsys[ss->id] = css;
4218 4222
4219 need_forkexit_callback |= ss->fork || ss->exit; 4223 need_forkexit_callback |= ss->fork || ss->exit;
4220 4224
4221 /* At system boot, before all subsystems have been 4225 /* At system boot, before all subsystems have been
4222 * registered, no tasks have been forked, so we don't 4226 * registered, no tasks have been forked, so we don't
4223 * need to invoke fork callbacks here. */ 4227 * need to invoke fork callbacks here. */
4224 BUG_ON(!list_empty(&init_task.tasks)); 4228 BUG_ON(!list_empty(&init_task.tasks));
4225 4229
4226 BUG_ON(online_css(css)); 4230 BUG_ON(online_css(css));
4227 4231
4228 cgrp_dfl_root.subsys_mask |= 1 << ss->id; 4232 cgrp_dfl_root.subsys_mask |= 1 << ss->id;
4229 4233
4230 mutex_unlock(&cgroup_mutex); 4234 mutex_unlock(&cgroup_mutex);
4231 mutex_unlock(&cgroup_tree_mutex); 4235 mutex_unlock(&cgroup_tree_mutex);
4232 } 4236 }
4233 4237
4234 /** 4238 /**
4235 * cgroup_init_early - cgroup initialization at system boot 4239 * cgroup_init_early - cgroup initialization at system boot
4236 * 4240 *
4237 * Initialize cgroups at system boot, and initialize any 4241 * Initialize cgroups at system boot, and initialize any
4238 * subsystems that request early init. 4242 * subsystems that request early init.
4239 */ 4243 */
4240 int __init cgroup_init_early(void) 4244 int __init cgroup_init_early(void)
4241 { 4245 {
4242 static struct cgroup_sb_opts __initdata opts = 4246 static struct cgroup_sb_opts __initdata opts =
4243 { .flags = CGRP_ROOT_SANE_BEHAVIOR }; 4247 { .flags = CGRP_ROOT_SANE_BEHAVIOR };
4244 struct cgroup_subsys *ss; 4248 struct cgroup_subsys *ss;
4245 int i; 4249 int i;
4246 4250
4247 init_cgroup_root(&cgrp_dfl_root, &opts); 4251 init_cgroup_root(&cgrp_dfl_root, &opts);
4248 RCU_INIT_POINTER(init_task.cgroups, &init_css_set); 4252 RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
4249 4253
4250 for_each_subsys(ss, i) { 4254 for_each_subsys(ss, i) {
4251 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, 4255 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
4252 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n", 4256 "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
4253 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, 4257 i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
4254 ss->id, ss->name); 4258 ss->id, ss->name);
4255 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, 4259 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
4256 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); 4260 "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
4257 4261
4258 ss->id = i; 4262 ss->id = i;
4259 ss->name = cgroup_subsys_name[i]; 4263 ss->name = cgroup_subsys_name[i];
4260 4264
4261 if (ss->early_init) 4265 if (ss->early_init)
4262 cgroup_init_subsys(ss); 4266 cgroup_init_subsys(ss);
4263 } 4267 }
4264 return 0; 4268 return 0;
4265 } 4269 }
4266 4270
4267 /** 4271 /**
4268 * cgroup_init - cgroup initialization 4272 * cgroup_init - cgroup initialization
4269 * 4273 *
4270 * Register cgroup filesystem and /proc file, and initialize 4274 * Register cgroup filesystem and /proc file, and initialize
4271 * any subsystems that didn't request early init. 4275 * any subsystems that didn't request early init.
4272 */ 4276 */
4273 int __init cgroup_init(void) 4277 int __init cgroup_init(void)
4274 { 4278 {
4275 struct cgroup_subsys *ss; 4279 struct cgroup_subsys *ss;
4276 unsigned long key; 4280 unsigned long key;
4277 int ssid, err; 4281 int ssid, err;
4278 4282
4279 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); 4283 BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
4280 4284
4281 mutex_lock(&cgroup_tree_mutex); 4285 mutex_lock(&cgroup_tree_mutex);
4282 mutex_lock(&cgroup_mutex); 4286 mutex_lock(&cgroup_mutex);
4283 4287
4284 /* Add init_css_set to the hash table */ 4288 /* Add init_css_set to the hash table */
4285 key = css_set_hash(init_css_set.subsys); 4289 key = css_set_hash(init_css_set.subsys);
4286 hash_add(css_set_table, &init_css_set.hlist, key); 4290 hash_add(css_set_table, &init_css_set.hlist, key);
4287 4291
4288 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); 4292 BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
4289 4293
4290 mutex_unlock(&cgroup_mutex); 4294 mutex_unlock(&cgroup_mutex);
4291 mutex_unlock(&cgroup_tree_mutex); 4295 mutex_unlock(&cgroup_tree_mutex);
4292 4296
4293 for_each_subsys(ss, ssid) { 4297 for_each_subsys(ss, ssid) {
4294 if (!ss->early_init) 4298 if (!ss->early_init)
4295 cgroup_init_subsys(ss); 4299 cgroup_init_subsys(ss);
4296 4300
4297 list_add_tail(&init_css_set.e_cset_node[ssid], 4301 list_add_tail(&init_css_set.e_cset_node[ssid],
4298 &cgrp_dfl_root.cgrp.e_csets[ssid]); 4302 &cgrp_dfl_root.cgrp.e_csets[ssid]);
4299 4303
4300 /* 4304 /*
4301 * cftype registration needs kmalloc and can't be done 4305 * cftype registration needs kmalloc and can't be done
4302 * during early_init. Register base cftypes separately. 4306 * during early_init. Register base cftypes separately.
4303 */ 4307 */
4304 if (ss->base_cftypes) 4308 if (ss->base_cftypes)
4305 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); 4309 WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
4306 } 4310 }
4307 4311
4308 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4312 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
4309 if (!cgroup_kobj) 4313 if (!cgroup_kobj)
4310 return -ENOMEM; 4314 return -ENOMEM;
4311 4315
4312 err = register_filesystem(&cgroup_fs_type); 4316 err = register_filesystem(&cgroup_fs_type);
4313 if (err < 0) { 4317 if (err < 0) {
4314 kobject_put(cgroup_kobj); 4318 kobject_put(cgroup_kobj);
4315 return err; 4319 return err;
4316 } 4320 }
4317 4321
4318 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); 4322 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
4319 return 0; 4323 return 0;
4320 } 4324 }
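Once cgroup_init() has registered the filesystem, hierarchies are instantiated by mounting "cgroup" with the wanted subsystems as mount options. A runnable sketch (mount point and controller choice are assumptions; requires CAP_SYS_ADMIN):

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
        /* attach the "cpu" controller to a hierarchy at this path */
        if (mount("cgroup", "/sys/fs/cgroup/cpu", "cgroup", 0, "cpu") != 0) {
            perror("mount");
            return 1;
        }
        return 0;
    }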
4321 4325
4322 static int __init cgroup_wq_init(void) 4326 static int __init cgroup_wq_init(void)
4323 { 4327 {
4324 /* 4328 /*
4325 * There isn't much point in executing destruction path in 4329 * There isn't much point in executing destruction path in
4326 * parallel. Good chunk is serialized with cgroup_mutex anyway. 4330 * parallel. Good chunk is serialized with cgroup_mutex anyway.
4327 * Use 1 for @max_active. 4331 * Use 1 for @max_active.
4328 * 4332 *
4329 * We would prefer to do this in cgroup_init() above, but that 4333 * We would prefer to do this in cgroup_init() above, but that
4330 * is called before init_workqueues(): so leave this until after. 4334 * is called before init_workqueues(): so leave this until after.
4331 */ 4335 */
4332 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); 4336 cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
4333 BUG_ON(!cgroup_destroy_wq); 4337 BUG_ON(!cgroup_destroy_wq);
4334 4338
4335 /* 4339 /*
4336 * Used to destroy pidlists and separate to serve as flush domain. 4340 * Used to destroy pidlists and separate to serve as flush domain.
4337 * Cap @max_active to 1 too. 4341 * Cap @max_active to 1 too.
4338 */ 4342 */
4339 cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", 4343 cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
4340 0, 1); 4344 0, 1);
4341 BUG_ON(!cgroup_pidlist_destroy_wq); 4345 BUG_ON(!cgroup_pidlist_destroy_wq);
4342 4346
4343 return 0; 4347 return 0;
4344 } 4348 }
4345 core_initcall(cgroup_wq_init); 4349 core_initcall(cgroup_wq_init);
4346 4350
4347 /* 4351 /*
4348 * proc_cgroup_show() 4352 * proc_cgroup_show()
4349 * - Print task's cgroup paths into seq_file, one line for each hierarchy 4353 * - Print task's cgroup paths into seq_file, one line for each hierarchy
4350 * - Used for /proc/<pid>/cgroup. 4354 * - Used for /proc/<pid>/cgroup.
4351 */ 4355 */
4352 4356
4353 /* TODO: Use a proper seq_file iterator */ 4357 /* TODO: Use a proper seq_file iterator */
4354 int proc_cgroup_show(struct seq_file *m, void *v) 4358 int proc_cgroup_show(struct seq_file *m, void *v)
4355 { 4359 {
4356 struct pid *pid; 4360 struct pid *pid;
4357 struct task_struct *tsk; 4361 struct task_struct *tsk;
4358 char *buf, *path; 4362 char *buf, *path;
4359 int retval; 4363 int retval;
4360 struct cgroup_root *root; 4364 struct cgroup_root *root;
4361 4365
4362 retval = -ENOMEM; 4366 retval = -ENOMEM;
4363 buf = kmalloc(PATH_MAX, GFP_KERNEL); 4367 buf = kmalloc(PATH_MAX, GFP_KERNEL);
4364 if (!buf) 4368 if (!buf)
4365 goto out; 4369 goto out;
4366 4370
4367 retval = -ESRCH; 4371 retval = -ESRCH;
4368 pid = m->private; 4372 pid = m->private;
4369 tsk = get_pid_task(pid, PIDTYPE_PID); 4373 tsk = get_pid_task(pid, PIDTYPE_PID);
4370 if (!tsk) 4374 if (!tsk)
4371 goto out_free; 4375 goto out_free;
4372 4376
4373 retval = 0; 4377 retval = 0;
4374 4378
4375 mutex_lock(&cgroup_mutex); 4379 mutex_lock(&cgroup_mutex);
4376 down_read(&css_set_rwsem); 4380 down_read(&css_set_rwsem);
4377 4381
4378 for_each_root(root) { 4382 for_each_root(root) {
4379 struct cgroup_subsys *ss; 4383 struct cgroup_subsys *ss;
4380 struct cgroup *cgrp; 4384 struct cgroup *cgrp;
4381 int ssid, count = 0; 4385 int ssid, count = 0;
4382 4386
4383 if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible) 4387 if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
4384 continue; 4388 continue;
4385 4389
4386 seq_printf(m, "%d:", root->hierarchy_id); 4390 seq_printf(m, "%d:", root->hierarchy_id);
4387 for_each_subsys(ss, ssid) 4391 for_each_subsys(ss, ssid)
4388 if (root->subsys_mask & (1 << ssid)) 4392 if (root->subsys_mask & (1 << ssid))
4389 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4393 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
4390 if (strlen(root->name)) 4394 if (strlen(root->name))
4391 seq_printf(m, "%sname=%s", count ? "," : "", 4395 seq_printf(m, "%sname=%s", count ? "," : "",
4392 root->name); 4396 root->name);
4393 seq_putc(m, ':'); 4397 seq_putc(m, ':');
4394 cgrp = task_cgroup_from_root(tsk, root); 4398 cgrp = task_cgroup_from_root(tsk, root);
4395 path = cgroup_path(cgrp, buf, PATH_MAX); 4399 path = cgroup_path(cgrp, buf, PATH_MAX);
4396 if (!path) { 4400 if (!path) {
4397 retval = -ENAMETOOLONG; 4401 retval = -ENAMETOOLONG;
4398 goto out_unlock; 4402 goto out_unlock;
4399 } 4403 }
4400 seq_puts(m, path); 4404 seq_puts(m, path);
4401 seq_putc(m, '\n'); 4405 seq_putc(m, '\n');
4402 } 4406 }
4403 4407
4404 out_unlock: 4408 out_unlock:
4405 up_read(&css_set_rwsem); 4409 up_read(&css_set_rwsem);
4406 mutex_unlock(&cgroup_mutex); 4410 mutex_unlock(&cgroup_mutex);
4407 put_task_struct(tsk); 4411 put_task_struct(tsk);
4408 out_free: 4412 out_free:
4409 kfree(buf); 4413 kfree(buf);
4410 out: 4414 out:
4411 return retval; 4415 return retval;
4412 } 4416 }
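The output format is one line per hierarchy, "hierarchy-id:subsystem-list:path". A small runnable reader for the calling task:

    #include <stdio.h>

    int main(void)
    {
        char line[4096];
        FILE *f = fopen("/proc/self/cgroup", "r");

        if (!f) {
            perror("fopen");
            return 1;
        }
        /* each line is "id:comma-separated-subsystems[,name=...]:path",
         * exactly as proc_cgroup_show() prints it above */
        while (fgets(line, sizeof(line), f))
            fputs(line, stdout);
        fclose(f);
        return 0;
    }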
4413 4417
4414 /* Display information about each subsystem and each hierarchy */ 4418 /* Display information about each subsystem and each hierarchy */
4415 static int proc_cgroupstats_show(struct seq_file *m, void *v) 4419 static int proc_cgroupstats_show(struct seq_file *m, void *v)
4416 { 4420 {
4417 struct cgroup_subsys *ss; 4421 struct cgroup_subsys *ss;
4418 int i; 4422 int i;
4419 4423
4420 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 4424 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
4421 /* 4425 /*
4422 * ideally we don't want subsystems moving around while we do this. 4426 * ideally we don't want subsystems moving around while we do this.
4423 * cgroup_mutex is also necessary to guarantee an atomic snapshot of 4427 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
4424 * subsys/hierarchy state. 4428 * subsys/hierarchy state.
4425 */ 4429 */
4426 mutex_lock(&cgroup_mutex); 4430 mutex_lock(&cgroup_mutex);
4427 4431
4428 for_each_subsys(ss, i) 4432 for_each_subsys(ss, i)
4429 seq_printf(m, "%s\t%d\t%d\t%d\n", 4433 seq_printf(m, "%s\t%d\t%d\t%d\n",
4430 ss->name, ss->root->hierarchy_id, 4434 ss->name, ss->root->hierarchy_id,
4431 atomic_read(&ss->root->nr_cgrps), !ss->disabled); 4435 atomic_read(&ss->root->nr_cgrps), !ss->disabled);
4432 4436
4433 mutex_unlock(&cgroup_mutex); 4437 mutex_unlock(&cgroup_mutex);
4434 return 0; 4438 return 0;
4435 } 4439 }
4436 4440
4437 static int cgroupstats_open(struct inode *inode, struct file *file) 4441 static int cgroupstats_open(struct inode *inode, struct file *file)
4438 { 4442 {
4439 return single_open(file, proc_cgroupstats_show, NULL); 4443 return single_open(file, proc_cgroupstats_show, NULL);
4440 } 4444 }
4441 4445
4442 static const struct file_operations proc_cgroupstats_operations = { 4446 static const struct file_operations proc_cgroupstats_operations = {
4443 .open = cgroupstats_open, 4447 .open = cgroupstats_open,
4444 .read = seq_read, 4448 .read = seq_read,
4445 .llseek = seq_lseek, 4449 .llseek = seq_lseek,
4446 .release = single_release, 4450 .release = single_release,
4447 }; 4451 };
4448 4452
4449 /** 4453 /**
4450 * cgroup_fork - initialize cgroup related fields during copy_process() 4454 * cgroup_fork - initialize cgroup related fields during copy_process()
4451 * @child: pointer to task_struct of the newly forked child process. 4455 * @child: pointer to task_struct of the newly forked child process.
4452 * 4456 *
4453 * A task is associated with the init_css_set until cgroup_post_fork() 4457 * A task is associated with the init_css_set until cgroup_post_fork()
4454 * attaches it to the parent's css_set. Empty cg_list indicates that 4458 * attaches it to the parent's css_set. Empty cg_list indicates that
4455 * @child isn't holding a reference to its css_set. 4459 * @child isn't holding a reference to its css_set.
4456 */ 4460 */
4457 void cgroup_fork(struct task_struct *child) 4461 void cgroup_fork(struct task_struct *child)
4458 { 4462 {
4459 RCU_INIT_POINTER(child->cgroups, &init_css_set); 4463 RCU_INIT_POINTER(child->cgroups, &init_css_set);
4460 INIT_LIST_HEAD(&child->cg_list); 4464 INIT_LIST_HEAD(&child->cg_list);
4461 } 4465 }
4462 4466
4463 /** 4467 /**
4464 * cgroup_post_fork - called on a new task after adding it to the task list 4468 * cgroup_post_fork - called on a new task after adding it to the task list
4465 * @child: the task in question 4469 * @child: the task in question
4466 * 4470 *
4467 * Adds the task to the list running through its css_set if necessary and 4471 * Adds the task to the list running through its css_set if necessary and
4468 * calls the subsystem fork() callbacks. Has to be after the task is 4472 * calls the subsystem fork() callbacks. Has to be after the task is
4469 * visible on the task list in case we race with the first call to 4473 * visible on the task list in case we race with the first call to
4470 * cgroup_task_iter_start() - to guarantee that the new task ends up on its 4474 * cgroup_task_iter_start() - to guarantee that the new task ends up on its
4471 * list. 4475 * list.
4472 */ 4476 */
4473 void cgroup_post_fork(struct task_struct *child) 4477 void cgroup_post_fork(struct task_struct *child)
4474 { 4478 {
4475 struct cgroup_subsys *ss; 4479 struct cgroup_subsys *ss;
4476 int i; 4480 int i;
4477 4481
4478 /* 4482 /*
4479 * This may race against cgroup_enable_task_cg_lists(). As that 4483 * This may race against cgroup_enable_task_cg_lists(). As that
4480 * function sets use_task_css_set_links before grabbing 4484 * function sets use_task_css_set_links before grabbing
4481 * tasklist_lock and we just went through tasklist_lock to add 4485 * tasklist_lock and we just went through tasklist_lock to add
4482 * @child, it's guaranteed that either we see the set 4486 * @child, it's guaranteed that either we see the set
4483 * use_task_css_set_links or cgroup_enable_task_cg_lists() sees 4487 * use_task_css_set_links or cgroup_enable_task_cg_lists() sees
4484 * @child during its iteration. 4488 * @child during its iteration.
4485 * 4489 *
4486 * If we won the race, @child is associated with %current's 4490 * If we won the race, @child is associated with %current's
4487 * css_set. Grabbing css_set_rwsem guarantees both that the 4491 * css_set. Grabbing css_set_rwsem guarantees both that the
4488 * association is stable, and, on completion of the parent's 4492 * association is stable, and, on completion of the parent's
4489 * migration, @child is visible in the source of migration or 4493 * migration, @child is visible in the source of migration or
4490 * already in the destination cgroup. This guarantee is necessary 4494 * already in the destination cgroup. This guarantee is necessary
4491 * when implementing operations which need to migrate all tasks of 4495 * when implementing operations which need to migrate all tasks of
4492 * a cgroup to another. 4496 * a cgroup to another.
4493 * 4497 *
4494 * Note that if we lose to cgroup_enable_task_cg_lists(), @child 4498 * Note that if we lose to cgroup_enable_task_cg_lists(), @child
4495 * will remain in init_css_set. This is safe because all tasks are 4499 * will remain in init_css_set. This is safe because all tasks are
4496 * in the init_css_set before cg_links is enabled and there's no 4500 * in the init_css_set before cg_links is enabled and there's no
4497 * operation which transfers all tasks out of init_css_set. 4501 * operation which transfers all tasks out of init_css_set.
4498 */ 4502 */
4499 if (use_task_css_set_links) { 4503 if (use_task_css_set_links) {
4500 struct css_set *cset; 4504 struct css_set *cset;
4501 4505
4502 down_write(&css_set_rwsem); 4506 down_write(&css_set_rwsem);
4503 cset = task_css_set(current); 4507 cset = task_css_set(current);
4504 if (list_empty(&child->cg_list)) { 4508 if (list_empty(&child->cg_list)) {
4505 rcu_assign_pointer(child->cgroups, cset); 4509 rcu_assign_pointer(child->cgroups, cset);
4506 list_add(&child->cg_list, &cset->tasks); 4510 list_add(&child->cg_list, &cset->tasks);
4507 get_css_set(cset); 4511 get_css_set(cset);
4508 } 4512 }
4509 up_write(&css_set_rwsem); 4513 up_write(&css_set_rwsem);
4510 } 4514 }
4511 4515
4512 /* 4516 /*
4513 * Call ss->fork(). This must happen after @child is linked on 4517 * Call ss->fork(). This must happen after @child is linked on
4514 * css_set; otherwise, @child might change state between ->fork() 4518 * css_set; otherwise, @child might change state between ->fork()
4515 * and addition to css_set. 4519 * and addition to css_set.
4516 */ 4520 */
4517 if (need_forkexit_callback) { 4521 if (need_forkexit_callback) {
4518 for_each_subsys(ss, i) 4522 for_each_subsys(ss, i)
4519 if (ss->fork) 4523 if (ss->fork)
4520 ss->fork(child); 4524 ss->fork(child);
4521 } 4525 }
4522 } 4526 }
4523 4527
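The ->fork() callbacks invoked above are per-controller hooks with the signature implied by the call ss->fork(child). The snippet below is a hypothetical sketch (example_fork and example_cgrp_id are made-up names, not a real controller) of what such a handler might look like; it relies on the fact that @child is already linked on its css_set by the time ->fork() runs:

/*
 * Hypothetical ->fork() handler sketch.  "example_cgrp_id" is a
 * placeholder subsystem id; a real handler would do per-controller
 * bookkeeping for the new task here.
 */
static void example_fork(struct task_struct *task)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = task_css(task, example_cgrp_id);
	pr_debug("example: task %d forked into css %p\n",
		 task_pid_nr(task), css);
	rcu_read_unlock();
}
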
4524 /** 4528 /**
4525 * cgroup_exit - detach cgroup from exiting task 4529 * cgroup_exit - detach cgroup from exiting task
4526 * @tsk: pointer to task_struct of exiting process 4530 * @tsk: pointer to task_struct of exiting process
4527 * 4531 *
4528 * Description: Detach cgroup from @tsk and release it. 4532 * Description: Detach cgroup from @tsk and release it.
4529 * 4533 *
4530 * Note that cgroups marked notify_on_release force every task in 4534 * Note that cgroups marked notify_on_release force every task in
4531 * them to take the global cgroup_mutex mutex when exiting. 4535 * them to take the global cgroup_mutex mutex when exiting.
4532 * This could impact scaling on very large systems. Be reluctant to 4536 * This could impact scaling on very large systems. Be reluctant to
4533 * use notify_on_release cgroups where very high task exit scaling 4537 * use notify_on_release cgroups where very high task exit scaling
4534 * is required on large systems. 4538 * is required on large systems.
4535 * 4539 *
4536 * We set the exiting task's cgroup to the root cgroup (top_cgroup). We 4540 * We set the exiting task's cgroup to the root cgroup (top_cgroup). We
4537 * call cgroup_exit() while the task is still competent to handle 4541 * call cgroup_exit() while the task is still competent to handle
4538 * notify_on_release(), then leave the task attached to the root cgroup in 4542 * notify_on_release(), then leave the task attached to the root cgroup in
4539 * each hierarchy for the remainder of its exit. No need to bother with 4543 * each hierarchy for the remainder of its exit. No need to bother with
4540 * init_css_set refcnting. init_css_set never goes away and we can't race 4544 * init_css_set refcnting. init_css_set never goes away and we can't race
4541 * with migration path - PF_EXITING is visible to migration path. 4545 * with migration path - PF_EXITING is visible to migration path.
4542 */ 4546 */
4543 void cgroup_exit(struct task_struct *tsk) 4547 void cgroup_exit(struct task_struct *tsk)
4544 { 4548 {
4545 struct cgroup_subsys *ss; 4549 struct cgroup_subsys *ss;
4546 struct css_set *cset; 4550 struct css_set *cset;
4547 bool put_cset = false; 4551 bool put_cset = false;
4548 int i; 4552 int i;
4549 4553
4550 /* 4554 /*
4551 * Unlink @tsk from its css_set. As migration path can't race 4555 * Unlink @tsk from its css_set. As migration path can't race
4552 * with us, we can check cg_list without grabbing css_set_rwsem. 4556 * with us, we can check cg_list without grabbing css_set_rwsem.
4553 */ 4557 */
4554 if (!list_empty(&tsk->cg_list)) { 4558 if (!list_empty(&tsk->cg_list)) {
4555 down_write(&css_set_rwsem); 4559 down_write(&css_set_rwsem);
4556 list_del_init(&tsk->cg_list); 4560 list_del_init(&tsk->cg_list);
4557 up_write(&css_set_rwsem); 4561 up_write(&css_set_rwsem);
4558 put_cset = true; 4562 put_cset = true;
4559 } 4563 }
4560 4564
4561 /* Reassign the task to the init_css_set. */ 4565 /* Reassign the task to the init_css_set. */
4562 cset = task_css_set(tsk); 4566 cset = task_css_set(tsk);
4563 RCU_INIT_POINTER(tsk->cgroups, &init_css_set); 4567 RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
4564 4568
4565 if (need_forkexit_callback) { 4569 if (need_forkexit_callback) {
4566 /* see cgroup_post_fork() for details */ 4570 /* see cgroup_post_fork() for details */
4567 for_each_subsys(ss, i) { 4571 for_each_subsys(ss, i) {
4568 if (ss->exit) { 4572 if (ss->exit) {
4569 struct cgroup_subsys_state *old_css = cset->subsys[i]; 4573 struct cgroup_subsys_state *old_css = cset->subsys[i];
4570 struct cgroup_subsys_state *css = task_css(tsk, i); 4574 struct cgroup_subsys_state *css = task_css(tsk, i);
4571 4575
4572 ss->exit(css, old_css, tsk); 4576 ss->exit(css, old_css, tsk);
4573 } 4577 }
4574 } 4578 }
4575 } 4579 }
4576 4580
4577 if (put_cset) 4581 if (put_cset)
4578 put_css_set(cset, true); 4582 put_css_set(cset, true);
4579 } 4583 }
4580 4584
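The ss->exit() callbacks above receive both the css the task is left attached to after the switch to init_css_set (@css) and the css it actually ran in (@old_css). A hypothetical sketch of a matching handler (again, "example" is not a real controller):

/*
 * Hypothetical ->exit() handler sketch matching the invocation in
 * cgroup_exit(): @old_css is the css the task belonged to while it
 * ran, @css is its css after the reassignment to init_css_set.
 */
static void example_exit(struct cgroup_subsys_state *css,
			 struct cgroup_subsys_state *old_css,
			 struct task_struct *task)
{
	/* uncharge or tear down per-task state against @old_css here */
	pr_debug("example: task %d exiting from css %p\n",
		 task_pid_nr(task), old_css);
}
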
4581 static void check_for_release(struct cgroup *cgrp) 4585 static void check_for_release(struct cgroup *cgrp)
4582 { 4586 {
4583 if (cgroup_is_releasable(cgrp) && 4587 if (cgroup_is_releasable(cgrp) &&
4584 list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) { 4588 list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
4585 /* 4589 /*
4586 * Control Group is currently removable. If it's not 4590 * Control Group is currently removable. If it's not
4587 * already queued for a userspace notification, queue 4591 * already queued for a userspace notification, queue
4588 * it now 4592 * it now
4589 */ 4593 */
4590 int need_schedule_work = 0; 4594 int need_schedule_work = 0;
4591 4595
4592 raw_spin_lock(&release_list_lock); 4596 raw_spin_lock(&release_list_lock);
4593 if (!cgroup_is_dead(cgrp) && 4597 if (!cgroup_is_dead(cgrp) &&
4594 list_empty(&cgrp->release_list)) { 4598 list_empty(&cgrp->release_list)) {
4595 list_add(&cgrp->release_list, &release_list); 4599 list_add(&cgrp->release_list, &release_list);
4596 need_schedule_work = 1; 4600 need_schedule_work = 1;
4597 } 4601 }
4598 raw_spin_unlock(&release_list_lock); 4602 raw_spin_unlock(&release_list_lock);
4599 if (need_schedule_work) 4603 if (need_schedule_work)
4600 schedule_work(&release_agent_work); 4604 schedule_work(&release_agent_work);
4601 } 4605 }
4602 } 4606 }
4603 4607
4604 /* 4608 /*
4605 * Notify userspace when a cgroup is released, by running the 4609 * Notify userspace when a cgroup is released, by running the
4606 * configured release agent with the name of the cgroup (path 4610 * configured release agent with the name of the cgroup (path
4607 * relative to the root of cgroup file system) as the argument. 4611 * relative to the root of cgroup file system) as the argument.
4608 * 4612 *
4609 * Most likely, this user command will try to rmdir this cgroup. 4613 * Most likely, this user command will try to rmdir this cgroup.
4610 * 4614 *
4611 * This races with the possibility that some other task will be 4615 * This races with the possibility that some other task will be
4612 * attached to this cgroup before it is removed, or that some other 4616 * attached to this cgroup before it is removed, or that some other
4613 * user task will 'mkdir' a child cgroup of this cgroup. That's ok. 4617 * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
4614 * The presumed 'rmdir' will fail quietly if this cgroup is no longer 4618 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
4615 * unused, and this cgroup will be reprieved from its death sentence, 4619 * unused, and this cgroup will be reprieved from its death sentence,
4616 * to continue to serve a useful existence. Next time it's released, 4620 * to continue to serve a useful existence. Next time it's released,
4617 * we will get notified again, if it still has 'notify_on_release' set. 4621 * we will get notified again, if it still has 'notify_on_release' set.
4618 * 4622 *
4619 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which 4623 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
4620 * means only wait until the task is successfully execve()'d. The 4624 * means only wait until the task is successfully execve()'d. The
4621 * separate release agent task is forked by call_usermodehelper(), 4625 * separate release agent task is forked by call_usermodehelper(),
4622 * then control in this thread returns here, without waiting for the 4626 * then control in this thread returns here, without waiting for the
4623 * release agent task. We don't bother to wait because the caller of 4627 * release agent task. We don't bother to wait because the caller of
4624 * this routine has no use for the exit status of the release agent 4628 * this routine has no use for the exit status of the release agent
4625 * task, so no sense holding our caller up for that. 4629 * task, so no sense holding our caller up for that.
4626 */ 4630 */
4627 static void cgroup_release_agent(struct work_struct *work) 4631 static void cgroup_release_agent(struct work_struct *work)
4628 { 4632 {
4629 BUG_ON(work != &release_agent_work); 4633 BUG_ON(work != &release_agent_work);
4630 mutex_lock(&cgroup_mutex); 4634 mutex_lock(&cgroup_mutex);
4631 raw_spin_lock(&release_list_lock); 4635 raw_spin_lock(&release_list_lock);
4632 while (!list_empty(&release_list)) { 4636 while (!list_empty(&release_list)) {
4633 char *argv[3], *envp[3]; 4637 char *argv[3], *envp[3];
4634 int i; 4638 int i;
4635 char *pathbuf = NULL, *agentbuf = NULL, *path; 4639 char *pathbuf = NULL, *agentbuf = NULL, *path;
4636 struct cgroup *cgrp = list_entry(release_list.next, 4640 struct cgroup *cgrp = list_entry(release_list.next,
4637 struct cgroup, 4641 struct cgroup,
4638 release_list); 4642 release_list);
4639 list_del_init(&cgrp->release_list); 4643 list_del_init(&cgrp->release_list);
4640 raw_spin_unlock(&release_list_lock); 4644 raw_spin_unlock(&release_list_lock);
4641 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); 4645 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
4642 if (!pathbuf) 4646 if (!pathbuf)
4643 goto continue_free; 4647 goto continue_free;
4644 path = cgroup_path(cgrp, pathbuf, PATH_MAX); 4648 path = cgroup_path(cgrp, pathbuf, PATH_MAX);
4645 if (!path) 4649 if (!path)
4646 goto continue_free; 4650 goto continue_free;
4647 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); 4651 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
4648 if (!agentbuf) 4652 if (!agentbuf)
4649 goto continue_free; 4653 goto continue_free;
4650 4654
4651 i = 0; 4655 i = 0;
4652 argv[i++] = agentbuf; 4656 argv[i++] = agentbuf;
4653 argv[i++] = path; 4657 argv[i++] = path;
4654 argv[i] = NULL; 4658 argv[i] = NULL;
4655 4659
4656 i = 0; 4660 i = 0;
4657 /* minimal command environment */ 4661 /* minimal command environment */
4658 envp[i++] = "HOME=/"; 4662 envp[i++] = "HOME=/";
4659 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; 4663 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
4660 envp[i] = NULL; 4664 envp[i] = NULL;
4661 4665
4662 /* Drop the lock while we invoke the usermode helper, 4666 /* Drop the lock while we invoke the usermode helper,
4663 * since the exec could involve hitting disk and hence 4667 * since the exec could involve hitting disk and hence
4664 * be a slow process */ 4668 * be a slow process */
4665 mutex_unlock(&cgroup_mutex); 4669 mutex_unlock(&cgroup_mutex);
4666 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); 4670 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
4667 mutex_lock(&cgroup_mutex); 4671 mutex_lock(&cgroup_mutex);
4668 continue_free: 4672 continue_free:
4669 kfree(pathbuf); 4673 kfree(pathbuf);
4670 kfree(agentbuf); 4674 kfree(agentbuf);
4671 raw_spin_lock(&release_list_lock); 4675 raw_spin_lock(&release_list_lock);
4672 } 4676 }
4673 raw_spin_unlock(&release_list_lock); 4677 raw_spin_unlock(&release_list_lock);
4674 mutex_unlock(&cgroup_mutex); 4678 mutex_unlock(&cgroup_mutex);
4675 } 4679 }
4676 4680
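As the comment above cgroup_release_agent() describes, the configured agent is exec'd with the released cgroup's path (relative to the hierarchy root, beginning with '/') as its single argument and only a minimal HOME/PATH environment. Below is a minimal userspace sketch of such an agent; the mount point is an assumption for illustration only, since the kernel does not pass it:

/*
 * Hypothetical release agent.  HIER_ROOT is a made-up mount point; a
 * real agent must know where its own hierarchy is mounted.  The rmdir
 * simply fails if the cgroup has been repopulated in the meantime,
 * which is the "reprieve" case described above.
 */
#include <errno.h>
#include <stdio.h>
#include <unistd.h>

#define HIER_ROOT "/sys/fs/cgroup/example"

int main(int argc, char **argv)
{
	char path[4096];

	if (argc < 2)
		return 1;

	/* argv[1] is the cgroup path from the hierarchy root, e.g. "/foo/bar" */
	snprintf(path, sizeof(path), "%s%s", HIER_ROOT, argv[1]);

	if (rmdir(path) && errno != ENOENT)
		perror("release_agent: rmdir");

	return 0;
}
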
4677 static int __init cgroup_disable(char *str) 4681 static int __init cgroup_disable(char *str)
4678 { 4682 {
4679 struct cgroup_subsys *ss; 4683 struct cgroup_subsys *ss;
4680 char *token; 4684 char *token;
4681 int i; 4685 int i;
4682 4686
4683 while ((token = strsep(&str, ",")) != NULL) { 4687 while ((token = strsep(&str, ",")) != NULL) {
4684 if (!*token) 4688 if (!*token)
4685 continue; 4689 continue;
4686 4690
4687 for_each_subsys(ss, i) { 4691 for_each_subsys(ss, i) {
4688 if (!strcmp(token, ss->name)) { 4692 if (!strcmp(token, ss->name)) {
4689 ss->disabled = 1; 4693 ss->disabled = 1;
4690 printk(KERN_INFO "Disabling %s control group" 4694 printk(KERN_INFO "Disabling %s control group"
4691 " subsystem\n", ss->name); 4695 " subsystem\n", ss->name);
4692 break; 4696 break;
4693 } 4697 }
4694 } 4698 }
4695 } 4699 }
4696 return 1; 4700 return 1;
4697 } 4701 }
4698 __setup("cgroup_disable=", cgroup_disable); 4702 __setup("cgroup_disable=", cgroup_disable);
4699 4703
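The handler above is wired to the cgroup_disable= boot parameter, which takes a comma-separated list of controller names matching ss->name. For instance, booting with

	cgroup_disable=memory

marks the memory controller disabled and logs "Disabling memory control group subsystem" during early init, per the printk in cgroup_disable() above.
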
4700 /** 4704 /**
4701 * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir 4705 * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir
4702 * @dentry: directory dentry of interest 4706 * @dentry: directory dentry of interest
4703 * @ss: subsystem of interest 4707 * @ss: subsystem of interest
4704 * 4708 *
4705 * If @dentry is a directory for a cgroup which has @ss enabled on it, try 4709 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
4706 * to get the corresponding css and return it. If such a css doesn't exist 4710 * to get the corresponding css and return it. If such a css doesn't exist
4707 * or can't be pinned, an ERR_PTR value is returned. 4711 * or can't be pinned, an ERR_PTR value is returned.
4708 */ 4712 */
4709 struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, 4713 struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
4710 struct cgroup_subsys *ss) 4714 struct cgroup_subsys *ss)
4711 { 4715 {
4712 struct kernfs_node *kn = kernfs_node_from_dentry(dentry); 4716 struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
4713 struct cgroup_subsys_state *css = NULL; 4717 struct cgroup_subsys_state *css = NULL;
4714 struct cgroup *cgrp; 4718 struct cgroup *cgrp;
4715 4719
4716 /* is @dentry a cgroup dir? */ 4720 /* is @dentry a cgroup dir? */
4717 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || 4721 if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
4718 kernfs_type(kn) != KERNFS_DIR) 4722 kernfs_type(kn) != KERNFS_DIR)
4719 return ERR_PTR(-EBADF); 4723 return ERR_PTR(-EBADF);
4720 4724
4721 rcu_read_lock(); 4725 rcu_read_lock();
4722 4726
4723 /* 4727 /*
4724 * This path doesn't originate from kernfs and @kn could already 4728 * This path doesn't originate from kernfs and @kn could already
4725 * have been or be removed at any point. @kn->priv is RCU 4729 * have been or be removed at any point. @kn->priv is RCU
4726 * protected for this access. See destroy_locked() for details. 4730 * protected for this access. See destroy_locked() for details.
4727 */ 4731 */
4728 cgrp = rcu_dereference(kn->priv); 4732 cgrp = rcu_dereference(kn->priv);
4729 if (cgrp) 4733 if (cgrp)
4730 css = cgroup_css(cgrp, ss); 4734 css = cgroup_css(cgrp, ss);
4731 4735
4732 if (!css || !css_tryget(css)) 4736 if (!css || !css_tryget(css))
4733 css = ERR_PTR(-ENOENT); 4737 css = ERR_PTR(-ENOENT);
4734 4738
4735 rcu_read_unlock(); 4739 rcu_read_unlock();
4736 return css; 4740 return css;
4737 } 4741 }
4738 4742
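A hypothetical in-kernel caller might resolve an open cgroup directory file descriptor to the css of a particular subsystem along these lines (example_cgrp_subsys is a placeholder and error handling is abbreviated; this is a sketch, not an existing user):

/*
 * Hypothetical caller sketch for css_tryget_from_dir().  On success the
 * caller owns a reference to the returned css and must drop it with
 * css_put() when done.
 */
static struct cgroup_subsys_state *example_css_from_fd(int fd)
{
	struct cgroup_subsys_state *css;
	struct fd f = fdget(fd);

	if (!f.file)
		return ERR_PTR(-EBADF);

	css = css_tryget_from_dir(f.file->f_path.dentry, &example_cgrp_subsys);
	fdput(f);
	return css;
}
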
4739 /** 4743 /**
4740 * css_from_id - lookup css by id 4744 * css_from_id - lookup css by id
4741 * @id: the cgroup id 4745 * @id: the cgroup id
4742 * @ss: cgroup subsys to be looked into 4746 * @ss: cgroup subsys to be looked into
4743 * 4747 *
4744 * Returns the css if there's a valid one with @id, otherwise returns NULL. 4748 * Returns the css if there's a valid one with @id, otherwise returns NULL.
4745 * Should be called under rcu_read_lock(). 4749 * Should be called under rcu_read_lock().
4746 */ 4750 */
4747 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) 4751 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
4748 { 4752 {
4749 struct cgroup *cgrp; 4753 struct cgroup *cgrp;
4750 4754
4751 cgroup_assert_mutexes_or_rcu_locked(); 4755 cgroup_assert_mutexes_or_rcu_locked();
4752 4756
4753 cgrp = idr_find(&ss->root->cgroup_idr, id); 4757 cgrp = idr_find(&ss->root->cgroup_idr, id);
4754 if (cgrp) 4758 if (cgrp)
4755 return cgroup_css(cgrp, ss); 4759 return cgroup_css(cgrp, ss);
4756 return NULL; 4760 return NULL;
4757 } 4761 }
4758 4762
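As the comment notes, css_from_id() must be called under rcu_read_lock() and the returned css is not pinned; a hypothetical lookup helper might pair it with css_tryget(), much like css_tryget_from_dir() above (example_pin_css_by_id and example_cgrp_subsys are placeholder names):

/*
 * Hypothetical lookup sketch: find a css by id under RCU and pin it.
 * Returns NULL if no live css with @id exists for the placeholder
 * subsystem example_cgrp_subsys.
 */
static struct cgroup_subsys_state *example_pin_css_by_id(int id)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	css = css_from_id(id, &example_cgrp_subsys);
	if (css && !css_tryget(css))
		css = NULL;
	rcu_read_unlock();

	return css;
}
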
4759 #ifdef CONFIG_CGROUP_DEBUG 4763 #ifdef CONFIG_CGROUP_DEBUG
4760 static struct cgroup_subsys_state * 4764 static struct cgroup_subsys_state *
4761 debug_css_alloc(struct cgroup_subsys_state *parent_css) 4765 debug_css_alloc(struct cgroup_subsys_state *parent_css)
4762 { 4766 {
4763 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 4767 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
4764 4768
4765 if (!css) 4769 if (!css)
4766 return ERR_PTR(-ENOMEM); 4770 return ERR_PTR(-ENOMEM);
4767 4771
4768 return css; 4772 return css;
4769 } 4773 }
4770 4774
4771 static void debug_css_free(struct cgroup_subsys_state *css) 4775 static void debug_css_free(struct cgroup_subsys_state *css)
4772 { 4776 {
4773 kfree(css); 4777 kfree(css);
4774 } 4778 }
4775 4779
4776 static u64 debug_taskcount_read(struct cgroup_subsys_state *css, 4780 static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
4777 struct cftype *cft) 4781 struct cftype *cft)
4778 { 4782 {
4779 return cgroup_task_count(css->cgroup); 4783 return cgroup_task_count(css->cgroup);
4780 } 4784 }
4781 4785
4782 static u64 current_css_set_read(struct cgroup_subsys_state *css, 4786 static u64 current_css_set_read(struct cgroup_subsys_state *css,
4783 struct cftype *cft) 4787 struct cftype *cft)
4784 { 4788 {
4785 return (u64)(unsigned long)current->cgroups; 4789 return (u64)(unsigned long)current->cgroups;
4786 } 4790 }
4787 4791
4788 static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, 4792 static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
4789 struct cftype *cft) 4793 struct cftype *cft)
4790 { 4794 {
4791 u64 count; 4795 u64 count;
4792 4796
4793 rcu_read_lock(); 4797 rcu_read_lock();
4794 count = atomic_read(&task_css_set(current)->refcount); 4798 count = atomic_read(&task_css_set(current)->refcount);
4795 rcu_read_unlock(); 4799 rcu_read_unlock();
4796 return count; 4800 return count;
4797 } 4801 }
4798 4802
4799 static int current_css_set_cg_links_read(struct seq_file *seq, void *v) 4803 static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
4800 { 4804 {
4801 struct cgrp_cset_link *link; 4805 struct cgrp_cset_link *link;
4802 struct css_set *cset; 4806 struct css_set *cset;
4803 char *name_buf; 4807 char *name_buf;
4804 4808
4805 name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); 4809 name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
4806 if (!name_buf) 4810 if (!name_buf)
4807 return -ENOMEM; 4811 return -ENOMEM;
4808 4812
4809 down_read(&css_set_rwsem); 4813 down_read(&css_set_rwsem);
4810 rcu_read_lock(); 4814 rcu_read_lock();
4811 cset = rcu_dereference(current->cgroups); 4815 cset = rcu_dereference(current->cgroups);
4812 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { 4816 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
4813 struct cgroup *c = link->cgrp; 4817 struct cgroup *c = link->cgrp;
4814 4818
4815 cgroup_name(c, name_buf, NAME_MAX + 1); 4819 cgroup_name(c, name_buf, NAME_MAX + 1);
4816 seq_printf(seq, "Root %d group %s\n", 4820 seq_printf(seq, "Root %d group %s\n",
4817 c->root->hierarchy_id, name_buf); 4821 c->root->hierarchy_id, name_buf);
4818 } 4822 }
4819 rcu_read_unlock(); 4823 rcu_read_unlock();
4820 up_read(&css_set_rwsem); 4824 up_read(&css_set_rwsem);
4821 kfree(name_buf); 4825 kfree(name_buf);
4822 return 0; 4826 return 0;
4823 } 4827 }
4824 4828
4825 #define MAX_TASKS_SHOWN_PER_CSS 25 4829 #define MAX_TASKS_SHOWN_PER_CSS 25
4826 static int cgroup_css_links_read(struct seq_file *seq, void *v) 4830 static int cgroup_css_links_read(struct seq_file *seq, void *v)
4827 { 4831 {
4828 struct cgroup_subsys_state *css = seq_css(seq); 4832 struct cgroup_subsys_state *css = seq_css(seq);
4829 struct cgrp_cset_link *link; 4833 struct cgrp_cset_link *link;
4830 4834
4831 down_read(&css_set_rwsem); 4835 down_read(&css_set_rwsem);
4832 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { 4836 list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
4833 struct css_set *cset = link->cset; 4837 struct css_set *cset = link->cset;
4834 struct task_struct *task; 4838 struct task_struct *task;
4835 int count = 0; 4839 int count = 0;
4836 4840
4837 seq_printf(seq, "css_set %p\n", cset); 4841 seq_printf(seq, "css_set %p\n", cset);
4838 4842
4839 list_for_each_entry(task, &cset->tasks, cg_list) { 4843 list_for_each_entry(task, &cset->tasks, cg_list) {
4840 if (count++ > MAX_TASKS_SHOWN_PER_CSS) 4844 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
4841 goto overflow; 4845 goto overflow;
4842 seq_printf(seq, " task %d\n", task_pid_vnr(task)); 4846 seq_printf(seq, " task %d\n", task_pid_vnr(task));
4843 } 4847 }
4844 4848
4845 list_for_each_entry(task, &cset->mg_tasks, cg_list) { 4849 list_for_each_entry(task, &cset->mg_tasks, cg_list) {
4846 if (count++ > MAX_TASKS_SHOWN_PER_CSS) 4850 if (count++ > MAX_TASKS_SHOWN_PER_CSS)
4847 goto overflow; 4851 goto overflow;
4848 seq_printf(seq, " task %d\n", task_pid_vnr(task)); 4852 seq_printf(seq, " task %d\n", task_pid_vnr(task));
4849 } 4853 }
4850 continue; 4854 continue;
4851 overflow: 4855 overflow:
4852 seq_puts(seq, " ...\n"); 4856 seq_puts(seq, " ...\n");
4853 } 4857 }
4854 up_read(&css_set_rwsem); 4858 up_read(&css_set_rwsem);
4855 return 0; 4859 return 0;
4856 } 4860 }
4857 4861
4858 static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) 4862 static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
4859 { 4863 {
4860 return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); 4864 return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
4861 } 4865 }
4862 4866
4863 static struct cftype debug_files[] = { 4867 static struct cftype debug_files[] = {
4864 { 4868 {
4865 .name = "taskcount", 4869 .name = "taskcount",
4866 .read_u64 = debug_taskcount_read, 4870 .read_u64 = debug_taskcount_read,
4867 }, 4871 },
4868 4872
4869 { 4873 {
4870 .name = "current_css_set", 4874 .name = "current_css_set",
4871 .read_u64 = current_css_set_read, 4875 .read_u64 = current_css_set_read,
4872 }, 4876 },
4873 4877
4874 { 4878 {
4875 .name = "current_css_set_refcount", 4879 .name = "current_css_set_refcount",
4876 .read_u64 = current_css_set_refcount_read, 4880 .read_u64 = current_css_set_refcount_read,
4877 }, 4881 },
4878 4882
4879 { 4883 {
4880 .name = "current_css_set_cg_links", 4884 .name = "current_css_set_cg_links",
4881 .seq_show = current_css_set_cg_links_read, 4885 .seq_show = current_css_set_cg_links_read,
4882 }, 4886 },
4883 4887
4884 { 4888 {
4885 .name = "cgroup_css_links", 4889 .name = "cgroup_css_links",
4886 .seq_show = cgroup_css_links_read, 4890 .seq_show = cgroup_css_links_read,
4887 }, 4891 },
4888 4892
4889 { 4893 {
4890 .name = "releasable", 4894 .name = "releasable",
4891 .read_u64 = releasable_read, 4895 .read_u64 = releasable_read,
4892 }, 4896 },
4893 4897
4894 { } /* terminate */ 4898 { } /* terminate */
4895 }; 4899 };
4896 4900
4897 struct cgroup_subsys debug_cgrp_subsys = { 4901 struct cgroup_subsys debug_cgrp_subsys = {
4898 .css_alloc = debug_css_alloc, 4902 .css_alloc = debug_css_alloc,
4899 .css_free = debug_css_free, 4903 .css_free = debug_css_free,
4900 .base_cftypes = debug_files, 4904 .base_cftypes = debug_files,
4901 }; 4905 };
4902 #endif /* CONFIG_CGROUP_DEBUG */ 4906 #endif /* CONFIG_CGROUP_DEBUG */
4903 4907