Commit 3116f0e3df0a67ad56f15dd4c5f6cefb04bb4a98

Authored by Paul Menage
Committed by Linus Torvalds
1 parent c27e8818a0

CGroup API files: move "releasable" to cgroup_debug subsystem

The "releasable" control file provided by the cgroup framework exports the
state of a per-cgroup flag that's related to the notify-on-release feature.
This isn't generally useful unless you're trying to debug this
particular feature of cgroups.

This patch moves the "releasable" file to the cgroup_debug subsystem.

Signed-off-by: Paul Menage <menage@google.com>
Cc: "Li Zefan" <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "YAMAMOTO Takashi" <yamamoto@valinux.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 22 additions and 24 deletions
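For context before the diffs: the move is straightforward because the cftype API declared in include/linux/cgroup.h (shown below) lets any subsystem export a read-only flag via a read_u64 handler registered from its populate() method. A minimal sketch of what the cgroup_debug side could look like, assuming that same API; the names releasable_read_u64, debug_files and debug_populate are illustrative, not necessarily those used in the patch:

        /* Report the CGRP_RELEASABLE bit of this cgroup as a single integer.
         * (Illustrative sketch, not the patch itself.) */
        static u64 releasable_read_u64(struct cgroup *cgrp, struct cftype *cft)
        {
                return test_bit(CGRP_RELEASABLE, &cgrp->flags);
        }

        static struct cftype debug_files[] = {
                {
                        .name = "releasable",
                        .read_u64 = releasable_read_u64,
                },
        };

        /* Called by the framework when a cgroup directory is populated. */
        static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
        {
                return cgroup_add_files(cgrp, ss, debug_files,
                                        ARRAY_SIZE(debug_files));
        }

With the debug subsystem mounted on a hierarchy (e.g. mount -t cgroup -o debug), "releasable" then appears only there, rather than on every hierarchy.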

include/linux/cgroup.h
#ifndef _LINUX_CGROUP_H
#define _LINUX_CGROUP_H
/*
 * cgroup interface
 *
 * Copyright (C) 2003 BULL SA
 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 */

#include <linux/sched.h>
#include <linux/kref.h>
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/rcupdate.h>
#include <linux/cgroupstats.h>
#include <linux/prio_heap.h>

#ifdef CONFIG_CGROUPS

struct cgroupfs_root;
struct cgroup_subsys;
struct inode;

extern int cgroup_init_early(void);
extern int cgroup_init(void);
extern void cgroup_init_smp(void);
extern void cgroup_lock(void);
extern void cgroup_unlock(void);
extern void cgroup_fork(struct task_struct *p);
extern void cgroup_fork_callbacks(struct task_struct *p);
extern void cgroup_post_fork(struct task_struct *p);
extern void cgroup_exit(struct task_struct *p, int run_callbacks);
extern int cgroupstats_build(struct cgroupstats *stats,
                             struct dentry *dentry);

extern struct file_operations proc_cgroup_operations;

/* Define the enumeration of all cgroup subsystems */
#define SUBSYS(_x) _x ## _subsys_id,
enum cgroup_subsys_id {
#include <linux/cgroup_subsys.h>
        CGROUP_SUBSYS_COUNT
};
#undef SUBSYS

/* Per-subsystem/per-cgroup state maintained by the system. */
struct cgroup_subsys_state {
        /* The cgroup that this subsystem is attached to. Useful
         * for subsystems that want to know about the cgroup
         * hierarchy structure */
        struct cgroup *cgroup;

        /* State maintained by the cgroup system to allow
         * subsystems to be "busy". Should be accessed via css_get()
         * and css_put() */

        atomic_t refcnt;

        unsigned long flags;
};

/* bits in struct cgroup_subsys_state flags field */
enum {
        CSS_ROOT, /* This CSS is the root of the subsystem */
};

/*
 * Call css_get() to hold a reference on the cgroup;
 *
 */

static inline void css_get(struct cgroup_subsys_state *css)
{
        /* We don't need to reference count the root state */
        if (!test_bit(CSS_ROOT, &css->flags))
                atomic_inc(&css->refcnt);
}
/*
 * css_put() should be called to release a reference taken by
 * css_get()
 */

extern void __css_put(struct cgroup_subsys_state *css);
static inline void css_put(struct cgroup_subsys_state *css)
{
        if (!test_bit(CSS_ROOT, &css->flags))
                __css_put(css);
}

+/* bits in struct cgroup flags field */
+enum {
+        /* Control Group is dead */
+        CGRP_REMOVED,
+        /* Control Group has previously had a child cgroup or a task,
+         * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */
+        CGRP_RELEASABLE,
+        /* Control Group requires release notifications to userspace */
+        CGRP_NOTIFY_ON_RELEASE,
+};
+
struct cgroup {
        unsigned long flags;            /* "unsigned long" so bitops work */

        /* count users of this cgroup. >0 means busy, but doesn't
         * necessarily indicate the number of tasks in the
         * cgroup */
        atomic_t count;

        /*
         * We link our 'sibling' struct into our parent's 'children'.
         * Our children link their 'sibling' into our 'children'.
         */
        struct list_head sibling;       /* my parent's children */
        struct list_head children;      /* my children */

        struct cgroup *parent;          /* my parent */
        struct dentry *dentry;          /* cgroup fs entry */

        /* Private pointers for each registered subsystem */
        struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];

        struct cgroupfs_root *root;
        struct cgroup *top_cgroup;

        /*
         * List of cg_cgroup_links pointing at css_sets with
         * tasks in this cgroup. Protected by css_set_lock
         */
        struct list_head css_sets;

        /*
         * Linked list running through all cgroups that can
         * potentially be reaped by the release agent. Protected by
         * release_list_lock
         */
        struct list_head release_list;
};

/* A css_set is a structure holding pointers to a set of
 * cgroup_subsys_state objects. This saves space in the task struct
 * object and speeds up fork()/exit(), since a single inc/dec and a
 * list_add()/del() can bump the reference count on the entire
 * cgroup set for a task.
 */

struct css_set {

        /* Reference count */
        struct kref ref;

        /*
         * List running through all cgroup groups. Protected by
         * css_set_lock
         */
        struct list_head list;

        /*
         * List running through all tasks using this cgroup
         * group. Protected by css_set_lock
         */
        struct list_head tasks;

        /*
         * List of cg_cgroup_link objects on link chains from
         * cgroups referenced from this css_set. Protected by
         * css_set_lock
         */
        struct list_head cg_links;

        /*
         * Set of subsystem states, one for each subsystem. This array
         * is immutable after creation apart from the init_css_set
         * during subsystem registration (at boot time).
         */
        struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];

};

/*
 * cgroup_map_cb is an abstract callback API for reporting map-valued
 * control files
 */

struct cgroup_map_cb {
        int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value);
        void *state;
};

/* struct cftype:
 *
 * The files in the cgroup filesystem mostly have a very simple read/write
 * handling, some common function will take care of it. Nevertheless some cases
 * (read tasks) are special and therefore I define this structure for every
 * kind of file.
 *
 *
 * When reading/writing to a file:
 *      - the cgroup to use is file->f_dentry->d_parent->d_fsdata
 *      - the 'cftype' of the file is file->f_dentry->d_fsdata
 */

#define MAX_CFTYPE_NAME 64
struct cftype {
        /* By convention, the name should begin with the name of the
         * subsystem, followed by a period */
        char name[MAX_CFTYPE_NAME];
        int private;
        int (*open) (struct inode *inode, struct file *file);
        ssize_t (*read) (struct cgroup *cgrp, struct cftype *cft,
                         struct file *file,
                         char __user *buf, size_t nbytes, loff_t *ppos);
        /*
         * read_u64() is a shortcut for the common case of returning a
         * single integer. Use it in place of read()
         */
        u64 (*read_u64) (struct cgroup *cgrp, struct cftype *cft);
        /*
         * read_map() is used for defining a map of key/value
         * pairs. It should call cb->fill(cb, key, value) for each
         * entry. The key/value pairs (and their ordering) should not
         * change between reboots.
         */
        int (*read_map) (struct cgroup *cont, struct cftype *cft,
                         struct cgroup_map_cb *cb);

        ssize_t (*write) (struct cgroup *cgrp, struct cftype *cft,
                          struct file *file,
                          const char __user *buf, size_t nbytes, loff_t *ppos);

        /*
         * write_u64() is a shortcut for the common case of accepting
         * a single integer (as parsed by simple_strtoull) from
         * userspace. Use in place of write(); return 0 or error.
         */
        int (*write_u64) (struct cgroup *cgrp, struct cftype *cft, u64 val);

        int (*release) (struct inode *inode, struct file *file);
};

struct cgroup_scanner {
        struct cgroup *cg;
        int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan);
        void (*process_task)(struct task_struct *p,
                        struct cgroup_scanner *scan);
        struct ptr_heap *heap;
};

/* Add a new file to the given cgroup directory. Should only be
 * called by subsystems from within a populate() method */
int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
                    const struct cftype *cft);

/* Add a set of new files to the given cgroup directory. Should
 * only be called by subsystems from within a populate() method */
int cgroup_add_files(struct cgroup *cgrp,
                     struct cgroup_subsys *subsys,
                     const struct cftype cft[],
                     int count);

int cgroup_is_removed(const struct cgroup *cgrp);

int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);

int cgroup_task_count(const struct cgroup *cgrp);

/* Return true if the cgroup is a descendant of the current cgroup */
int cgroup_is_descendant(const struct cgroup *cgrp);

/* Control Group subsystem type. See Documentation/cgroups.txt for details */

struct cgroup_subsys {
        struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss,
                                              struct cgroup *cgrp);
        void (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
        void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
        int (*can_attach)(struct cgroup_subsys *ss,
                          struct cgroup *cgrp, struct task_struct *tsk);
        void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
                       struct cgroup *old_cgrp, struct task_struct *tsk);
        void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
        void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
        int (*populate)(struct cgroup_subsys *ss,
                        struct cgroup *cgrp);
        void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp);
        void (*bind)(struct cgroup_subsys *ss, struct cgroup *root);
        int subsys_id;
        int active;
        int disabled;
        int early_init;
#define MAX_CGROUP_TYPE_NAMELEN 32
        const char *name;

        /* Protected by RCU */
        struct cgroupfs_root *root;

        struct list_head sibling;

        void *private;
};

#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
#include <linux/cgroup_subsys.h>
#undef SUBSYS

static inline struct cgroup_subsys_state *cgroup_subsys_state(
        struct cgroup *cgrp, int subsys_id)
{
        return cgrp->subsys[subsys_id];
}

static inline struct cgroup_subsys_state *task_subsys_state(
        struct task_struct *task, int subsys_id)
{
        return rcu_dereference(task->cgroups->subsys[subsys_id]);
}

static inline struct cgroup* task_cgroup(struct task_struct *task,
                int subsys_id)
{
        return task_subsys_state(task, subsys_id)->cgroup;
}

int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *ss);

/* A cgroup_iter should be treated as an opaque object */
struct cgroup_iter {
        struct list_head *cg_link;
        struct list_head *task;
};

/* To iterate across the tasks in a cgroup:
 *
 * 1) call cgroup_iter_start to intialize an iterator
 *
 * 2) call cgroup_iter_next() to retrieve member tasks until it
 *    returns NULL or until you want to end the iteration
 *
 * 3) call cgroup_iter_end() to destroy the iterator.
 *
 * Or, call cgroup_scan_tasks() to iterate through every task in a cpuset.
 *    - cgroup_scan_tasks() holds the css_set_lock when calling the test_task()
 *      callback, but not while calling the process_task() callback.
 */
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it);
struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
                                     struct cgroup_iter *it);
void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
int cgroup_scan_tasks(struct cgroup_scanner *scan);
int cgroup_attach_task(struct cgroup *, struct task_struct *);

#else /* !CONFIG_CGROUPS */

static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
static inline void cgroup_init_smp(void) {}
static inline void cgroup_fork(struct task_struct *p) {}
static inline void cgroup_fork_callbacks(struct task_struct *p) {}
static inline void cgroup_post_fork(struct task_struct *p) {}
static inline void cgroup_exit(struct task_struct *p, int callbacks) {}

static inline void cgroup_lock(void) {}
static inline void cgroup_unlock(void) {}
static inline int cgroupstats_build(struct cgroupstats *stats,
                                    struct dentry *dentry)
{
        return -EINVAL;
}

#endif /* !CONFIG_CGROUPS */

#endif /* _LINUX_CGROUP_H */

1 /* 1 /*
2 * Generic process-grouping system. 2 * Generic process-grouping system.
3 * 3 *
4 * Based originally on the cpuset system, extracted by Paul Menage 4 * Based originally on the cpuset system, extracted by Paul Menage
5 * Copyright (C) 2006 Google, Inc 5 * Copyright (C) 2006 Google, Inc
6 * 6 *
7 * Copyright notices from the original cpuset code: 7 * Copyright notices from the original cpuset code:
8 * -------------------------------------------------- 8 * --------------------------------------------------
9 * Copyright (C) 2003 BULL SA. 9 * Copyright (C) 2003 BULL SA.
10 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 10 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
11 * 11 *
12 * Portions derived from Patrick Mochel's sysfs code. 12 * Portions derived from Patrick Mochel's sysfs code.
13 * sysfs is Copyright (c) 2001-3 Patrick Mochel 13 * sysfs is Copyright (c) 2001-3 Patrick Mochel
14 * 14 *
15 * 2003-10-10 Written by Simon Derr. 15 * 2003-10-10 Written by Simon Derr.
16 * 2003-10-22 Updates by Stephen Hemminger. 16 * 2003-10-22 Updates by Stephen Hemminger.
17 * 2004 May-July Rework by Paul Jackson. 17 * 2004 May-July Rework by Paul Jackson.
18 * --------------------------------------------------- 18 * ---------------------------------------------------
19 * 19 *
20 * This file is subject to the terms and conditions of the GNU General Public 20 * This file is subject to the terms and conditions of the GNU General Public
21 * License. See the file COPYING in the main directory of the Linux 21 * License. See the file COPYING in the main directory of the Linux
22 * distribution for more details. 22 * distribution for more details.
23 */ 23 */
24 24
25 #include <linux/cgroup.h> 25 #include <linux/cgroup.h>
26 #include <linux/errno.h> 26 #include <linux/errno.h>
27 #include <linux/fs.h> 27 #include <linux/fs.h>
28 #include <linux/kernel.h> 28 #include <linux/kernel.h>
29 #include <linux/list.h> 29 #include <linux/list.h>
30 #include <linux/mm.h> 30 #include <linux/mm.h>
31 #include <linux/mutex.h> 31 #include <linux/mutex.h>
32 #include <linux/mount.h> 32 #include <linux/mount.h>
33 #include <linux/pagemap.h> 33 #include <linux/pagemap.h>
34 #include <linux/proc_fs.h> 34 #include <linux/proc_fs.h>
35 #include <linux/rcupdate.h> 35 #include <linux/rcupdate.h>
36 #include <linux/sched.h> 36 #include <linux/sched.h>
37 #include <linux/backing-dev.h> 37 #include <linux/backing-dev.h>
38 #include <linux/seq_file.h> 38 #include <linux/seq_file.h>
39 #include <linux/slab.h> 39 #include <linux/slab.h>
40 #include <linux/magic.h> 40 #include <linux/magic.h>
41 #include <linux/spinlock.h> 41 #include <linux/spinlock.h>
42 #include <linux/string.h> 42 #include <linux/string.h>
43 #include <linux/sort.h> 43 #include <linux/sort.h>
44 #include <linux/kmod.h> 44 #include <linux/kmod.h>
45 #include <linux/delayacct.h> 45 #include <linux/delayacct.h>
46 #include <linux/cgroupstats.h> 46 #include <linux/cgroupstats.h>
47 47
48 #include <asm/atomic.h> 48 #include <asm/atomic.h>
49 49
50 static DEFINE_MUTEX(cgroup_mutex); 50 static DEFINE_MUTEX(cgroup_mutex);
51 51
52 /* Generate an array of cgroup subsystem pointers */ 52 /* Generate an array of cgroup subsystem pointers */
53 #define SUBSYS(_x) &_x ## _subsys, 53 #define SUBSYS(_x) &_x ## _subsys,
54 54
55 static struct cgroup_subsys *subsys[] = { 55 static struct cgroup_subsys *subsys[] = {
56 #include <linux/cgroup_subsys.h> 56 #include <linux/cgroup_subsys.h>
57 }; 57 };
58 58
59 /* 59 /*
60 * A cgroupfs_root represents the root of a cgroup hierarchy, 60 * A cgroupfs_root represents the root of a cgroup hierarchy,
61 * and may be associated with a superblock to form an active 61 * and may be associated with a superblock to form an active
62 * hierarchy 62 * hierarchy
63 */ 63 */
64 struct cgroupfs_root { 64 struct cgroupfs_root {
65 struct super_block *sb; 65 struct super_block *sb;
66 66
67 /* 67 /*
68 * The bitmask of subsystems intended to be attached to this 68 * The bitmask of subsystems intended to be attached to this
69 * hierarchy 69 * hierarchy
70 */ 70 */
71 unsigned long subsys_bits; 71 unsigned long subsys_bits;
72 72
73 /* The bitmask of subsystems currently attached to this hierarchy */ 73 /* The bitmask of subsystems currently attached to this hierarchy */
74 unsigned long actual_subsys_bits; 74 unsigned long actual_subsys_bits;
75 75
76 /* A list running through the attached subsystems */ 76 /* A list running through the attached subsystems */
77 struct list_head subsys_list; 77 struct list_head subsys_list;
78 78
79 /* The root cgroup for this hierarchy */ 79 /* The root cgroup for this hierarchy */
80 struct cgroup top_cgroup; 80 struct cgroup top_cgroup;
81 81
82 /* Tracks how many cgroups are currently defined in hierarchy.*/ 82 /* Tracks how many cgroups are currently defined in hierarchy.*/
83 int number_of_cgroups; 83 int number_of_cgroups;
84 84
85 /* A list running through the mounted hierarchies */ 85 /* A list running through the mounted hierarchies */
86 struct list_head root_list; 86 struct list_head root_list;
87 87
88 /* Hierarchy-specific flags */ 88 /* Hierarchy-specific flags */
89 unsigned long flags; 89 unsigned long flags;
90 90
91 /* The path to use for release notifications. No locking 91 /* The path to use for release notifications. No locking
92 * between setting and use - so if userspace updates this 92 * between setting and use - so if userspace updates this
93 * while child cgroups exist, you could miss a 93 * while child cgroups exist, you could miss a
94 * notification. We ensure that it's always a valid 94 * notification. We ensure that it's always a valid
95 * NUL-terminated string */ 95 * NUL-terminated string */
96 char release_agent_path[PATH_MAX]; 96 char release_agent_path[PATH_MAX];
97 }; 97 };
98 98
99 99
100 /* 100 /*
101 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the 101 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
102 * subsystems that are otherwise unattached - it never has more than a 102 * subsystems that are otherwise unattached - it never has more than a
103 * single cgroup, and all tasks are part of that cgroup. 103 * single cgroup, and all tasks are part of that cgroup.
104 */ 104 */
105 static struct cgroupfs_root rootnode; 105 static struct cgroupfs_root rootnode;
106 106
107 /* The list of hierarchy roots */ 107 /* The list of hierarchy roots */
108 108
109 static LIST_HEAD(roots); 109 static LIST_HEAD(roots);
110 static int root_count; 110 static int root_count;
111 111
112 /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 112 /* dummytop is a shorthand for the dummy hierarchy's top cgroup */
113 #define dummytop (&rootnode.top_cgroup) 113 #define dummytop (&rootnode.top_cgroup)
114 114
115 /* This flag indicates whether tasks in the fork and exit paths should 115 /* This flag indicates whether tasks in the fork and exit paths should
116 * check for fork/exit handlers to call. This avoids us having to do 116 * check for fork/exit handlers to call. This avoids us having to do
117 * extra work in the fork/exit path if none of the subsystems need to 117 * extra work in the fork/exit path if none of the subsystems need to
118 * be called. 118 * be called.
119 */ 119 */
120 static int need_forkexit_callback; 120 static int need_forkexit_callback;
121 121
122 /* bits in struct cgroup flags field */
123 enum {
124 /* Control Group is dead */
125 CGRP_REMOVED,
126 /* Control Group has previously had a child cgroup or a task,
127 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */
128 CGRP_RELEASABLE,
129 /* Control Group requires release notifications to userspace */
130 CGRP_NOTIFY_ON_RELEASE,
131 };
132
133 /* convenient tests for these bits */ 122 /* convenient tests for these bits */
134 inline int cgroup_is_removed(const struct cgroup *cgrp) 123 inline int cgroup_is_removed(const struct cgroup *cgrp)
135 { 124 {
136 return test_bit(CGRP_REMOVED, &cgrp->flags); 125 return test_bit(CGRP_REMOVED, &cgrp->flags);
137 } 126 }
138 127
139 /* bits in struct cgroupfs_root flags field */ 128 /* bits in struct cgroupfs_root flags field */
140 enum { 129 enum {
141 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ 130 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
142 }; 131 };
143 132
144 static int cgroup_is_releasable(const struct cgroup *cgrp) 133 static int cgroup_is_releasable(const struct cgroup *cgrp)
145 { 134 {
146 const int bits = 135 const int bits =
147 (1 << CGRP_RELEASABLE) | 136 (1 << CGRP_RELEASABLE) |
148 (1 << CGRP_NOTIFY_ON_RELEASE); 137 (1 << CGRP_NOTIFY_ON_RELEASE);
149 return (cgrp->flags & bits) == bits; 138 return (cgrp->flags & bits) == bits;
150 } 139 }
151 140
152 static int notify_on_release(const struct cgroup *cgrp) 141 static int notify_on_release(const struct cgroup *cgrp)
153 { 142 {
154 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 143 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
155 } 144 }
156 145
157 /* 146 /*
158 * for_each_subsys() allows you to iterate on each subsystem attached to 147 * for_each_subsys() allows you to iterate on each subsystem attached to
159 * an active hierarchy 148 * an active hierarchy
160 */ 149 */
161 #define for_each_subsys(_root, _ss) \ 150 #define for_each_subsys(_root, _ss) \
162 list_for_each_entry(_ss, &_root->subsys_list, sibling) 151 list_for_each_entry(_ss, &_root->subsys_list, sibling)
163 152
164 /* for_each_root() allows you to iterate across the active hierarchies */ 153 /* for_each_root() allows you to iterate across the active hierarchies */
165 #define for_each_root(_root) \ 154 #define for_each_root(_root) \
166 list_for_each_entry(_root, &roots, root_list) 155 list_for_each_entry(_root, &roots, root_list)
167 156
168 /* the list of cgroups eligible for automatic release. Protected by 157 /* the list of cgroups eligible for automatic release. Protected by
169 * release_list_lock */ 158 * release_list_lock */
170 static LIST_HEAD(release_list); 159 static LIST_HEAD(release_list);
171 static DEFINE_SPINLOCK(release_list_lock); 160 static DEFINE_SPINLOCK(release_list_lock);
172 static void cgroup_release_agent(struct work_struct *work); 161 static void cgroup_release_agent(struct work_struct *work);
173 static DECLARE_WORK(release_agent_work, cgroup_release_agent); 162 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
174 static void check_for_release(struct cgroup *cgrp); 163 static void check_for_release(struct cgroup *cgrp);
175 164
176 /* Link structure for associating css_set objects with cgroups */ 165 /* Link structure for associating css_set objects with cgroups */
177 struct cg_cgroup_link { 166 struct cg_cgroup_link {
178 /* 167 /*
179 * List running through cg_cgroup_links associated with a 168 * List running through cg_cgroup_links associated with a
180 * cgroup, anchored on cgroup->css_sets 169 * cgroup, anchored on cgroup->css_sets
181 */ 170 */
182 struct list_head cgrp_link_list; 171 struct list_head cgrp_link_list;
183 /* 172 /*
184 * List running through cg_cgroup_links pointing at a 173 * List running through cg_cgroup_links pointing at a
185 * single css_set object, anchored on css_set->cg_links 174 * single css_set object, anchored on css_set->cg_links
186 */ 175 */
187 struct list_head cg_link_list; 176 struct list_head cg_link_list;
188 struct css_set *cg; 177 struct css_set *cg;
189 }; 178 };
190 179
191 /* The default css_set - used by init and its children prior to any 180 /* The default css_set - used by init and its children prior to any
192 * hierarchies being mounted. It contains a pointer to the root state 181 * hierarchies being mounted. It contains a pointer to the root state
193 * for each subsystem. Also used to anchor the list of css_sets. Not 182 * for each subsystem. Also used to anchor the list of css_sets. Not
194 * reference-counted, to improve performance when child cgroups 183 * reference-counted, to improve performance when child cgroups
195 * haven't been created. 184 * haven't been created.
196 */ 185 */
197 186
198 static struct css_set init_css_set; 187 static struct css_set init_css_set;
199 static struct cg_cgroup_link init_css_set_link; 188 static struct cg_cgroup_link init_css_set_link;
200 189
201 /* css_set_lock protects the list of css_set objects, and the 190 /* css_set_lock protects the list of css_set objects, and the
202 * chain of tasks off each css_set. Nests outside task->alloc_lock 191 * chain of tasks off each css_set. Nests outside task->alloc_lock
203 * due to cgroup_iter_start() */ 192 * due to cgroup_iter_start() */
204 static DEFINE_RWLOCK(css_set_lock); 193 static DEFINE_RWLOCK(css_set_lock);
205 static int css_set_count; 194 static int css_set_count;
206 195
207 /* We don't maintain the lists running through each css_set to its 196 /* We don't maintain the lists running through each css_set to its
208 * task until after the first call to cgroup_iter_start(). This 197 * task until after the first call to cgroup_iter_start(). This
209 * reduces the fork()/exit() overhead for people who have cgroups 198 * reduces the fork()/exit() overhead for people who have cgroups
210 * compiled into their kernel but not actually in use */ 199 * compiled into their kernel but not actually in use */
211 static int use_task_css_set_links; 200 static int use_task_css_set_links;
212 201
213 /* When we create or destroy a css_set, the operation simply 202 /* When we create or destroy a css_set, the operation simply
214 * takes/releases a reference count on all the cgroups referenced 203 * takes/releases a reference count on all the cgroups referenced
215 * by subsystems in this css_set. This can end up multiple-counting 204 * by subsystems in this css_set. This can end up multiple-counting
216 * some cgroups, but that's OK - the ref-count is just a 205 * some cgroups, but that's OK - the ref-count is just a
217 * busy/not-busy indicator; ensuring that we only count each cgroup 206 * busy/not-busy indicator; ensuring that we only count each cgroup
218 * once would require taking a global lock to ensure that no 207 * once would require taking a global lock to ensure that no
219 * subsystems moved between hierarchies while we were doing so. 208 * subsystems moved between hierarchies while we were doing so.
220 * 209 *
221 * Possible TODO: decide at boot time based on the number of 210 * Possible TODO: decide at boot time based on the number of
222 * registered subsystems and the number of CPUs or NUMA nodes whether 211 * registered subsystems and the number of CPUs or NUMA nodes whether
223 * it's better for performance to ref-count every subsystem, or to 212 * it's better for performance to ref-count every subsystem, or to
224 * take a global lock and only add one ref count to each hierarchy. 213 * take a global lock and only add one ref count to each hierarchy.
225 */ 214 */
226 215
227 /* 216 /*
228 * unlink a css_set from the list and free it 217 * unlink a css_set from the list and free it
229 */ 218 */
230 static void unlink_css_set(struct css_set *cg) 219 static void unlink_css_set(struct css_set *cg)
231 { 220 {
232 write_lock(&css_set_lock); 221 write_lock(&css_set_lock);
233 list_del(&cg->list); 222 list_del(&cg->list);
234 css_set_count--; 223 css_set_count--;
235 while (!list_empty(&cg->cg_links)) { 224 while (!list_empty(&cg->cg_links)) {
236 struct cg_cgroup_link *link; 225 struct cg_cgroup_link *link;
237 link = list_entry(cg->cg_links.next, 226 link = list_entry(cg->cg_links.next,
238 struct cg_cgroup_link, cg_link_list); 227 struct cg_cgroup_link, cg_link_list);
239 list_del(&link->cg_link_list); 228 list_del(&link->cg_link_list);
240 list_del(&link->cgrp_link_list); 229 list_del(&link->cgrp_link_list);
241 kfree(link); 230 kfree(link);
242 } 231 }
243 write_unlock(&css_set_lock); 232 write_unlock(&css_set_lock);
244 } 233 }
245 234
246 static void __release_css_set(struct kref *k, int taskexit) 235 static void __release_css_set(struct kref *k, int taskexit)
247 { 236 {
248 int i; 237 int i;
249 struct css_set *cg = container_of(k, struct css_set, ref); 238 struct css_set *cg = container_of(k, struct css_set, ref);
250 239
251 unlink_css_set(cg); 240 unlink_css_set(cg);
252 241
253 rcu_read_lock(); 242 rcu_read_lock();
254 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 243 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
255 struct cgroup *cgrp = cg->subsys[i]->cgroup; 244 struct cgroup *cgrp = cg->subsys[i]->cgroup;
256 if (atomic_dec_and_test(&cgrp->count) && 245 if (atomic_dec_and_test(&cgrp->count) &&
257 notify_on_release(cgrp)) { 246 notify_on_release(cgrp)) {
258 if (taskexit) 247 if (taskexit)
259 set_bit(CGRP_RELEASABLE, &cgrp->flags); 248 set_bit(CGRP_RELEASABLE, &cgrp->flags);
260 check_for_release(cgrp); 249 check_for_release(cgrp);
261 } 250 }
262 } 251 }
263 rcu_read_unlock(); 252 rcu_read_unlock();
264 kfree(cg); 253 kfree(cg);
265 } 254 }
266 255
267 static void release_css_set(struct kref *k) 256 static void release_css_set(struct kref *k)
268 { 257 {
269 __release_css_set(k, 0); 258 __release_css_set(k, 0);
270 } 259 }
271 260
272 static void release_css_set_taskexit(struct kref *k) 261 static void release_css_set_taskexit(struct kref *k)
273 { 262 {
274 __release_css_set(k, 1); 263 __release_css_set(k, 1);
275 } 264 }
276 265
277 /* 266 /*
278 * refcounted get/put for css_set objects 267 * refcounted get/put for css_set objects
279 */ 268 */
280 static inline void get_css_set(struct css_set *cg) 269 static inline void get_css_set(struct css_set *cg)
281 { 270 {
282 kref_get(&cg->ref); 271 kref_get(&cg->ref);
283 } 272 }
284 273
285 static inline void put_css_set(struct css_set *cg) 274 static inline void put_css_set(struct css_set *cg)
286 { 275 {
287 kref_put(&cg->ref, release_css_set); 276 kref_put(&cg->ref, release_css_set);
288 } 277 }
289 278
290 static inline void put_css_set_taskexit(struct css_set *cg) 279 static inline void put_css_set_taskexit(struct css_set *cg)
291 { 280 {
292 kref_put(&cg->ref, release_css_set_taskexit); 281 kref_put(&cg->ref, release_css_set_taskexit);
293 } 282 }
294 283
295 /* 284 /*
296 * find_existing_css_set() is a helper for 285 * find_existing_css_set() is a helper for
297 * find_css_set(), and checks to see whether an existing 286 * find_css_set(), and checks to see whether an existing
298 * css_set is suitable. This currently walks a linked-list for 287 * css_set is suitable. This currently walks a linked-list for
299 * simplicity; a later patch will use a hash table for better 288 * simplicity; a later patch will use a hash table for better
300 * performance 289 * performance
301 * 290 *
302 * oldcg: the cgroup group that we're using before the cgroup 291 * oldcg: the cgroup group that we're using before the cgroup
303 * transition 292 * transition
304 * 293 *
305 * cgrp: the cgroup that we're moving into 294 * cgrp: the cgroup that we're moving into
306 * 295 *
307 * template: location in which to build the desired set of subsystem 296 * template: location in which to build the desired set of subsystem
308 * state objects for the new cgroup group 297 * state objects for the new cgroup group
309 */ 298 */
310 static struct css_set *find_existing_css_set( 299 static struct css_set *find_existing_css_set(
311 struct css_set *oldcg, 300 struct css_set *oldcg,
312 struct cgroup *cgrp, 301 struct cgroup *cgrp,
313 struct cgroup_subsys_state *template[]) 302 struct cgroup_subsys_state *template[])
314 { 303 {
315 int i; 304 int i;
316 struct cgroupfs_root *root = cgrp->root; 305 struct cgroupfs_root *root = cgrp->root;
317 struct list_head *l = &init_css_set.list; 306 struct list_head *l = &init_css_set.list;
318 307
319 /* Built the set of subsystem state objects that we want to 308 /* Built the set of subsystem state objects that we want to
320 * see in the new css_set */ 309 * see in the new css_set */
321 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 310 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
322 if (root->subsys_bits & (1UL << i)) { 311 if (root->subsys_bits & (1UL << i)) {
323 /* Subsystem is in this hierarchy. So we want 312 /* Subsystem is in this hierarchy. So we want
324 * the subsystem state from the new 313 * the subsystem state from the new
325 * cgroup */ 314 * cgroup */
326 template[i] = cgrp->subsys[i]; 315 template[i] = cgrp->subsys[i];
327 } else { 316 } else {
328 /* Subsystem is not in this hierarchy, so we 317 /* Subsystem is not in this hierarchy, so we
329 * don't want to change the subsystem state */ 318 * don't want to change the subsystem state */
330 template[i] = oldcg->subsys[i]; 319 template[i] = oldcg->subsys[i];
331 } 320 }
332 } 321 }
333 322
334 /* Look through existing cgroup groups to find one to reuse */ 323 /* Look through existing cgroup groups to find one to reuse */
335 do { 324 do {
336 struct css_set *cg = 325 struct css_set *cg =
337 list_entry(l, struct css_set, list); 326 list_entry(l, struct css_set, list);
338 327
339 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { 328 if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
340 /* All subsystems matched */ 329 /* All subsystems matched */
341 return cg; 330 return cg;
342 } 331 }
343 /* Try the next cgroup group */ 332 /* Try the next cgroup group */
344 l = l->next; 333 l = l->next;
345 } while (l != &init_css_set.list); 334 } while (l != &init_css_set.list);
346 335
347 /* No existing cgroup group matched */ 336 /* No existing cgroup group matched */
348 return NULL; 337 return NULL;
349 } 338 }
350 339
351 /* 340 /*
352 * allocate_cg_links() allocates "count" cg_cgroup_link structures 341 * allocate_cg_links() allocates "count" cg_cgroup_link structures
353 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on 342 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
354 * success or a negative error 343 * success or a negative error
355 */ 344 */
356 static int allocate_cg_links(int count, struct list_head *tmp) 345 static int allocate_cg_links(int count, struct list_head *tmp)
357 { 346 {
358 struct cg_cgroup_link *link; 347 struct cg_cgroup_link *link;
359 int i; 348 int i;
360 INIT_LIST_HEAD(tmp); 349 INIT_LIST_HEAD(tmp);
361 for (i = 0; i < count; i++) { 350 for (i = 0; i < count; i++) {
362 link = kmalloc(sizeof(*link), GFP_KERNEL); 351 link = kmalloc(sizeof(*link), GFP_KERNEL);
363 if (!link) { 352 if (!link) {
364 while (!list_empty(tmp)) { 353 while (!list_empty(tmp)) {
365 link = list_entry(tmp->next, 354 link = list_entry(tmp->next,
366 struct cg_cgroup_link, 355 struct cg_cgroup_link,
367 cgrp_link_list); 356 cgrp_link_list);
368 list_del(&link->cgrp_link_list); 357 list_del(&link->cgrp_link_list);
369 kfree(link); 358 kfree(link);
370 } 359 }
371 return -ENOMEM; 360 return -ENOMEM;
372 } 361 }
373 list_add(&link->cgrp_link_list, tmp); 362 list_add(&link->cgrp_link_list, tmp);
374 } 363 }
375 return 0; 364 return 0;
376 } 365 }
377 366
378 static void free_cg_links(struct list_head *tmp) 367 static void free_cg_links(struct list_head *tmp)
379 { 368 {
380 while (!list_empty(tmp)) { 369 while (!list_empty(tmp)) {
381 struct cg_cgroup_link *link; 370 struct cg_cgroup_link *link;
382 link = list_entry(tmp->next, 371 link = list_entry(tmp->next,
383 struct cg_cgroup_link, 372 struct cg_cgroup_link,
384 cgrp_link_list); 373 cgrp_link_list);
385 list_del(&link->cgrp_link_list); 374 list_del(&link->cgrp_link_list);
386 kfree(link); 375 kfree(link);
387 } 376 }
388 } 377 }
389 378
390 /* 379 /*
391 * find_css_set() takes an existing cgroup group and a 380 * find_css_set() takes an existing cgroup group and a
392 * cgroup object, and returns a css_set object that's 381 * cgroup object, and returns a css_set object that's
393 * equivalent to the old group, but with the given cgroup 382 * equivalent to the old group, but with the given cgroup
394 * substituted into the appropriate hierarchy. Must be called with 383 * substituted into the appropriate hierarchy. Must be called with
395 * cgroup_mutex held 384 * cgroup_mutex held
396 */ 385 */
397 static struct css_set *find_css_set( 386 static struct css_set *find_css_set(
398 struct css_set *oldcg, struct cgroup *cgrp) 387 struct css_set *oldcg, struct cgroup *cgrp)
399 { 388 {
400 struct css_set *res; 389 struct css_set *res;
401 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 390 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
402 int i; 391 int i;
403 392
404 struct list_head tmp_cg_links; 393 struct list_head tmp_cg_links;
405 struct cg_cgroup_link *link; 394 struct cg_cgroup_link *link;
406 395
407 /* First see if we already have a cgroup group that matches 396 /* First see if we already have a cgroup group that matches
408 * the desired set */ 397 * the desired set */
409 write_lock(&css_set_lock); 398 write_lock(&css_set_lock);
410 res = find_existing_css_set(oldcg, cgrp, template); 399 res = find_existing_css_set(oldcg, cgrp, template);
411 if (res) 400 if (res)
412 get_css_set(res); 401 get_css_set(res);
413 write_unlock(&css_set_lock); 402 write_unlock(&css_set_lock);
414 403
415 if (res) 404 if (res)
416 return res; 405 return res;
417 406
418 res = kmalloc(sizeof(*res), GFP_KERNEL); 407 res = kmalloc(sizeof(*res), GFP_KERNEL);
419 if (!res) 408 if (!res)
420 return NULL; 409 return NULL;
421 410
422 /* Allocate all the cg_cgroup_link objects that we'll need */ 411 /* Allocate all the cg_cgroup_link objects that we'll need */
423 if (allocate_cg_links(root_count, &tmp_cg_links) < 0) { 412 if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
424 kfree(res); 413 kfree(res);
425 return NULL; 414 return NULL;
426 } 415 }
427 416
428 kref_init(&res->ref); 417 kref_init(&res->ref);
429 INIT_LIST_HEAD(&res->cg_links); 418 INIT_LIST_HEAD(&res->cg_links);
430 INIT_LIST_HEAD(&res->tasks); 419 INIT_LIST_HEAD(&res->tasks);
431 420
432 /* Copy the set of subsystem state objects generated in 421 /* Copy the set of subsystem state objects generated in
433 * find_existing_css_set() */ 422 * find_existing_css_set() */
434 memcpy(res->subsys, template, sizeof(res->subsys)); 423 memcpy(res->subsys, template, sizeof(res->subsys));
435 424
436 write_lock(&css_set_lock); 425 write_lock(&css_set_lock);
437 /* Add reference counts and links from the new css_set. */ 426 /* Add reference counts and links from the new css_set. */
438 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 427 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
439 struct cgroup *cgrp = res->subsys[i]->cgroup; 428 struct cgroup *cgrp = res->subsys[i]->cgroup;
440 struct cgroup_subsys *ss = subsys[i]; 429 struct cgroup_subsys *ss = subsys[i];
441 atomic_inc(&cgrp->count); 430 atomic_inc(&cgrp->count);
442 /* 431 /*
443 * We want to add a link once per cgroup, so we 432 * We want to add a link once per cgroup, so we
444 * only do it for the first subsystem in each 433 * only do it for the first subsystem in each
445 * hierarchy 434 * hierarchy
446 */ 435 */
447 if (ss->root->subsys_list.next == &ss->sibling) { 436 if (ss->root->subsys_list.next == &ss->sibling) {
448 BUG_ON(list_empty(&tmp_cg_links)); 437 BUG_ON(list_empty(&tmp_cg_links));
449 link = list_entry(tmp_cg_links.next, 438 link = list_entry(tmp_cg_links.next,
450 struct cg_cgroup_link, 439 struct cg_cgroup_link,
451 cgrp_link_list); 440 cgrp_link_list);
452 list_del(&link->cgrp_link_list); 441 list_del(&link->cgrp_link_list);
453 list_add(&link->cgrp_link_list, &cgrp->css_sets); 442 list_add(&link->cgrp_link_list, &cgrp->css_sets);
454 link->cg = res; 443 link->cg = res;
455 list_add(&link->cg_link_list, &res->cg_links); 444 list_add(&link->cg_link_list, &res->cg_links);
456 } 445 }
457 } 446 }
458 if (list_empty(&rootnode.subsys_list)) { 447 if (list_empty(&rootnode.subsys_list)) {
459 link = list_entry(tmp_cg_links.next, 448 link = list_entry(tmp_cg_links.next,
460 struct cg_cgroup_link, 449 struct cg_cgroup_link,
461 cgrp_link_list); 450 cgrp_link_list);
462 list_del(&link->cgrp_link_list); 451 list_del(&link->cgrp_link_list);
463 list_add(&link->cgrp_link_list, &dummytop->css_sets); 452 list_add(&link->cgrp_link_list, &dummytop->css_sets);
464 link->cg = res; 453 link->cg = res;
465 list_add(&link->cg_link_list, &res->cg_links); 454 list_add(&link->cg_link_list, &res->cg_links);
466 } 455 }
467 456
468 BUG_ON(!list_empty(&tmp_cg_links)); 457 BUG_ON(!list_empty(&tmp_cg_links));
469 458
470 /* Link this cgroup group into the list */ 459 /* Link this cgroup group into the list */
471 list_add(&res->list, &init_css_set.list); 460 list_add(&res->list, &init_css_set.list);
472 css_set_count++; 461 css_set_count++;
473 write_unlock(&css_set_lock); 462 write_unlock(&css_set_lock);
474 463
475 return res; 464 return res;
476 } 465 }
477 466
478 /* 467 /*
479 * There is one global cgroup mutex. We also require taking 468 * There is one global cgroup mutex. We also require taking
480 * task_lock() when dereferencing a task's cgroup subsys pointers. 469 * task_lock() when dereferencing a task's cgroup subsys pointers.
481 * See "The task_lock() exception", at the end of this comment. 470 * See "The task_lock() exception", at the end of this comment.
482 * 471 *
483 * A task must hold cgroup_mutex to modify cgroups. 472 * A task must hold cgroup_mutex to modify cgroups.
484 * 473 *
485 * Any task can increment and decrement the count field without lock. 474 * Any task can increment and decrement the count field without lock.
486 * So in general, code holding cgroup_mutex can't rely on the count 475 * So in general, code holding cgroup_mutex can't rely on the count
487 * field not changing. However, if the count goes to zero, then only 476 * field not changing. However, if the count goes to zero, then only
488 * cgroup_attach_task() can increment it again. Because a count of zero 477 * cgroup_attach_task() can increment it again. Because a count of zero
489 * means that no tasks are currently attached, therefore there is no 478 * means that no tasks are currently attached, therefore there is no
490 * way a task attached to that cgroup can fork (the other way to 479 * way a task attached to that cgroup can fork (the other way to
491 * increment the count). So code holding cgroup_mutex can safely 480 * increment the count). So code holding cgroup_mutex can safely
492 * assume that if the count is zero, it will stay zero. Similarly, if 481 * assume that if the count is zero, it will stay zero. Similarly, if
493 * a task holds cgroup_mutex on a cgroup with zero count, it 482 * a task holds cgroup_mutex on a cgroup with zero count, it
494 * knows that the cgroup won't be removed, as cgroup_rmdir() 483 * knows that the cgroup won't be removed, as cgroup_rmdir()
495 * needs that mutex. 484 * needs that mutex.
496 * 485 *
497 * The cgroup_common_file_write handler for operations that modify 486 * The cgroup_common_file_write handler for operations that modify
498 * the cgroup hierarchy holds cgroup_mutex across the entire operation, 487 * the cgroup hierarchy holds cgroup_mutex across the entire operation,
499 * single threading all such cgroup modifications across the system. 488 * single threading all such cgroup modifications across the system.
500 * 489 *
501 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't 490 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
502 * (usually) take cgroup_mutex. These are the two most performance 491 * (usually) take cgroup_mutex. These are the two most performance
503 * critical pieces of code here. The exception occurs on cgroup_exit(), 492 * critical pieces of code here. The exception occurs on cgroup_exit(),
504 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex 493 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
505 * is taken, and if the cgroup count is zero, a usermode call made 494 * is taken, and if the cgroup count is zero, a usermode call made
506 * to the release agent with the name of the cgroup (path relative to 495 * to the release agent with the name of the cgroup (path relative to
507 * the root of cgroup file system) as the argument. 496 * the root of cgroup file system) as the argument.
508 * 497 *
509 * A cgroup can only be deleted if both its 'count' of using tasks 498 * A cgroup can only be deleted if both its 'count' of using tasks
510 * is zero, and its list of 'children' cgroups is empty. Since all 499 * is zero, and its list of 'children' cgroups is empty. Since all
511 * tasks in the system use _some_ cgroup, and since there is always at 500 * tasks in the system use _some_ cgroup, and since there is always at
512 * least one task in the system (init, pid == 1), top_cgroup 501 * least one task in the system (init, pid == 1), top_cgroup
513 * always has child cgroups and/or using tasks. So we don't 502 * always has child cgroups and/or using tasks. So we don't
514 * need a special hack to ensure that top_cgroup cannot be deleted. 503 * need a special hack to ensure that top_cgroup cannot be deleted.
515 * 504 *
516 * The task_lock() exception 505 * The task_lock() exception
517 * 506 *
518 * The need for this exception arises from the action of 507 * The need for this exception arises from the action of
519 * cgroup_attach_task(), which overwrites one task's cgroup pointer with 508 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
520 * another. It does so using cgroup_mutex, however there are 509 * another. It does so using cgroup_mutex, however there are
521 * several performance critical places that need to reference 510 * several performance critical places that need to reference
522 * task->cgroup without the expense of grabbing a system global 511 * task->cgroup without the expense of grabbing a system global
523 * mutex. Therefore, except as noted below, when dereferencing or, as 512 * mutex. Therefore, except as noted below, when dereferencing or, as
524 * in cgroup_attach_task(), modifying a task's cgroup pointer, we use 513 * in cgroup_attach_task(), modifying a task's cgroup pointer, we use
525 * task_lock(), which acts on a spinlock (task->alloc_lock) already in 514 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
526 * the task_struct routinely used for such matters. 515 * the task_struct routinely used for such matters.
527 * 516 *
528 * P.S. One more locking exception. RCU is used to guard the 517 * P.S. One more locking exception. RCU is used to guard the
529 * update of a task's cgroup pointer by cgroup_attach_task() 518 * update of a task's cgroup pointer by cgroup_attach_task()
530 */ 519 */
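
The task_lock() rule above can be made concrete with a reader-side sketch (a hypothetical helper, not part of this patch): pin the pointer with task_lock(), copy what is needed, and drop the lock again.

	/* Hypothetical: fetch the cgroup that 'tsk' belongs to in the
	 * hierarchy owning 'subsys_id', without taking cgroup_mutex.
	 * task_lock() keeps cgroup_attach_task() from switching
	 * tsk->cgroups underneath us. */
	static struct cgroup *peek_task_cgroup(struct task_struct *tsk,
					       int subsys_id)
	{
		struct cgroup *cgrp;

		task_lock(tsk);
		cgrp = tsk->cgroups->subsys[subsys_id]->cgroup;
		task_unlock(tsk);
		/* cgrp is only guaranteed to outlive this point if the
		 * caller also holds cgroup_mutex or an RCU read lock. */
		return cgrp;
	}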
531 520
532 /** 521 /**
533 * cgroup_lock - lock out any changes to cgroup structures 522 * cgroup_lock - lock out any changes to cgroup structures
534 * 523 *
535 */ 524 */
536 void cgroup_lock(void) 525 void cgroup_lock(void)
537 { 526 {
538 mutex_lock(&cgroup_mutex); 527 mutex_lock(&cgroup_mutex);
539 } 528 }
540 529
541 /** 530 /**
542 * cgroup_unlock - release lock on cgroup changes 531 * cgroup_unlock - release lock on cgroup changes
543 * 532 *
544 * Undo the lock taken in a previous cgroup_lock() call. 533 * Undo the lock taken in a previous cgroup_lock() call.
545 */ 534 */
546 void cgroup_unlock(void) 535 void cgroup_unlock(void)
547 { 536 {
548 mutex_unlock(&cgroup_mutex); 537 mutex_unlock(&cgroup_mutex);
549 } 538 }
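
These two trivial wrappers exist so that subsystem code can serialize against hierarchy changes without naming cgroup_mutex directly. A typical (hypothetical) caller brackets its inspection of hierarchy state:

	cgroup_lock();
	/* no cgroup can be created, removed or re-bound here */
	if (list_empty(&cgrp->children))
		pr_debug("cgroup has no children\n");	/* hypothetical use */
	cgroup_unlock();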
550 539
551 /* 540 /*
552 * A couple of forward declarations are required, due to the cyclic reference loop: 541 * A couple of forward declarations are required, due to the cyclic reference loop:
553 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> 542 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
554 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations 543 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
555 * -> cgroup_mkdir. 544 * -> cgroup_mkdir.
556 */ 545 */
557 546
558 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 547 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
559 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 548 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
560 static int cgroup_populate_dir(struct cgroup *cgrp); 549 static int cgroup_populate_dir(struct cgroup *cgrp);
561 static struct inode_operations cgroup_dir_inode_operations; 550 static struct inode_operations cgroup_dir_inode_operations;
562 static struct file_operations proc_cgroupstats_operations; 551 static struct file_operations proc_cgroupstats_operations;
563 552
564 static struct backing_dev_info cgroup_backing_dev_info = { 553 static struct backing_dev_info cgroup_backing_dev_info = {
565 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, 554 .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
566 }; 555 };
567 556
568 static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) 557 static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
569 { 558 {
570 struct inode *inode = new_inode(sb); 559 struct inode *inode = new_inode(sb);
571 560
572 if (inode) { 561 if (inode) {
573 inode->i_mode = mode; 562 inode->i_mode = mode;
574 inode->i_uid = current->fsuid; 563 inode->i_uid = current->fsuid;
575 inode->i_gid = current->fsgid; 564 inode->i_gid = current->fsgid;
576 inode->i_blocks = 0; 565 inode->i_blocks = 0;
577 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 566 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
578 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; 567 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
579 } 568 }
580 return inode; 569 return inode;
581 } 570 }
582 571
583 /* 572 /*
584 * Call subsys's pre_destroy handler. 573 * Call subsys's pre_destroy handler.
585 * This is called before css refcnt check. 574 * This is called before css refcnt check.
586 */ 575 */
587 static void cgroup_call_pre_destroy(struct cgroup *cgrp) 576 static void cgroup_call_pre_destroy(struct cgroup *cgrp)
588 { 577 {
589 struct cgroup_subsys *ss; 578 struct cgroup_subsys *ss;
590 for_each_subsys(cgrp->root, ss) 579 for_each_subsys(cgrp->root, ss)
591 if (ss->pre_destroy && cgrp->subsys[ss->subsys_id]) 580 if (ss->pre_destroy && cgrp->subsys[ss->subsys_id])
592 ss->pre_destroy(ss, cgrp); 581 ss->pre_destroy(ss, cgrp);
593 return; 582 return;
594 } 583 }
595 584
596 static void cgroup_diput(struct dentry *dentry, struct inode *inode) 585 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
597 { 586 {
598 /* is dentry a directory? if so, kfree() the associated cgroup */ 587 /* is dentry a directory? if so, kfree() the associated cgroup */
599 if (S_ISDIR(inode->i_mode)) { 588 if (S_ISDIR(inode->i_mode)) {
600 struct cgroup *cgrp = dentry->d_fsdata; 589 struct cgroup *cgrp = dentry->d_fsdata;
601 struct cgroup_subsys *ss; 590 struct cgroup_subsys *ss;
602 BUG_ON(!(cgroup_is_removed(cgrp))); 591 BUG_ON(!(cgroup_is_removed(cgrp)));
603 /* It's possible for external users to be holding css 592 /* It's possible for external users to be holding css
604 * reference counts on a cgroup; css_put() needs to 593 * reference counts on a cgroup; css_put() needs to
605 * be able to access the cgroup after decrementing 594 * be able to access the cgroup after decrementing
606 * the reference count in order to know if it needs to 595 * the reference count in order to know if it needs to
607 * queue the cgroup to be handled by the release 596 * queue the cgroup to be handled by the release
608 * agent */ 597 * agent */
609 synchronize_rcu(); 598 synchronize_rcu();
610 599
611 mutex_lock(&cgroup_mutex); 600 mutex_lock(&cgroup_mutex);
612 /* 601 /*
613 * Release the subsystem state objects. 602 * Release the subsystem state objects.
614 */ 603 */
615 for_each_subsys(cgrp->root, ss) { 604 for_each_subsys(cgrp->root, ss) {
616 if (cgrp->subsys[ss->subsys_id]) 605 if (cgrp->subsys[ss->subsys_id])
617 ss->destroy(ss, cgrp); 606 ss->destroy(ss, cgrp);
618 } 607 }
619 608
620 cgrp->root->number_of_cgroups--; 609 cgrp->root->number_of_cgroups--;
621 mutex_unlock(&cgroup_mutex); 610 mutex_unlock(&cgroup_mutex);
622 611
623 /* Drop the active superblock reference that we took when we 612 /* Drop the active superblock reference that we took when we
624 * created the cgroup */ 613 * created the cgroup */
625 deactivate_super(cgrp->root->sb); 614 deactivate_super(cgrp->root->sb);
626 615
627 kfree(cgrp); 616 kfree(cgrp);
628 } 617 }
629 iput(inode); 618 iput(inode);
630 } 619 }
631 620
632 static void remove_dir(struct dentry *d) 621 static void remove_dir(struct dentry *d)
633 { 622 {
634 struct dentry *parent = dget(d->d_parent); 623 struct dentry *parent = dget(d->d_parent);
635 624
636 d_delete(d); 625 d_delete(d);
637 simple_rmdir(parent->d_inode, d); 626 simple_rmdir(parent->d_inode, d);
638 dput(parent); 627 dput(parent);
639 } 628 }
640 629
641 static void cgroup_clear_directory(struct dentry *dentry) 630 static void cgroup_clear_directory(struct dentry *dentry)
642 { 631 {
643 struct list_head *node; 632 struct list_head *node;
644 633
645 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); 634 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
646 spin_lock(&dcache_lock); 635 spin_lock(&dcache_lock);
647 node = dentry->d_subdirs.next; 636 node = dentry->d_subdirs.next;
648 while (node != &dentry->d_subdirs) { 637 while (node != &dentry->d_subdirs) {
649 struct dentry *d = list_entry(node, struct dentry, d_u.d_child); 638 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
650 list_del_init(node); 639 list_del_init(node);
651 if (d->d_inode) { 640 if (d->d_inode) {
652 /* This should never be called on a cgroup 641 /* This should never be called on a cgroup
653 * directory with child cgroups */ 642 * directory with child cgroups */
654 BUG_ON(d->d_inode->i_mode & S_IFDIR); 643 BUG_ON(d->d_inode->i_mode & S_IFDIR);
655 d = dget_locked(d); 644 d = dget_locked(d);
656 spin_unlock(&dcache_lock); 645 spin_unlock(&dcache_lock);
657 d_delete(d); 646 d_delete(d);
658 simple_unlink(dentry->d_inode, d); 647 simple_unlink(dentry->d_inode, d);
659 dput(d); 648 dput(d);
660 spin_lock(&dcache_lock); 649 spin_lock(&dcache_lock);
661 } 650 }
662 node = dentry->d_subdirs.next; 651 node = dentry->d_subdirs.next;
663 } 652 }
664 spin_unlock(&dcache_lock); 653 spin_unlock(&dcache_lock);
665 } 654 }
666 655
667 /* 656 /*
668 * NOTE: the dentry must have been dget()'ed 657 * NOTE: the dentry must have been dget()'ed
669 */ 658 */
670 static void cgroup_d_remove_dir(struct dentry *dentry) 659 static void cgroup_d_remove_dir(struct dentry *dentry)
671 { 660 {
672 cgroup_clear_directory(dentry); 661 cgroup_clear_directory(dentry);
673 662
674 spin_lock(&dcache_lock); 663 spin_lock(&dcache_lock);
675 list_del_init(&dentry->d_u.d_child); 664 list_del_init(&dentry->d_u.d_child);
676 spin_unlock(&dcache_lock); 665 spin_unlock(&dcache_lock);
677 remove_dir(dentry); 666 remove_dir(dentry);
678 } 667 }
679 668
680 static int rebind_subsystems(struct cgroupfs_root *root, 669 static int rebind_subsystems(struct cgroupfs_root *root,
681 unsigned long final_bits) 670 unsigned long final_bits)
682 { 671 {
683 unsigned long added_bits, removed_bits; 672 unsigned long added_bits, removed_bits;
684 struct cgroup *cgrp = &root->top_cgroup; 673 struct cgroup *cgrp = &root->top_cgroup;
685 int i; 674 int i;
686 675
687 removed_bits = root->actual_subsys_bits & ~final_bits; 676 removed_bits = root->actual_subsys_bits & ~final_bits;
688 added_bits = final_bits & ~root->actual_subsys_bits; 677 added_bits = final_bits & ~root->actual_subsys_bits;
689 /* Check that any added subsystems are currently free */ 678 /* Check that any added subsystems are currently free */
690 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 679 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
691 unsigned long bit = 1UL << i; 680 unsigned long bit = 1UL << i;
692 struct cgroup_subsys *ss = subsys[i]; 681 struct cgroup_subsys *ss = subsys[i];
693 if (!(bit & added_bits)) 682 if (!(bit & added_bits))
694 continue; 683 continue;
695 if (ss->root != &rootnode) { 684 if (ss->root != &rootnode) {
696 /* Subsystem isn't free */ 685 /* Subsystem isn't free */
697 return -EBUSY; 686 return -EBUSY;
698 } 687 }
699 } 688 }
700 689
701 /* Currently we don't handle adding/removing subsystems when 690 /* Currently we don't handle adding/removing subsystems when
702 * any child cgroups exist. This is theoretically supportable 691 * any child cgroups exist. This is theoretically supportable
703 * but involves complex error handling, so it's being left until 692 * but involves complex error handling, so it's being left until
704 * later */ 693 * later */
705 if (!list_empty(&cgrp->children)) 694 if (!list_empty(&cgrp->children))
706 return -EBUSY; 695 return -EBUSY;
707 696
708 /* Process each subsystem */ 697 /* Process each subsystem */
709 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 698 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
710 struct cgroup_subsys *ss = subsys[i]; 699 struct cgroup_subsys *ss = subsys[i];
711 unsigned long bit = 1UL << i; 700 unsigned long bit = 1UL << i;
712 if (bit & added_bits) { 701 if (bit & added_bits) {
713 /* We're binding this subsystem to this hierarchy */ 702 /* We're binding this subsystem to this hierarchy */
714 BUG_ON(cgrp->subsys[i]); 703 BUG_ON(cgrp->subsys[i]);
715 BUG_ON(!dummytop->subsys[i]); 704 BUG_ON(!dummytop->subsys[i]);
716 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 705 BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
717 cgrp->subsys[i] = dummytop->subsys[i]; 706 cgrp->subsys[i] = dummytop->subsys[i];
718 cgrp->subsys[i]->cgroup = cgrp; 707 cgrp->subsys[i]->cgroup = cgrp;
719 list_add(&ss->sibling, &root->subsys_list); 708 list_add(&ss->sibling, &root->subsys_list);
720 rcu_assign_pointer(ss->root, root); 709 rcu_assign_pointer(ss->root, root);
721 if (ss->bind) 710 if (ss->bind)
722 ss->bind(ss, cgrp); 711 ss->bind(ss, cgrp);
723 712
724 } else if (bit & removed_bits) { 713 } else if (bit & removed_bits) {
725 /* We're removing this subsystem */ 714 /* We're removing this subsystem */
726 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 715 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
727 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 716 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
728 if (ss->bind) 717 if (ss->bind)
729 ss->bind(ss, dummytop); 718 ss->bind(ss, dummytop);
730 dummytop->subsys[i]->cgroup = dummytop; 719 dummytop->subsys[i]->cgroup = dummytop;
731 cgrp->subsys[i] = NULL; 720 cgrp->subsys[i] = NULL;
732 rcu_assign_pointer(subsys[i]->root, &rootnode); 721 rcu_assign_pointer(subsys[i]->root, &rootnode);
733 list_del(&ss->sibling); 722 list_del(&ss->sibling);
734 } else if (bit & final_bits) { 723 } else if (bit & final_bits) {
735 /* Subsystem state should already exist */ 724 /* Subsystem state should already exist */
736 BUG_ON(!cgrp->subsys[i]); 725 BUG_ON(!cgrp->subsys[i]);
737 } else { 726 } else {
738 /* Subsystem state shouldn't exist */ 727 /* Subsystem state shouldn't exist */
739 BUG_ON(cgrp->subsys[i]); 728 BUG_ON(cgrp->subsys[i]);
740 } 729 }
741 } 730 }
742 root->subsys_bits = root->actual_subsys_bits = final_bits; 731 root->subsys_bits = root->actual_subsys_bits = final_bits;
743 synchronize_rcu(); 732 synchronize_rcu();
744 733
745 return 0; 734 return 0;
746 } 735 }
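
The added_bits/removed_bits computation at the top of rebind_subsystems() is plain set arithmetic on the hierarchy's subsystem bitmask. A worked example (the subsystem indices are hypothetical): if only subsystem 0 is currently bound and a remount requests subsystems 0 and 2:

	unsigned long actual = 0x1;	/* currently bound: bit 0 */
	unsigned long final  = 0x5;	/* requested: bits 0 and 2 */

	unsigned long removed = actual & ~final;	/* == 0x0, nothing to unbind */
	unsigned long added   = final & ~actual;	/* == 0x4, bind subsystem 2 */

Subsystem 0 then falls into the 'bit & final_bits' arm of the loop and is only sanity-checked.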
747 736
748 static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) 737 static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
749 { 738 {
750 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info; 739 struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
751 struct cgroup_subsys *ss; 740 struct cgroup_subsys *ss;
752 741
753 mutex_lock(&cgroup_mutex); 742 mutex_lock(&cgroup_mutex);
754 for_each_subsys(root, ss) 743 for_each_subsys(root, ss)
755 seq_printf(seq, ",%s", ss->name); 744 seq_printf(seq, ",%s", ss->name);
756 if (test_bit(ROOT_NOPREFIX, &root->flags)) 745 if (test_bit(ROOT_NOPREFIX, &root->flags))
757 seq_puts(seq, ",noprefix"); 746 seq_puts(seq, ",noprefix");
758 if (strlen(root->release_agent_path)) 747 if (strlen(root->release_agent_path))
759 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 748 seq_printf(seq, ",release_agent=%s", root->release_agent_path);
760 mutex_unlock(&cgroup_mutex); 749 mutex_unlock(&cgroup_mutex);
761 return 0; 750 return 0;
762 } 751 }
763 752
764 struct cgroup_sb_opts { 753 struct cgroup_sb_opts {
765 unsigned long subsys_bits; 754 unsigned long subsys_bits;
766 unsigned long flags; 755 unsigned long flags;
767 char *release_agent; 756 char *release_agent;
768 }; 757 };
769 758
770 /* Convert a hierarchy specifier into a bitmask of subsystems and 759 /* Convert a hierarchy specifier into a bitmask of subsystems and
771 * flags. */ 760 * flags. */
772 static int parse_cgroupfs_options(char *data, 761 static int parse_cgroupfs_options(char *data,
773 struct cgroup_sb_opts *opts) 762 struct cgroup_sb_opts *opts)
774 { 763 {
775 char *token, *o = data ?: "all"; 764 char *token, *o = data ?: "all";
776 765
777 opts->subsys_bits = 0; 766 opts->subsys_bits = 0;
778 opts->flags = 0; 767 opts->flags = 0;
779 opts->release_agent = NULL; 768 opts->release_agent = NULL;
780 769
781 while ((token = strsep(&o, ",")) != NULL) { 770 while ((token = strsep(&o, ",")) != NULL) {
782 if (!*token) 771 if (!*token)
783 return -EINVAL; 772 return -EINVAL;
784 if (!strcmp(token, "all")) { 773 if (!strcmp(token, "all")) {
785 /* Add all non-disabled subsystems */ 774 /* Add all non-disabled subsystems */
786 int i; 775 int i;
787 opts->subsys_bits = 0; 776 opts->subsys_bits = 0;
788 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 777 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
789 struct cgroup_subsys *ss = subsys[i]; 778 struct cgroup_subsys *ss = subsys[i];
790 if (!ss->disabled) 779 if (!ss->disabled)
791 opts->subsys_bits |= 1ul << i; 780 opts->subsys_bits |= 1ul << i;
792 } 781 }
793 } else if (!strcmp(token, "noprefix")) { 782 } else if (!strcmp(token, "noprefix")) {
794 set_bit(ROOT_NOPREFIX, &opts->flags); 783 set_bit(ROOT_NOPREFIX, &opts->flags);
795 } else if (!strncmp(token, "release_agent=", 14)) { 784 } else if (!strncmp(token, "release_agent=", 14)) {
796 /* Specifying two release agents is forbidden */ 785 /* Specifying two release agents is forbidden */
797 if (opts->release_agent) 786 if (opts->release_agent)
798 return -EINVAL; 787 return -EINVAL;
799 opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL); 788 opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
800 if (!opts->release_agent) 789 if (!opts->release_agent)
801 return -ENOMEM; 790 return -ENOMEM;
802 strncpy(opts->release_agent, token + 14, PATH_MAX - 1); 791 strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
803 opts->release_agent[PATH_MAX - 1] = 0; 792 opts->release_agent[PATH_MAX - 1] = 0;
804 } else { 793 } else {
805 struct cgroup_subsys *ss; 794 struct cgroup_subsys *ss;
806 int i; 795 int i;
807 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 796 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
808 ss = subsys[i]; 797 ss = subsys[i];
809 if (!strcmp(token, ss->name)) { 798 if (!strcmp(token, ss->name)) {
810 if (!ss->disabled) 799 if (!ss->disabled)
811 set_bit(i, &opts->subsys_bits); 800 set_bit(i, &opts->subsys_bits);
812 break; 801 break;
813 } 802 }
814 } 803 }
815 if (i == CGROUP_SUBSYS_COUNT) 804 if (i == CGROUP_SUBSYS_COUNT)
816 return -ENOENT; 805 return -ENOENT;
817 } 806 }
818 } 807 }
819 808
820 /* We can't have an empty hierarchy */ 809 /* We can't have an empty hierarchy */
821 if (!opts->subsys_bits) 810 if (!opts->subsys_bits)
822 return -EINVAL; 811 return -EINVAL;
823 812
824 return 0; 813 return 0;
825 } 814 }
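
A usage sketch (the option string and agent path are hypothetical): strsep() pokes NUL bytes into its argument, so the option string must be writable, and on success the caller owns the kzalloc()'d release_agent copy:

	char buf[] = "cpuset,noprefix,release_agent=/sbin/cgroup_release";
	struct cgroup_sb_opts opts;

	if (!parse_cgroupfs_options(buf, &opts)) {
		/* cpuset's bit is now set in opts.subsys_bits,
		 * ROOT_NOPREFIX is set in opts.flags, and
		 * opts.release_agent points at a copy of the path. */
		kfree(opts.release_agent);
	}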
826 815
827 static int cgroup_remount(struct super_block *sb, int *flags, char *data) 816 static int cgroup_remount(struct super_block *sb, int *flags, char *data)
828 { 817 {
829 int ret = 0; 818 int ret = 0;
830 struct cgroupfs_root *root = sb->s_fs_info; 819 struct cgroupfs_root *root = sb->s_fs_info;
831 struct cgroup *cgrp = &root->top_cgroup; 820 struct cgroup *cgrp = &root->top_cgroup;
832 struct cgroup_sb_opts opts; 821 struct cgroup_sb_opts opts;
833 822
834 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 823 mutex_lock(&cgrp->dentry->d_inode->i_mutex);
835 mutex_lock(&cgroup_mutex); 824 mutex_lock(&cgroup_mutex);
836 825
837 /* See what subsystems are wanted */ 826 /* See what subsystems are wanted */
838 ret = parse_cgroupfs_options(data, &opts); 827 ret = parse_cgroupfs_options(data, &opts);
839 if (ret) 828 if (ret)
840 goto out_unlock; 829 goto out_unlock;
841 830
842 /* Don't allow flags to change at remount */ 831 /* Don't allow flags to change at remount */
843 if (opts.flags != root->flags) { 832 if (opts.flags != root->flags) {
844 ret = -EINVAL; 833 ret = -EINVAL;
845 goto out_unlock; 834 goto out_unlock;
846 } 835 }
847 836
848 ret = rebind_subsystems(root, opts.subsys_bits); 837 ret = rebind_subsystems(root, opts.subsys_bits);
849 838
850 /* (re)populate subsystem files */ 839 /* (re)populate subsystem files */
851 if (!ret) 840 if (!ret)
852 cgroup_populate_dir(cgrp); 841 cgroup_populate_dir(cgrp);
853 842
854 if (opts.release_agent) 843 if (opts.release_agent)
855 strcpy(root->release_agent_path, opts.release_agent); 844 strcpy(root->release_agent_path, opts.release_agent);
856 out_unlock: 845 out_unlock:
857 if (opts.release_agent) 846 if (opts.release_agent)
858 kfree(opts.release_agent); 847 kfree(opts.release_agent);
859 mutex_unlock(&cgroup_mutex); 848 mutex_unlock(&cgroup_mutex);
860 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 849 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
861 return ret; 850 return ret;
862 } 851 }
863 852
864 static struct super_operations cgroup_ops = { 853 static struct super_operations cgroup_ops = {
865 .statfs = simple_statfs, 854 .statfs = simple_statfs,
866 .drop_inode = generic_delete_inode, 855 .drop_inode = generic_delete_inode,
867 .show_options = cgroup_show_options, 856 .show_options = cgroup_show_options,
868 .remount_fs = cgroup_remount, 857 .remount_fs = cgroup_remount,
869 }; 858 };
870 859
871 static void init_cgroup_root(struct cgroupfs_root *root) 860 static void init_cgroup_root(struct cgroupfs_root *root)
872 { 861 {
873 struct cgroup *cgrp = &root->top_cgroup; 862 struct cgroup *cgrp = &root->top_cgroup;
874 INIT_LIST_HEAD(&root->subsys_list); 863 INIT_LIST_HEAD(&root->subsys_list);
875 INIT_LIST_HEAD(&root->root_list); 864 INIT_LIST_HEAD(&root->root_list);
876 root->number_of_cgroups = 1; 865 root->number_of_cgroups = 1;
877 cgrp->root = root; 866 cgrp->root = root;
878 cgrp->top_cgroup = cgrp; 867 cgrp->top_cgroup = cgrp;
879 INIT_LIST_HEAD(&cgrp->sibling); 868 INIT_LIST_HEAD(&cgrp->sibling);
880 INIT_LIST_HEAD(&cgrp->children); 869 INIT_LIST_HEAD(&cgrp->children);
881 INIT_LIST_HEAD(&cgrp->css_sets); 870 INIT_LIST_HEAD(&cgrp->css_sets);
882 INIT_LIST_HEAD(&cgrp->release_list); 871 INIT_LIST_HEAD(&cgrp->release_list);
883 } 872 }
884 873
885 static int cgroup_test_super(struct super_block *sb, void *data) 874 static int cgroup_test_super(struct super_block *sb, void *data)
886 { 875 {
887 struct cgroupfs_root *new = data; 876 struct cgroupfs_root *new = data;
888 struct cgroupfs_root *root = sb->s_fs_info; 877 struct cgroupfs_root *root = sb->s_fs_info;
889 878
890 /* First check subsystems */ 879 /* First check subsystems */
891 if (new->subsys_bits != root->subsys_bits) 880 if (new->subsys_bits != root->subsys_bits)
892 return 0; 881 return 0;
893 882
894 /* Next check flags */ 883 /* Next check flags */
895 if (new->flags != root->flags) 884 if (new->flags != root->flags)
896 return 0; 885 return 0;
897 886
898 return 1; 887 return 1;
899 } 888 }
900 889
901 static int cgroup_set_super(struct super_block *sb, void *data) 890 static int cgroup_set_super(struct super_block *sb, void *data)
902 { 891 {
903 int ret; 892 int ret;
904 struct cgroupfs_root *root = data; 893 struct cgroupfs_root *root = data;
905 894
906 ret = set_anon_super(sb, NULL); 895 ret = set_anon_super(sb, NULL);
907 if (ret) 896 if (ret)
908 return ret; 897 return ret;
909 898
910 sb->s_fs_info = root; 899 sb->s_fs_info = root;
911 root->sb = sb; 900 root->sb = sb;
912 901
913 sb->s_blocksize = PAGE_CACHE_SIZE; 902 sb->s_blocksize = PAGE_CACHE_SIZE;
914 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 903 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
915 sb->s_magic = CGROUP_SUPER_MAGIC; 904 sb->s_magic = CGROUP_SUPER_MAGIC;
916 sb->s_op = &cgroup_ops; 905 sb->s_op = &cgroup_ops;
917 906
918 return 0; 907 return 0;
919 } 908 }
920 909
921 static int cgroup_get_rootdir(struct super_block *sb) 910 static int cgroup_get_rootdir(struct super_block *sb)
922 { 911 {
923 struct inode *inode = 912 struct inode *inode =
924 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 913 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
925 struct dentry *dentry; 914 struct dentry *dentry;
926 915
927 if (!inode) 916 if (!inode)
928 return -ENOMEM; 917 return -ENOMEM;
929 918
930 inode->i_fop = &simple_dir_operations; 919 inode->i_fop = &simple_dir_operations;
931 inode->i_op = &cgroup_dir_inode_operations; 920 inode->i_op = &cgroup_dir_inode_operations;
932 /* directories start off with i_nlink == 2 (for "." entry) */ 921 /* directories start off with i_nlink == 2 (for "." entry) */
933 inc_nlink(inode); 922 inc_nlink(inode);
934 dentry = d_alloc_root(inode); 923 dentry = d_alloc_root(inode);
935 if (!dentry) { 924 if (!dentry) {
936 iput(inode); 925 iput(inode);
937 return -ENOMEM; 926 return -ENOMEM;
938 } 927 }
939 sb->s_root = dentry; 928 sb->s_root = dentry;
940 return 0; 929 return 0;
941 } 930 }
942 931
943 static int cgroup_get_sb(struct file_system_type *fs_type, 932 static int cgroup_get_sb(struct file_system_type *fs_type,
944 int flags, const char *unused_dev_name, 933 int flags, const char *unused_dev_name,
945 void *data, struct vfsmount *mnt) 934 void *data, struct vfsmount *mnt)
946 { 935 {
947 struct cgroup_sb_opts opts; 936 struct cgroup_sb_opts opts;
948 int ret = 0; 937 int ret = 0;
949 struct super_block *sb; 938 struct super_block *sb;
950 struct cgroupfs_root *root; 939 struct cgroupfs_root *root;
951 struct list_head tmp_cg_links, *l; 940 struct list_head tmp_cg_links, *l;
952 INIT_LIST_HEAD(&tmp_cg_links); 941 INIT_LIST_HEAD(&tmp_cg_links);
953 942
954 /* First find the desired set of subsystems */ 943 /* First find the desired set of subsystems */
955 ret = parse_cgroupfs_options(data, &opts); 944 ret = parse_cgroupfs_options(data, &opts);
956 if (ret) { 945 if (ret) {
957 if (opts.release_agent) 946 if (opts.release_agent)
958 kfree(opts.release_agent); 947 kfree(opts.release_agent);
959 return ret; 948 return ret;
960 } 949 }
961 950
962 root = kzalloc(sizeof(*root), GFP_KERNEL); 951 root = kzalloc(sizeof(*root), GFP_KERNEL);
963 if (!root) { 952 if (!root) {
964 if (opts.release_agent) 953 if (opts.release_agent)
965 kfree(opts.release_agent); 954 kfree(opts.release_agent);
966 return -ENOMEM; 955 return -ENOMEM;
967 } 956 }
968 957
969 init_cgroup_root(root); 958 init_cgroup_root(root);
970 root->subsys_bits = opts.subsys_bits; 959 root->subsys_bits = opts.subsys_bits;
971 root->flags = opts.flags; 960 root->flags = opts.flags;
972 if (opts.release_agent) { 961 if (opts.release_agent) {
973 strcpy(root->release_agent_path, opts.release_agent); 962 strcpy(root->release_agent_path, opts.release_agent);
974 kfree(opts.release_agent); 963 kfree(opts.release_agent);
975 } 964 }
976 965
977 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root); 966 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
978 967
979 if (IS_ERR(sb)) { 968 if (IS_ERR(sb)) {
980 kfree(root); 969 kfree(root);
981 return PTR_ERR(sb); 970 return PTR_ERR(sb);
982 } 971 }
983 972
984 if (sb->s_fs_info != root) { 973 if (sb->s_fs_info != root) {
985 /* Reusing an existing superblock */ 974 /* Reusing an existing superblock */
986 BUG_ON(sb->s_root == NULL); 975 BUG_ON(sb->s_root == NULL);
987 kfree(root); 976 kfree(root);
988 root = NULL; 977 root = NULL;
989 } else { 978 } else {
990 /* New superblock */ 979 /* New superblock */
991 struct cgroup *cgrp = &root->top_cgroup; 980 struct cgroup *cgrp = &root->top_cgroup;
992 struct inode *inode; 981 struct inode *inode;
993 982
994 BUG_ON(sb->s_root != NULL); 983 BUG_ON(sb->s_root != NULL);
995 984
996 ret = cgroup_get_rootdir(sb); 985 ret = cgroup_get_rootdir(sb);
997 if (ret) 986 if (ret)
998 goto drop_new_super; 987 goto drop_new_super;
999 inode = sb->s_root->d_inode; 988 inode = sb->s_root->d_inode;
1000 989
1001 mutex_lock(&inode->i_mutex); 990 mutex_lock(&inode->i_mutex);
1002 mutex_lock(&cgroup_mutex); 991 mutex_lock(&cgroup_mutex);
1003 992
1004 /* 993 /*
1005 * We're accessing css_set_count without locking 994 * We're accessing css_set_count without locking
1006 * css_set_lock here, but that's OK - it can only be 995 * css_set_lock here, but that's OK - it can only be
1007 * increased by someone holding cgroup_lock, and 996 * increased by someone holding cgroup_lock, and
1008 * that's us. The worst that can happen is that we 997 * that's us. The worst that can happen is that we
1009 * have some link structures left over 998 * have some link structures left over
1010 */ 999 */
1011 ret = allocate_cg_links(css_set_count, &tmp_cg_links); 1000 ret = allocate_cg_links(css_set_count, &tmp_cg_links);
1012 if (ret) { 1001 if (ret) {
1013 mutex_unlock(&cgroup_mutex); 1002 mutex_unlock(&cgroup_mutex);
1014 mutex_unlock(&inode->i_mutex); 1003 mutex_unlock(&inode->i_mutex);
1015 goto drop_new_super; 1004 goto drop_new_super;
1016 } 1005 }
1017 1006
1018 ret = rebind_subsystems(root, root->subsys_bits); 1007 ret = rebind_subsystems(root, root->subsys_bits);
1019 if (ret == -EBUSY) { 1008 if (ret == -EBUSY) {
1020 mutex_unlock(&cgroup_mutex); 1009 mutex_unlock(&cgroup_mutex);
1021 mutex_unlock(&inode->i_mutex); 1010 mutex_unlock(&inode->i_mutex);
1022 goto drop_new_super; 1011 goto drop_new_super;
1023 } 1012 }
1024 1013
1025 /* EBUSY should be the only error here */ 1014 /* EBUSY should be the only error here */
1026 BUG_ON(ret); 1015 BUG_ON(ret);
1027 1016
1028 list_add(&root->root_list, &roots); 1017 list_add(&root->root_list, &roots);
1029 root_count++; 1018 root_count++;
1030 1019
1031 sb->s_root->d_fsdata = &root->top_cgroup; 1020 sb->s_root->d_fsdata = &root->top_cgroup;
1032 root->top_cgroup.dentry = sb->s_root; 1021 root->top_cgroup.dentry = sb->s_root;
1033 1022
1034 /* Link the top cgroup in this hierarchy into all 1023 /* Link the top cgroup in this hierarchy into all
1035 * the css_set objects */ 1024 * the css_set objects */
1036 write_lock(&css_set_lock); 1025 write_lock(&css_set_lock);
1037 l = &init_css_set.list; 1026 l = &init_css_set.list;
1038 do { 1027 do {
1039 struct css_set *cg; 1028 struct css_set *cg;
1040 struct cg_cgroup_link *link; 1029 struct cg_cgroup_link *link;
1041 cg = list_entry(l, struct css_set, list); 1030 cg = list_entry(l, struct css_set, list);
1042 BUG_ON(list_empty(&tmp_cg_links)); 1031 BUG_ON(list_empty(&tmp_cg_links));
1043 link = list_entry(tmp_cg_links.next, 1032 link = list_entry(tmp_cg_links.next,
1044 struct cg_cgroup_link, 1033 struct cg_cgroup_link,
1045 cgrp_link_list); 1034 cgrp_link_list);
1046 list_del(&link->cgrp_link_list); 1035 list_del(&link->cgrp_link_list);
1047 link->cg = cg; 1036 link->cg = cg;
1048 list_add(&link->cgrp_link_list, 1037 list_add(&link->cgrp_link_list,
1049 &root->top_cgroup.css_sets); 1038 &root->top_cgroup.css_sets);
1050 list_add(&link->cg_link_list, &cg->cg_links); 1039 list_add(&link->cg_link_list, &cg->cg_links);
1051 l = l->next; 1040 l = l->next;
1052 } while (l != &init_css_set.list); 1041 } while (l != &init_css_set.list);
1053 write_unlock(&css_set_lock); 1042 write_unlock(&css_set_lock);
1054 1043
1055 free_cg_links(&tmp_cg_links); 1044 free_cg_links(&tmp_cg_links);
1056 1045
1057 BUG_ON(!list_empty(&cgrp->sibling)); 1046 BUG_ON(!list_empty(&cgrp->sibling));
1058 BUG_ON(!list_empty(&cgrp->children)); 1047 BUG_ON(!list_empty(&cgrp->children));
1059 BUG_ON(root->number_of_cgroups != 1); 1048 BUG_ON(root->number_of_cgroups != 1);
1060 1049
1061 cgroup_populate_dir(cgrp); 1050 cgroup_populate_dir(cgrp);
1062 mutex_unlock(&inode->i_mutex); 1051 mutex_unlock(&inode->i_mutex);
1063 mutex_unlock(&cgroup_mutex); 1052 mutex_unlock(&cgroup_mutex);
1064 } 1053 }
1065 1054
1066 return simple_set_mnt(mnt, sb); 1055 return simple_set_mnt(mnt, sb);
1067 1056
1068 drop_new_super: 1057 drop_new_super:
1069 up_write(&sb->s_umount); 1058 up_write(&sb->s_umount);
1070 deactivate_super(sb); 1059 deactivate_super(sb);
1071 free_cg_links(&tmp_cg_links); 1060 free_cg_links(&tmp_cg_links);
1072 return ret; 1061 return ret;
1073 } 1062 }
1074 1063
1075 static void cgroup_kill_sb(struct super_block *sb) { 1064 static void cgroup_kill_sb(struct super_block *sb) {
1076 struct cgroupfs_root *root = sb->s_fs_info; 1065 struct cgroupfs_root *root = sb->s_fs_info;
1077 struct cgroup *cgrp = &root->top_cgroup; 1066 struct cgroup *cgrp = &root->top_cgroup;
1078 int ret; 1067 int ret;
1079 1068
1080 BUG_ON(!root); 1069 BUG_ON(!root);
1081 1070
1082 BUG_ON(root->number_of_cgroups != 1); 1071 BUG_ON(root->number_of_cgroups != 1);
1083 BUG_ON(!list_empty(&cgrp->children)); 1072 BUG_ON(!list_empty(&cgrp->children));
1084 BUG_ON(!list_empty(&cgrp->sibling)); 1073 BUG_ON(!list_empty(&cgrp->sibling));
1085 1074
1086 mutex_lock(&cgroup_mutex); 1075 mutex_lock(&cgroup_mutex);
1087 1076
1088 /* Rebind all subsystems back to the default hierarchy */ 1077 /* Rebind all subsystems back to the default hierarchy */
1089 ret = rebind_subsystems(root, 0); 1078 ret = rebind_subsystems(root, 0);
1090 /* Shouldn't be able to fail ... */ 1079 /* Shouldn't be able to fail ... */
1091 BUG_ON(ret); 1080 BUG_ON(ret);
1092 1081
1093 /* 1082 /*
1094 * Release all the links from css_sets to this hierarchy's 1083 * Release all the links from css_sets to this hierarchy's
1095 * root cgroup 1084 * root cgroup
1096 */ 1085 */
1097 write_lock(&css_set_lock); 1086 write_lock(&css_set_lock);
1098 while (!list_empty(&cgrp->css_sets)) { 1087 while (!list_empty(&cgrp->css_sets)) {
1099 struct cg_cgroup_link *link; 1088 struct cg_cgroup_link *link;
1100 link = list_entry(cgrp->css_sets.next, 1089 link = list_entry(cgrp->css_sets.next,
1101 struct cg_cgroup_link, cgrp_link_list); 1090 struct cg_cgroup_link, cgrp_link_list);
1102 list_del(&link->cg_link_list); 1091 list_del(&link->cg_link_list);
1103 list_del(&link->cgrp_link_list); 1092 list_del(&link->cgrp_link_list);
1104 kfree(link); 1093 kfree(link);
1105 } 1094 }
1106 write_unlock(&css_set_lock); 1095 write_unlock(&css_set_lock);
1107 1096
1108 if (!list_empty(&root->root_list)) { 1097 if (!list_empty(&root->root_list)) {
1109 list_del(&root->root_list); 1098 list_del(&root->root_list);
1110 root_count--; 1099 root_count--;
1111 } 1100 }
1112 mutex_unlock(&cgroup_mutex); 1101 mutex_unlock(&cgroup_mutex);
1113 1102
1114 kfree(root); 1103 kfree(root);
1115 kill_litter_super(sb); 1104 kill_litter_super(sb);
1116 } 1105 }
1117 1106
1118 static struct file_system_type cgroup_fs_type = { 1107 static struct file_system_type cgroup_fs_type = {
1119 .name = "cgroup", 1108 .name = "cgroup",
1120 .get_sb = cgroup_get_sb, 1109 .get_sb = cgroup_get_sb,
1121 .kill_sb = cgroup_kill_sb, 1110 .kill_sb = cgroup_kill_sb,
1122 }; 1111 };
1123 1112
1124 static inline struct cgroup *__d_cgrp(struct dentry *dentry) 1113 static inline struct cgroup *__d_cgrp(struct dentry *dentry)
1125 { 1114 {
1126 return dentry->d_fsdata; 1115 return dentry->d_fsdata;
1127 } 1116 }
1128 1117
1129 static inline struct cftype *__d_cft(struct dentry *dentry) 1118 static inline struct cftype *__d_cft(struct dentry *dentry)
1130 { 1119 {
1131 return dentry->d_fsdata; 1120 return dentry->d_fsdata;
1132 } 1121 }
1133 1122
1134 /** 1123 /**
1135 * cgroup_path - generate the path of a cgroup 1124 * cgroup_path - generate the path of a cgroup
1136 * @cgrp: the cgroup in question 1125 * @cgrp: the cgroup in question
1137 * @buf: the buffer to write the path into 1126 * @buf: the buffer to write the path into
1138 * @buflen: the length of the buffer 1127 * @buflen: the length of the buffer
1139 * 1128 *
1140 * Called with cgroup_mutex held. Writes path of cgroup into buf. 1129 * Called with cgroup_mutex held. Writes path of cgroup into buf.
1141 * Returns 0 on success, -errno on error. 1130 * Returns 0 on success, -errno on error.
1142 */ 1131 */
1143 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1132 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1144 { 1133 {
1145 char *start; 1134 char *start;
1146 1135
1147 if (cgrp == dummytop) { 1136 if (cgrp == dummytop) {
1148 /* 1137 /*
1149 * Inactive subsystems have no dentry for their root 1138 * Inactive subsystems have no dentry for their root
1150 * cgroup 1139 * cgroup
1151 */ 1140 */
1152 strcpy(buf, "/"); 1141 strcpy(buf, "/");
1153 return 0; 1142 return 0;
1154 } 1143 }
1155 1144
1156 start = buf + buflen; 1145 start = buf + buflen;
1157 1146
1158 *--start = '\0'; 1147 *--start = '\0';
1159 for (;;) { 1148 for (;;) {
1160 int len = cgrp->dentry->d_name.len; 1149 int len = cgrp->dentry->d_name.len;
1161 if ((start -= len) < buf) 1150 if ((start -= len) < buf)
1162 return -ENAMETOOLONG; 1151 return -ENAMETOOLONG;
1163 memcpy(start, cgrp->dentry->d_name.name, len); 1152 memcpy(start, cgrp->dentry->d_name.name, len);
1164 cgrp = cgrp->parent; 1153 cgrp = cgrp->parent;
1165 if (!cgrp) 1154 if (!cgrp)
1166 break; 1155 break;
1167 if (!cgrp->parent) 1156 if (!cgrp->parent)
1168 continue; 1157 continue;
1169 if (--start < buf) 1158 if (--start < buf)
1170 return -ENAMETOOLONG; 1159 return -ENAMETOOLONG;
1171 *start = '/'; 1160 *start = '/';
1172 } 1161 }
1173 memmove(buf, start, buf + buflen - start); 1162 memmove(buf, start, buf + buflen - start);
1174 return 0; 1163 return 0;
1175 } 1164 }
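
The path is assembled right to left from the end of the buffer, so no intermediate copies are needed: for a cgroup b nested under a, the loop writes "b", then "/", then "a", and finally the root dentry's own name "/", yielding "/a/b" after the memmove(). A caller sketch (the printk is hypothetical):

	char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);

	if (buf) {
		mutex_lock(&cgroup_mutex);	/* required by cgroup_path() */
		if (!cgroup_path(cgrp, buf, PAGE_SIZE))
			printk(KERN_DEBUG "cgroup path: %s\n", buf);
		mutex_unlock(&cgroup_mutex);
		kfree(buf);
	}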
1176 1165
1177 /* 1166 /*
1178 * Return the first subsystem attached to a cgroup's hierarchy, and 1167 * Return the first subsystem attached to a cgroup's hierarchy, and
1179 * its subsystem id. 1168 * its subsystem id.
1180 */ 1169 */
1181 1170
1182 static void get_first_subsys(const struct cgroup *cgrp, 1171 static void get_first_subsys(const struct cgroup *cgrp,
1183 struct cgroup_subsys_state **css, int *subsys_id) 1172 struct cgroup_subsys_state **css, int *subsys_id)
1184 { 1173 {
1185 const struct cgroupfs_root *root = cgrp->root; 1174 const struct cgroupfs_root *root = cgrp->root;
1186 const struct cgroup_subsys *test_ss; 1175 const struct cgroup_subsys *test_ss;
1187 BUG_ON(list_empty(&root->subsys_list)); 1176 BUG_ON(list_empty(&root->subsys_list));
1188 test_ss = list_entry(root->subsys_list.next, 1177 test_ss = list_entry(root->subsys_list.next,
1189 struct cgroup_subsys, sibling); 1178 struct cgroup_subsys, sibling);
1190 if (css) { 1179 if (css) {
1191 *css = cgrp->subsys[test_ss->subsys_id]; 1180 *css = cgrp->subsys[test_ss->subsys_id];
1192 BUG_ON(!*css); 1181 BUG_ON(!*css);
1193 } 1182 }
1194 if (subsys_id) 1183 if (subsys_id)
1195 *subsys_id = test_ss->subsys_id; 1184 *subsys_id = test_ss->subsys_id;
1196 } 1185 }
1197 1186
1198 /** 1187 /**
1199 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1188 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
1200 * @cgrp: the cgroup the task is attaching to 1189 * @cgrp: the cgroup the task is attaching to
1201 * @tsk: the task to be attached 1190 * @tsk: the task to be attached
1202 * 1191 *
1203 * Call holding cgroup_mutex. May take task_lock of 1192 * Call holding cgroup_mutex. May take task_lock of
1204 * the task 'tsk' during call. 1193 * the task 'tsk' during call.
1205 */ 1194 */
1206 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1195 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1207 { 1196 {
1208 int retval = 0; 1197 int retval = 0;
1209 struct cgroup_subsys *ss; 1198 struct cgroup_subsys *ss;
1210 struct cgroup *oldcgrp; 1199 struct cgroup *oldcgrp;
1211 struct css_set *cg = tsk->cgroups; 1200 struct css_set *cg = tsk->cgroups;
1212 struct css_set *newcg; 1201 struct css_set *newcg;
1213 struct cgroupfs_root *root = cgrp->root; 1202 struct cgroupfs_root *root = cgrp->root;
1214 int subsys_id; 1203 int subsys_id;
1215 1204
1216 get_first_subsys(cgrp, NULL, &subsys_id); 1205 get_first_subsys(cgrp, NULL, &subsys_id);
1217 1206
1218 /* Nothing to do if the task is already in that cgroup */ 1207 /* Nothing to do if the task is already in that cgroup */
1219 oldcgrp = task_cgroup(tsk, subsys_id); 1208 oldcgrp = task_cgroup(tsk, subsys_id);
1220 if (cgrp == oldcgrp) 1209 if (cgrp == oldcgrp)
1221 return 0; 1210 return 0;
1222 1211
1223 for_each_subsys(root, ss) { 1212 for_each_subsys(root, ss) {
1224 if (ss->can_attach) { 1213 if (ss->can_attach) {
1225 retval = ss->can_attach(ss, cgrp, tsk); 1214 retval = ss->can_attach(ss, cgrp, tsk);
1226 if (retval) 1215 if (retval)
1227 return retval; 1216 return retval;
1228 } 1217 }
1229 } 1218 }
1230 1219
1231 /* 1220 /*
1232 * Locate or allocate a new css_set for this task, 1221 * Locate or allocate a new css_set for this task,
1233 * based on its final set of cgroups 1222 * based on its final set of cgroups
1234 */ 1223 */
1235 newcg = find_css_set(cg, cgrp); 1224 newcg = find_css_set(cg, cgrp);
1236 if (!newcg) 1225 if (!newcg)
1237 return -ENOMEM; 1226 return -ENOMEM;
1238 1227
1239 task_lock(tsk); 1228 task_lock(tsk);
1240 if (tsk->flags & PF_EXITING) { 1229 if (tsk->flags & PF_EXITING) {
1241 task_unlock(tsk); 1230 task_unlock(tsk);
1242 put_css_set(newcg); 1231 put_css_set(newcg);
1243 return -ESRCH; 1232 return -ESRCH;
1244 } 1233 }
1245 rcu_assign_pointer(tsk->cgroups, newcg); 1234 rcu_assign_pointer(tsk->cgroups, newcg);
1246 task_unlock(tsk); 1235 task_unlock(tsk);
1247 1236
1248 /* Update the css_set linked lists if we're using them */ 1237 /* Update the css_set linked lists if we're using them */
1249 write_lock(&css_set_lock); 1238 write_lock(&css_set_lock);
1250 if (!list_empty(&tsk->cg_list)) { 1239 if (!list_empty(&tsk->cg_list)) {
1251 list_del(&tsk->cg_list); 1240 list_del(&tsk->cg_list);
1252 list_add(&tsk->cg_list, &newcg->tasks); 1241 list_add(&tsk->cg_list, &newcg->tasks);
1253 } 1242 }
1254 write_unlock(&css_set_lock); 1243 write_unlock(&css_set_lock);
1255 1244
1256 for_each_subsys(root, ss) { 1245 for_each_subsys(root, ss) {
1257 if (ss->attach) 1246 if (ss->attach)
1258 ss->attach(ss, cgrp, oldcgrp, tsk); 1247 ss->attach(ss, cgrp, oldcgrp, tsk);
1259 } 1248 }
1260 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1249 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1261 synchronize_rcu(); 1250 synchronize_rcu();
1262 put_css_set(cg); 1251 put_css_set(cg);
1263 return 0; 1252 return 0;
1264 } 1253 }
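
A minimal caller sketch (error handling elided): the function must be entered with cgroup_mutex held, which the cgroup_lock()/cgroup_unlock() wrappers above provide.

	cgroup_lock();	/* cgroup_attach_task() requires cgroup_mutex */
	if (cgroup_attach_task(cgrp, current))
		pr_debug("attach failed\n");	/* hypothetical */
	cgroup_unlock();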
1265 1254
1266 /* 1255 /*
1267 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with 1256 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with
1268 * cgroup_mutex held; may take task_lock of the task 1257 * cgroup_mutex held; may take task_lock of the task
1269 */ 1258 */
1270 static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) 1259 static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
1271 { 1260 {
1272 pid_t pid; 1261 pid_t pid;
1273 struct task_struct *tsk; 1262 struct task_struct *tsk;
1274 int ret; 1263 int ret;
1275 1264
1276 if (sscanf(pidbuf, "%d", &pid) != 1) 1265 if (sscanf(pidbuf, "%d", &pid) != 1)
1277 return -EIO; 1266 return -EIO;
1278 1267
1279 if (pid) { 1268 if (pid) {
1280 rcu_read_lock(); 1269 rcu_read_lock();
1281 tsk = find_task_by_vpid(pid); 1270 tsk = find_task_by_vpid(pid);
1282 if (!tsk || tsk->flags & PF_EXITING) { 1271 if (!tsk || tsk->flags & PF_EXITING) {
1283 rcu_read_unlock(); 1272 rcu_read_unlock();
1284 return -ESRCH; 1273 return -ESRCH;
1285 } 1274 }
1286 get_task_struct(tsk); 1275 get_task_struct(tsk);
1287 rcu_read_unlock(); 1276 rcu_read_unlock();
1288 1277
1289 if ((current->euid) && (current->euid != tsk->uid) 1278 if ((current->euid) && (current->euid != tsk->uid)
1290 && (current->euid != tsk->suid)) { 1279 && (current->euid != tsk->suid)) {
1291 put_task_struct(tsk); 1280 put_task_struct(tsk);
1292 return -EACCES; 1281 return -EACCES;
1293 } 1282 }
1294 } else { 1283 } else {
1295 tsk = current; 1284 tsk = current;
1296 get_task_struct(tsk); 1285 get_task_struct(tsk);
1297 } 1286 }
1298 1287
1299 ret = cgroup_attach_task(cgrp, tsk); 1288 ret = cgroup_attach_task(cgrp, tsk);
1300 put_task_struct(tsk); 1289 put_task_struct(tsk);
1301 return ret; 1290 return ret;
1302 } 1291 }
1303 1292
1304 /* The various types of files and directories in a cgroup file system */ 1293 /* The various types of files and directories in a cgroup file system */
1305 enum cgroup_filetype { 1294 enum cgroup_filetype {
1306 FILE_ROOT, 1295 FILE_ROOT,
1307 FILE_DIR, 1296 FILE_DIR,
1308 FILE_TASKLIST, 1297 FILE_TASKLIST,
1309 FILE_NOTIFY_ON_RELEASE, 1298 FILE_NOTIFY_ON_RELEASE,
1310 FILE_RELEASABLE,
1311 FILE_RELEASE_AGENT, 1299 FILE_RELEASE_AGENT,
1312 }; 1300 };
1313 1301
1314 static ssize_t cgroup_write_u64(struct cgroup *cgrp, struct cftype *cft, 1302 static ssize_t cgroup_write_u64(struct cgroup *cgrp, struct cftype *cft,
1315 struct file *file, 1303 struct file *file,
1316 const char __user *userbuf, 1304 const char __user *userbuf,
1317 size_t nbytes, loff_t *unused_ppos) 1305 size_t nbytes, loff_t *unused_ppos)
1318 { 1306 {
1319 char buffer[64]; 1307 char buffer[64];
1320 int retval = 0; 1308 int retval = 0;
1321 u64 val; 1309 u64 val;
1322 char *end; 1310 char *end;
1323 1311
1324 if (!nbytes) 1312 if (!nbytes)
1325 return -EINVAL; 1313 return -EINVAL;
1326 if (nbytes >= sizeof(buffer)) 1314 if (nbytes >= sizeof(buffer))
1327 return -E2BIG; 1315 return -E2BIG;
1328 if (copy_from_user(buffer, userbuf, nbytes)) 1316 if (copy_from_user(buffer, userbuf, nbytes))
1329 return -EFAULT; 1317 return -EFAULT;
1330 1318
1331 buffer[nbytes] = 0; /* nul-terminate */ 1319 buffer[nbytes] = 0; /* nul-terminate */
1332 strstrip(buffer); 1320 strstrip(buffer);
1333 val = simple_strtoull(buffer, &end, 0); 1321 val = simple_strtoull(buffer, &end, 0);
1334 if (*end) 1322 if (*end)
1335 return -EINVAL; 1323 return -EINVAL;
1336 1324
1337 /* Pass to subsystem */ 1325 /* Pass to subsystem */
1338 retval = cft->write_u64(cgrp, cft, val); 1326 retval = cft->write_u64(cgrp, cft, val);
1339 if (!retval) 1327 if (!retval)
1340 retval = nbytes; 1328 retval = nbytes;
1341 return retval; 1329 return retval;
1342 } 1330 }
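
This handler backs any control file whose cftype supplies write_u64 (and cgroup_read_u64 below does the same for read_u64). A hypothetical subsystem file accepting only 0 or 1 might look like:

	static u64 foo_enable_read(struct cgroup *cgrp, struct cftype *cft)
	{
		return 0;	/* hypothetical: report per-cgroup state */
	}

	static int foo_enable_write(struct cgroup *cgrp, struct cftype *cft,
				    u64 val)
	{
		if (val > 1)
			return -EINVAL;
		/* hypothetical: update per-cgroup state here */
		return 0;
	}

	static struct cftype cft_foo_enable = {
		.name = "foo.enable",
		.read_u64 = foo_enable_read,
		.write_u64 = foo_enable_write,
	};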
1343 1331
1344 static ssize_t cgroup_common_file_write(struct cgroup *cgrp, 1332 static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
1345 struct cftype *cft, 1333 struct cftype *cft,
1346 struct file *file, 1334 struct file *file,
1347 const char __user *userbuf, 1335 const char __user *userbuf,
1348 size_t nbytes, loff_t *unused_ppos) 1336 size_t nbytes, loff_t *unused_ppos)
1349 { 1337 {
1350 enum cgroup_filetype type = cft->private; 1338 enum cgroup_filetype type = cft->private;
1351 char *buffer; 1339 char *buffer;
1352 int retval = 0; 1340 int retval = 0;
1353 1341
1354 if (nbytes >= PATH_MAX) 1342 if (nbytes >= PATH_MAX)
1355 return -E2BIG; 1343 return -E2BIG;
1356 1344
1357 /* +1 for nul-terminator */ 1345 /* +1 for nul-terminator */
1358 buffer = kmalloc(nbytes + 1, GFP_KERNEL); 1346 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
1359 if (buffer == NULL) 1347 if (buffer == NULL)
1360 return -ENOMEM; 1348 return -ENOMEM;
1361 1349
1362 if (copy_from_user(buffer, userbuf, nbytes)) { 1350 if (copy_from_user(buffer, userbuf, nbytes)) {
1363 retval = -EFAULT; 1351 retval = -EFAULT;
1364 goto out1; 1352 goto out1;
1365 } 1353 }
1366 buffer[nbytes] = 0; /* nul-terminate */ 1354 buffer[nbytes] = 0; /* nul-terminate */
1367 strstrip(buffer); /* strip -just- trailing whitespace */ 1355 strstrip(buffer); /* strip -just- trailing whitespace */
1368 1356
1369 mutex_lock(&cgroup_mutex); 1357 mutex_lock(&cgroup_mutex);
1370 1358
1371 /* 1359 /*
1372 * This was already checked for in cgroup_file_write(), but 1360 * This was already checked for in cgroup_file_write(), but
1373 * check again now we're holding cgroup_mutex. 1361 * check again now we're holding cgroup_mutex.
1374 */ 1362 */
1375 if (cgroup_is_removed(cgrp)) { 1363 if (cgroup_is_removed(cgrp)) {
1376 retval = -ENODEV; 1364 retval = -ENODEV;
1377 goto out2; 1365 goto out2;
1378 } 1366 }
1379 1367
1380 switch (type) { 1368 switch (type) {
1381 case FILE_TASKLIST: 1369 case FILE_TASKLIST:
1382 retval = attach_task_by_pid(cgrp, buffer); 1370 retval = attach_task_by_pid(cgrp, buffer);
1383 break; 1371 break;
1384 case FILE_NOTIFY_ON_RELEASE: 1372 case FILE_NOTIFY_ON_RELEASE:
1385 clear_bit(CGRP_RELEASABLE, &cgrp->flags); 1373 clear_bit(CGRP_RELEASABLE, &cgrp->flags);
1386 if (simple_strtoul(buffer, NULL, 10) != 0) 1374 if (simple_strtoul(buffer, NULL, 10) != 0)
1387 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 1375 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
1388 else 1376 else
1389 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 1377 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
1390 break; 1378 break;
1391 case FILE_RELEASE_AGENT: 1379 case FILE_RELEASE_AGENT:
1392 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); 1380 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
1393 strcpy(cgrp->root->release_agent_path, buffer); 1381 strcpy(cgrp->root->release_agent_path, buffer);
1394 break; 1382 break;
1395 default: 1383 default:
1396 retval = -EINVAL; 1384 retval = -EINVAL;
1397 goto out2; 1385 goto out2;
1398 } 1386 }
1399 1387
1400 if (retval == 0) 1388 if (retval == 0)
1401 retval = nbytes; 1389 retval = nbytes;
1402 out2: 1390 out2:
1403 mutex_unlock(&cgroup_mutex); 1391 mutex_unlock(&cgroup_mutex);
1404 out1: 1392 out1:
1405 kfree(buffer); 1393 kfree(buffer);
1406 return retval; 1394 return retval;
1407 } 1395 }
1408 1396
1409 static ssize_t cgroup_file_write(struct file *file, const char __user *buf, 1397 static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1410 size_t nbytes, loff_t *ppos) 1398 size_t nbytes, loff_t *ppos)
1411 { 1399 {
1412 struct cftype *cft = __d_cft(file->f_dentry); 1400 struct cftype *cft = __d_cft(file->f_dentry);
1413 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 1401 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1414 1402
1415 if (!cft || cgroup_is_removed(cgrp)) 1403 if (!cft || cgroup_is_removed(cgrp))
1416 return -ENODEV; 1404 return -ENODEV;
1417 if (cft->write) 1405 if (cft->write)
1418 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 1406 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
1419 if (cft->write_u64) 1407 if (cft->write_u64)
1420 return cgroup_write_u64(cgrp, cft, file, buf, nbytes, ppos); 1408 return cgroup_write_u64(cgrp, cft, file, buf, nbytes, ppos);
1421 return -EINVAL; 1409 return -EINVAL;
1422 } 1410 }
1423 1411
1424 static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, 1412 static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
1425 struct file *file, 1413 struct file *file,
1426 char __user *buf, size_t nbytes, 1414 char __user *buf, size_t nbytes,
1427 loff_t *ppos) 1415 loff_t *ppos)
1428 { 1416 {
1429 char tmp[64]; 1417 char tmp[64];
1430 u64 val = cft->read_u64(cgrp, cft); 1418 u64 val = cft->read_u64(cgrp, cft);
1431 int len = sprintf(tmp, "%llu\n", (unsigned long long) val); 1419 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
1432 1420
1433 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 1421 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
1434 } 1422 }
1435 1423
1436 static ssize_t cgroup_common_file_read(struct cgroup *cgrp, 1424 static ssize_t cgroup_common_file_read(struct cgroup *cgrp,
1437 struct cftype *cft, 1425 struct cftype *cft,
1438 struct file *file, 1426 struct file *file,
1439 char __user *buf, 1427 char __user *buf,
1440 size_t nbytes, loff_t *ppos) 1428 size_t nbytes, loff_t *ppos)
1441 { 1429 {
1442 enum cgroup_filetype type = cft->private; 1430 enum cgroup_filetype type = cft->private;
1443 char *page; 1431 char *page;
1444 ssize_t retval = 0; 1432 ssize_t retval = 0;
1445 char *s; 1433 char *s;
1446 1434
1447 if (!(page = (char *)__get_free_page(GFP_KERNEL))) 1435 if (!(page = (char *)__get_free_page(GFP_KERNEL)))
1448 return -ENOMEM; 1436 return -ENOMEM;
1449 1437
1450 s = page; 1438 s = page;
1451 1439
1452 switch (type) { 1440 switch (type) {
1453 case FILE_RELEASE_AGENT: 1441 case FILE_RELEASE_AGENT:
1454 { 1442 {
1455 struct cgroupfs_root *root; 1443 struct cgroupfs_root *root;
1456 size_t n; 1444 size_t n;
1457 mutex_lock(&cgroup_mutex); 1445 mutex_lock(&cgroup_mutex);
1458 root = cgrp->root; 1446 root = cgrp->root;
1459 n = strnlen(root->release_agent_path, 1447 n = strnlen(root->release_agent_path,
1460 sizeof(root->release_agent_path)); 1448 sizeof(root->release_agent_path));
1461 n = min(n, (size_t) PAGE_SIZE); 1449 n = min(n, (size_t) PAGE_SIZE);
1462 strncpy(s, root->release_agent_path, n); 1450 strncpy(s, root->release_agent_path, n);
1463 mutex_unlock(&cgroup_mutex); 1451 mutex_unlock(&cgroup_mutex);
1464 s += n; 1452 s += n;
1465 break; 1453 break;
1466 } 1454 }
1467 default: 1455 default:
1468 retval = -EINVAL; 1456 retval = -EINVAL;
1469 goto out; 1457 goto out;
1470 } 1458 }
1471 *s++ = '\n'; 1459 *s++ = '\n';
1472 1460
1473 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); 1461 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
1474 out: 1462 out:
1475 free_page((unsigned long)page); 1463 free_page((unsigned long)page);
1476 return retval; 1464 return retval;
1477 } 1465 }
1478 1466
1479 static ssize_t cgroup_file_read(struct file *file, char __user *buf, 1467 static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1480 size_t nbytes, loff_t *ppos) 1468 size_t nbytes, loff_t *ppos)
1481 { 1469 {
1482 struct cftype *cft = __d_cft(file->f_dentry); 1470 struct cftype *cft = __d_cft(file->f_dentry);
1483 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 1471 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1484 1472
1485 if (!cft || cgroup_is_removed(cgrp)) 1473 if (!cft || cgroup_is_removed(cgrp))
1486 return -ENODEV; 1474 return -ENODEV;
1487 1475
1488 if (cft->read) 1476 if (cft->read)
1489 return cft->read(cgrp, cft, file, buf, nbytes, ppos); 1477 return cft->read(cgrp, cft, file, buf, nbytes, ppos);
1490 if (cft->read_u64) 1478 if (cft->read_u64)
1491 return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); 1479 return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
1492 return -EINVAL; 1480 return -EINVAL;
1493 } 1481 }
1494 1482
1495 /* 1483 /*
1496 * seqfile ops/methods for returning structured data. Currently just 1484 * seqfile ops/methods for returning structured data. Currently just
1497 * supports string->u64 maps, but can be extended in the future. 1485 * supports string->u64 maps, but can be extended in the future.
1498 */ 1486 */
1499 1487
1500 struct cgroup_seqfile_state { 1488 struct cgroup_seqfile_state {
1501 struct cftype *cft; 1489 struct cftype *cft;
1502 struct cgroup *cgroup; 1490 struct cgroup *cgroup;
1503 }; 1491 };
1504 1492
1505 static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) 1493 static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
1506 { 1494 {
1507 struct seq_file *sf = cb->state; 1495 struct seq_file *sf = cb->state;
1508 return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); 1496 return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
1509 } 1497 }
1510 1498
1511 static int cgroup_seqfile_show(struct seq_file *m, void *arg) 1499 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
1512 { 1500 {
1513 struct cgroup_seqfile_state *state = m->private; 1501 struct cgroup_seqfile_state *state = m->private;
1514 struct cftype *cft = state->cft; 1502 struct cftype *cft = state->cft;
1515 struct cgroup_map_cb cb = { 1503 struct cgroup_map_cb cb = {
1516 .fill = cgroup_map_add, 1504 .fill = cgroup_map_add,
1517 .state = m, 1505 .state = m,
1518 }; 1506 };
1519 return cft->read_map(state->cgroup, cft, &cb); 1507 return cft->read_map(state->cgroup, cft, &cb);
1520 } 1508 }
1521 1509
1522 int cgroup_seqfile_release(struct inode *inode, struct file *file) 1510 int cgroup_seqfile_release(struct inode *inode, struct file *file)
1523 { 1511 {
1524 struct seq_file *seq = file->private_data; 1512 struct seq_file *seq = file->private_data;
1525 kfree(seq->private); 1513 kfree(seq->private);
1526 return single_release(inode, file); 1514 return single_release(inode, file);
1527 } 1515 }
1528 1516
1529 static struct file_operations cgroup_seqfile_operations = { 1517 static struct file_operations cgroup_seqfile_operations = {
1530 .read = seq_read, 1518 .read = seq_read,
1531 .llseek = seq_lseek, 1519 .llseek = seq_lseek,
1532 .release = cgroup_seqfile_release, 1520 .release = cgroup_seqfile_release,
1533 }; 1521 };
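
A subsystem opts into this seqfile path simply by providing read_map in its cftype; cgroup_file_open() below then swaps in cgroup_seqfile_operations, and each cb->fill() call becomes one "key value" line via cgroup_map_add(). A hypothetical example:

	static int foo_stat_read_map(struct cgroup *cgrp, struct cftype *cft,
				     struct cgroup_map_cb *cb)
	{
		cb->fill(cb, "hits", 42);	/* shown as "hits 42" */
		cb->fill(cb, "misses", 7);	/* shown as "misses 7" */
		return 0;
	}

	static struct cftype cft_foo_stat = {
		.name = "foo.stat",
		.read_map = foo_stat_read_map,
	};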
1534 1522
1535 static int cgroup_file_open(struct inode *inode, struct file *file) 1523 static int cgroup_file_open(struct inode *inode, struct file *file)
1536 { 1524 {
1537 int err; 1525 int err;
1538 struct cftype *cft; 1526 struct cftype *cft;
1539 1527
1540 err = generic_file_open(inode, file); 1528 err = generic_file_open(inode, file);
1541 if (err) 1529 if (err)
1542 return err; 1530 return err;
1543 1531
1544 cft = __d_cft(file->f_dentry); 1532 cft = __d_cft(file->f_dentry);
1545 if (!cft) 1533 if (!cft)
1546 return -ENODEV; 1534 return -ENODEV;
1547 if (cft->read_map) { 1535 if (cft->read_map) {
1548 struct cgroup_seqfile_state *state = 1536 struct cgroup_seqfile_state *state =
1549 kzalloc(sizeof(*state), GFP_USER); 1537 kzalloc(sizeof(*state), GFP_USER);
1550 if (!state) 1538 if (!state)
1551 return -ENOMEM; 1539 return -ENOMEM;
1552 state->cft = cft; 1540 state->cft = cft;
1553 state->cgroup = __d_cgrp(file->f_dentry->d_parent); 1541 state->cgroup = __d_cgrp(file->f_dentry->d_parent);
1554 file->f_op = &cgroup_seqfile_operations; 1542 file->f_op = &cgroup_seqfile_operations;
1555 err = single_open(file, cgroup_seqfile_show, state); 1543 err = single_open(file, cgroup_seqfile_show, state);
1556 if (err < 0) 1544 if (err < 0)
1557 kfree(state); 1545 kfree(state);
1558 } else if (cft->open) 1546 } else if (cft->open)
1559 err = cft->open(inode, file); 1547 err = cft->open(inode, file);
1560 else 1548 else
1561 err = 0; 1549 err = 0;
1562 1550
1563 return err; 1551 return err;
1564 } 1552 }
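
For structured files, a subsystem supplies read_map() instead, and cgroup_file_open() above transparently swaps in the seqfile operations. A sketch of such a callback, assuming a hypothetical "ex_" subsystem; only cgroup_map_cb and its fill() hook come from the framework shown here:

	static int ex_stats_read_map(struct cgroup *cgrp, struct cftype *cft,
				     struct cgroup_map_cb *cb)
	{
		/* each fill() call becomes one "key value" output line */
		cb->fill(cb, "widgets", 42);
		cb->fill(cb, "gadgets", 7);
		return 0;
	}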
1565 1553
1566 static int cgroup_file_release(struct inode *inode, struct file *file) 1554 static int cgroup_file_release(struct inode *inode, struct file *file)
1567 { 1555 {
1568 struct cftype *cft = __d_cft(file->f_dentry); 1556 struct cftype *cft = __d_cft(file->f_dentry);
1569 if (cft->release) 1557 if (cft->release)
1570 return cft->release(inode, file); 1558 return cft->release(inode, file);
1571 return 0; 1559 return 0;
1572 } 1560 }
1573 1561
1574 /* 1562 /*
1575 * cgroup_rename - Only allow simple rename of directories in place. 1563 * cgroup_rename - Only allow simple rename of directories in place.
1576 */ 1564 */
1577 static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, 1565 static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
1578 struct inode *new_dir, struct dentry *new_dentry) 1566 struct inode *new_dir, struct dentry *new_dentry)
1579 { 1567 {
1580 if (!S_ISDIR(old_dentry->d_inode->i_mode)) 1568 if (!S_ISDIR(old_dentry->d_inode->i_mode))
1581 return -ENOTDIR; 1569 return -ENOTDIR;
1582 if (new_dentry->d_inode) 1570 if (new_dentry->d_inode)
1583 return -EEXIST; 1571 return -EEXIST;
1584 if (old_dir != new_dir) 1572 if (old_dir != new_dir)
1585 return -EIO; 1573 return -EIO;
1586 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 1574 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1587 } 1575 }
1588 1576
1589 static struct file_operations cgroup_file_operations = { 1577 static struct file_operations cgroup_file_operations = {
1590 .read = cgroup_file_read, 1578 .read = cgroup_file_read,
1591 .write = cgroup_file_write, 1579 .write = cgroup_file_write,
1592 .llseek = generic_file_llseek, 1580 .llseek = generic_file_llseek,
1593 .open = cgroup_file_open, 1581 .open = cgroup_file_open,
1594 .release = cgroup_file_release, 1582 .release = cgroup_file_release,
1595 }; 1583 };
1596 1584
1597 static struct inode_operations cgroup_dir_inode_operations = { 1585 static struct inode_operations cgroup_dir_inode_operations = {
1598 .lookup = simple_lookup, 1586 .lookup = simple_lookup,
1599 .mkdir = cgroup_mkdir, 1587 .mkdir = cgroup_mkdir,
1600 .rmdir = cgroup_rmdir, 1588 .rmdir = cgroup_rmdir,
1601 .rename = cgroup_rename, 1589 .rename = cgroup_rename,
1602 }; 1590 };
1603 1591
1604 static int cgroup_create_file(struct dentry *dentry, int mode, 1592 static int cgroup_create_file(struct dentry *dentry, int mode,
1605 struct super_block *sb) 1593 struct super_block *sb)
1606 { 1594 {
1607 static struct dentry_operations cgroup_dops = { 1595 static struct dentry_operations cgroup_dops = {
1608 .d_iput = cgroup_diput, 1596 .d_iput = cgroup_diput,
1609 }; 1597 };
1610 1598
1611 struct inode *inode; 1599 struct inode *inode;
1612 1600
1613 if (!dentry) 1601 if (!dentry)
1614 return -ENOENT; 1602 return -ENOENT;
1615 if (dentry->d_inode) 1603 if (dentry->d_inode)
1616 return -EEXIST; 1604 return -EEXIST;
1617 1605
1618 inode = cgroup_new_inode(mode, sb); 1606 inode = cgroup_new_inode(mode, sb);
1619 if (!inode) 1607 if (!inode)
1620 return -ENOMEM; 1608 return -ENOMEM;
1621 1609
1622 if (S_ISDIR(mode)) { 1610 if (S_ISDIR(mode)) {
1623 inode->i_op = &cgroup_dir_inode_operations; 1611 inode->i_op = &cgroup_dir_inode_operations;
1624 inode->i_fop = &simple_dir_operations; 1612 inode->i_fop = &simple_dir_operations;
1625 1613
1626 /* start off with i_nlink == 2 (for "." entry) */ 1614 /* start off with i_nlink == 2 (for "." entry) */
1627 inc_nlink(inode); 1615 inc_nlink(inode);
1628 1616
1629 /* start with the directory inode held, so that we can 1617 /* start with the directory inode held, so that we can
1630 * populate it without racing with another mkdir */ 1618 * populate it without racing with another mkdir */
1631 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 1619 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
1632 } else if (S_ISREG(mode)) { 1620 } else if (S_ISREG(mode)) {
1633 inode->i_size = 0; 1621 inode->i_size = 0;
1634 inode->i_fop = &cgroup_file_operations; 1622 inode->i_fop = &cgroup_file_operations;
1635 } 1623 }
1636 dentry->d_op = &cgroup_dops; 1624 dentry->d_op = &cgroup_dops;
1637 d_instantiate(dentry, inode); 1625 d_instantiate(dentry, inode);
1638 dget(dentry); /* Extra count - pin the dentry in core */ 1626 dget(dentry); /* Extra count - pin the dentry in core */
1639 return 0; 1627 return 0;
1640 } 1628 }
1641 1629
1642 /* 1630 /*
1643 * cgroup_create_dir - create a directory for an object. 1631 * cgroup_create_dir - create a directory for an object.
1644 * @cgrp: the cgroup we create the directory for. It must have a valid 1632 * @cgrp: the cgroup we create the directory for. It must have a valid
 1645 * ->parent field; its ->dentry field is filled in here. 1633 * ->parent field; its ->dentry field is filled in here.
1646 * @dentry: dentry of the new cgroup 1634 * @dentry: dentry of the new cgroup
1647 * @mode: mode to set on new directory. 1635 * @mode: mode to set on new directory.
1648 */ 1636 */
1649 static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, 1637 static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
1650 int mode) 1638 int mode)
1651 { 1639 {
1652 struct dentry *parent; 1640 struct dentry *parent;
1653 int error = 0; 1641 int error = 0;
1654 1642
1655 parent = cgrp->parent->dentry; 1643 parent = cgrp->parent->dentry;
1656 error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb); 1644 error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
1657 if (!error) { 1645 if (!error) {
1658 dentry->d_fsdata = cgrp; 1646 dentry->d_fsdata = cgrp;
1659 inc_nlink(parent->d_inode); 1647 inc_nlink(parent->d_inode);
1660 cgrp->dentry = dentry; 1648 cgrp->dentry = dentry;
1661 dget(dentry); 1649 dget(dentry);
1662 } 1650 }
1663 dput(dentry); 1651 dput(dentry);
1664 1652
1665 return error; 1653 return error;
1666 } 1654 }
1667 1655
1668 int cgroup_add_file(struct cgroup *cgrp, 1656 int cgroup_add_file(struct cgroup *cgrp,
1669 struct cgroup_subsys *subsys, 1657 struct cgroup_subsys *subsys,
1670 const struct cftype *cft) 1658 const struct cftype *cft)
1671 { 1659 {
1672 struct dentry *dir = cgrp->dentry; 1660 struct dentry *dir = cgrp->dentry;
1673 struct dentry *dentry; 1661 struct dentry *dentry;
1674 int error; 1662 int error;
1675 1663
1676 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 1664 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
1677 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 1665 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
1678 strcpy(name, subsys->name); 1666 strcpy(name, subsys->name);
1679 strcat(name, "."); 1667 strcat(name, ".");
1680 } 1668 }
1681 strcat(name, cft->name); 1669 strcat(name, cft->name);
1682 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); 1670 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
1683 dentry = lookup_one_len(name, dir, strlen(name)); 1671 dentry = lookup_one_len(name, dir, strlen(name));
1684 if (!IS_ERR(dentry)) { 1672 if (!IS_ERR(dentry)) {
1685 error = cgroup_create_file(dentry, 0644 | S_IFREG, 1673 error = cgroup_create_file(dentry, 0644 | S_IFREG,
1686 cgrp->root->sb); 1674 cgrp->root->sb);
1687 if (!error) 1675 if (!error)
1688 dentry->d_fsdata = (void *)cft; 1676 dentry->d_fsdata = (void *)cft;
1689 dput(dentry); 1677 dput(dentry);
1690 } else 1678 } else
1691 error = PTR_ERR(dentry); 1679 error = PTR_ERR(dentry);
1692 return error; 1680 return error;
1693 } 1681 }
1694 1682
1695 int cgroup_add_files(struct cgroup *cgrp, 1683 int cgroup_add_files(struct cgroup *cgrp,
1696 struct cgroup_subsys *subsys, 1684 struct cgroup_subsys *subsys,
1697 const struct cftype cft[], 1685 const struct cftype cft[],
1698 int count) 1686 int count)
1699 { 1687 {
1700 int i, err; 1688 int i, err;
1701 for (i = 0; i < count; i++) { 1689 for (i = 0; i < count; i++) {
1702 err = cgroup_add_file(cgrp, subsys, &cft[i]); 1690 err = cgroup_add_file(cgrp, subsys, &cft[i]);
1703 if (err) 1691 if (err)
1704 return err; 1692 return err;
1705 } 1693 }
1706 return 0; 1694 return 0;
1707 } 1695 }
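
Subsystems typically call cgroup_add_files() from their populate() callback. A sketch, reusing the hypothetical ex_task_count_read() from the earlier example:

	static struct cftype ex_files[] = {
		{
			.name = "count",
			.read_u64 = ex_task_count_read,
		},
	};

	static int ex_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
	{
		/* appears as "<ss->name>.count" unless the hierarchy
		 * was mounted with the noprefix option */
		return cgroup_add_files(cgrp, ss, ex_files,
					ARRAY_SIZE(ex_files));
	}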
1708 1696
1709 /** 1697 /**
1710 * cgroup_task_count - count the number of tasks in a cgroup. 1698 * cgroup_task_count - count the number of tasks in a cgroup.
1711 * @cgrp: the cgroup in question 1699 * @cgrp: the cgroup in question
1712 * 1700 *
1713 * Return the number of tasks in the cgroup. 1701 * Return the number of tasks in the cgroup.
1714 */ 1702 */
1715 int cgroup_task_count(const struct cgroup *cgrp) 1703 int cgroup_task_count(const struct cgroup *cgrp)
1716 { 1704 {
1717 int count = 0; 1705 int count = 0;
1718 struct list_head *l; 1706 struct list_head *l;
1719 1707
1720 read_lock(&css_set_lock); 1708 read_lock(&css_set_lock);
1721 l = cgrp->css_sets.next; 1709 l = cgrp->css_sets.next;
1722 while (l != &cgrp->css_sets) { 1710 while (l != &cgrp->css_sets) {
1723 struct cg_cgroup_link *link = 1711 struct cg_cgroup_link *link =
1724 list_entry(l, struct cg_cgroup_link, cgrp_link_list); 1712 list_entry(l, struct cg_cgroup_link, cgrp_link_list);
1725 count += atomic_read(&link->cg->ref.refcount); 1713 count += atomic_read(&link->cg->ref.refcount);
1726 l = l->next; 1714 l = l->next;
1727 } 1715 }
1728 read_unlock(&css_set_lock); 1716 read_unlock(&css_set_lock);
1729 return count; 1717 return count;
1730 } 1718 }
1731 1719
1732 /* 1720 /*
1733 * Advance a list_head iterator. The iterator should be positioned at 1721 * Advance a list_head iterator. The iterator should be positioned at
1734 * the start of a css_set 1722 * the start of a css_set
1735 */ 1723 */
1736 static void cgroup_advance_iter(struct cgroup *cgrp, 1724 static void cgroup_advance_iter(struct cgroup *cgrp,
1737 struct cgroup_iter *it) 1725 struct cgroup_iter *it)
1738 { 1726 {
1739 struct list_head *l = it->cg_link; 1727 struct list_head *l = it->cg_link;
1740 struct cg_cgroup_link *link; 1728 struct cg_cgroup_link *link;
1741 struct css_set *cg; 1729 struct css_set *cg;
1742 1730
1743 /* Advance to the next non-empty css_set */ 1731 /* Advance to the next non-empty css_set */
1744 do { 1732 do {
1745 l = l->next; 1733 l = l->next;
1746 if (l == &cgrp->css_sets) { 1734 if (l == &cgrp->css_sets) {
1747 it->cg_link = NULL; 1735 it->cg_link = NULL;
1748 return; 1736 return;
1749 } 1737 }
1750 link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); 1738 link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
1751 cg = link->cg; 1739 cg = link->cg;
1752 } while (list_empty(&cg->tasks)); 1740 } while (list_empty(&cg->tasks));
1753 it->cg_link = l; 1741 it->cg_link = l;
1754 it->task = cg->tasks.next; 1742 it->task = cg->tasks.next;
1755 } 1743 }
1756 1744
1757 /* 1745 /*
1758 * To reduce the fork() overhead for systems that are not actually 1746 * To reduce the fork() overhead for systems that are not actually
1759 * using their cgroups capability, we don't maintain the lists running 1747 * using their cgroups capability, we don't maintain the lists running
1760 * through each css_set to its tasks until we see the list actually 1748 * through each css_set to its tasks until we see the list actually
1761 * used - in other words after the first call to cgroup_iter_start(). 1749 * used - in other words after the first call to cgroup_iter_start().
1762 * 1750 *
1763 * The tasklist_lock is not held here, as do_each_thread() and 1751 * The tasklist_lock is not held here, as do_each_thread() and
1764 * while_each_thread() are protected by RCU. 1752 * while_each_thread() are protected by RCU.
1765 */ 1753 */
1766 static void cgroup_enable_task_cg_lists(void) 1754 static void cgroup_enable_task_cg_lists(void)
1767 { 1755 {
1768 struct task_struct *p, *g; 1756 struct task_struct *p, *g;
1769 write_lock(&css_set_lock); 1757 write_lock(&css_set_lock);
1770 use_task_css_set_links = 1; 1758 use_task_css_set_links = 1;
1771 do_each_thread(g, p) { 1759 do_each_thread(g, p) {
1772 task_lock(p); 1760 task_lock(p);
1773 /* 1761 /*
1774 * We should check if the process is exiting, otherwise 1762 * We should check if the process is exiting, otherwise
 1775 * it will race with cgroup_exit(): the list entry 1763 * it will race with cgroup_exit(): the list entry
 1776 * won't be deleted even though the process has exited. 1764 * won't be deleted even though the process has exited.
1777 */ 1765 */
1778 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) 1766 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
1779 list_add(&p->cg_list, &p->cgroups->tasks); 1767 list_add(&p->cg_list, &p->cgroups->tasks);
1780 task_unlock(p); 1768 task_unlock(p);
1781 } while_each_thread(g, p); 1769 } while_each_thread(g, p);
1782 write_unlock(&css_set_lock); 1770 write_unlock(&css_set_lock);
1783 } 1771 }
1784 1772
1785 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 1773 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
1786 { 1774 {
1787 /* 1775 /*
1788 * The first time anyone tries to iterate across a cgroup, 1776 * The first time anyone tries to iterate across a cgroup,
1789 * we need to enable the list linking each css_set to its 1777 * we need to enable the list linking each css_set to its
1790 * tasks, and fix up all existing tasks. 1778 * tasks, and fix up all existing tasks.
1791 */ 1779 */
1792 if (!use_task_css_set_links) 1780 if (!use_task_css_set_links)
1793 cgroup_enable_task_cg_lists(); 1781 cgroup_enable_task_cg_lists();
1794 1782
1795 read_lock(&css_set_lock); 1783 read_lock(&css_set_lock);
1796 it->cg_link = &cgrp->css_sets; 1784 it->cg_link = &cgrp->css_sets;
1797 cgroup_advance_iter(cgrp, it); 1785 cgroup_advance_iter(cgrp, it);
1798 } 1786 }
1799 1787
1800 struct task_struct *cgroup_iter_next(struct cgroup *cgrp, 1788 struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
1801 struct cgroup_iter *it) 1789 struct cgroup_iter *it)
1802 { 1790 {
1803 struct task_struct *res; 1791 struct task_struct *res;
1804 struct list_head *l = it->task; 1792 struct list_head *l = it->task;
1805 1793
 1806 /* If the iterator's cg_link is NULL, we have no tasks */ 1794 /* If the iterator's cg_link is NULL, we have no tasks */
1807 if (!it->cg_link) 1795 if (!it->cg_link)
1808 return NULL; 1796 return NULL;
1809 res = list_entry(l, struct task_struct, cg_list); 1797 res = list_entry(l, struct task_struct, cg_list);
1810 /* Advance iterator to find next entry */ 1798 /* Advance iterator to find next entry */
1811 l = l->next; 1799 l = l->next;
1812 if (l == &res->cgroups->tasks) { 1800 if (l == &res->cgroups->tasks) {
1813 /* We reached the end of this task list - move on to 1801 /* We reached the end of this task list - move on to
1814 * the next cg_cgroup_link */ 1802 * the next cg_cgroup_link */
1815 cgroup_advance_iter(cgrp, it); 1803 cgroup_advance_iter(cgrp, it);
1816 } else { 1804 } else {
1817 it->task = l; 1805 it->task = l;
1818 } 1806 }
1819 return res; 1807 return res;
1820 } 1808 }
1821 1809
1822 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 1810 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
1823 { 1811 {
1824 read_unlock(&css_set_lock); 1812 read_unlock(&css_set_lock);
1825 } 1813 }
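
Taken together, the three functions above form the task-iteration API. A sketch of the intended calling pattern; do_something_cheap() is a hypothetical stand-in:

	struct cgroup_iter it;
	struct task_struct *task;

	cgroup_iter_start(cgrp, &it);
	while ((task = cgroup_iter_next(cgrp, &it)))
		do_something_cheap(task);
	cgroup_iter_end(cgrp, &it);

Note that css_set_lock is read-held from start to end, so the loop body must be fast and must not sleep; callers that need to do real per-task work should use cgroup_scan_tasks() below instead.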
1826 1814
1827 static inline int started_after_time(struct task_struct *t1, 1815 static inline int started_after_time(struct task_struct *t1,
1828 struct timespec *time, 1816 struct timespec *time,
1829 struct task_struct *t2) 1817 struct task_struct *t2)
1830 { 1818 {
1831 int start_diff = timespec_compare(&t1->start_time, time); 1819 int start_diff = timespec_compare(&t1->start_time, time);
1832 if (start_diff > 0) { 1820 if (start_diff > 0) {
1833 return 1; 1821 return 1;
1834 } else if (start_diff < 0) { 1822 } else if (start_diff < 0) {
1835 return 0; 1823 return 0;
1836 } else { 1824 } else {
1837 /* 1825 /*
1838 * Arbitrarily, if two processes started at the same 1826 * Arbitrarily, if two processes started at the same
1839 * time, we'll say that the lower pointer value 1827 * time, we'll say that the lower pointer value
1840 * started first. Note that t2 may have exited by now 1828 * started first. Note that t2 may have exited by now
1841 * so this may not be a valid pointer any longer, but 1829 * so this may not be a valid pointer any longer, but
1842 * that's fine - it still serves to distinguish 1830 * that's fine - it still serves to distinguish
1843 * between two tasks started (effectively) simultaneously. 1831 * between two tasks started (effectively) simultaneously.
1844 */ 1832 */
1845 return t1 > t2; 1833 return t1 > t2;
1846 } 1834 }
1847 } 1835 }
1848 1836
1849 /* 1837 /*
1850 * This function is a callback from heap_insert() and is used to order 1838 * This function is a callback from heap_insert() and is used to order
1851 * the heap. 1839 * the heap.
1852 * In this case we order the heap in descending task start time. 1840 * In this case we order the heap in descending task start time.
1853 */ 1841 */
1854 static inline int started_after(void *p1, void *p2) 1842 static inline int started_after(void *p1, void *p2)
1855 { 1843 {
1856 struct task_struct *t1 = p1; 1844 struct task_struct *t1 = p1;
1857 struct task_struct *t2 = p2; 1845 struct task_struct *t2 = p2;
1858 return started_after_time(t1, &t2->start_time, t2); 1846 return started_after_time(t1, &t2->start_time, t2);
1859 } 1847 }
1860 1848
1861 /** 1849 /**
 1862 * cgroup_scan_tasks - iterate through all the tasks in a cgroup 1850 * cgroup_scan_tasks - iterate through all the tasks in a cgroup
1863 * @scan: struct cgroup_scanner containing arguments for the scan 1851 * @scan: struct cgroup_scanner containing arguments for the scan
1864 * 1852 *
1865 * Arguments include pointers to callback functions test_task() and 1853 * Arguments include pointers to callback functions test_task() and
1866 * process_task(). 1854 * process_task().
1867 * Iterate through all the tasks in a cgroup, calling test_task() for each, 1855 * Iterate through all the tasks in a cgroup, calling test_task() for each,
1868 * and if it returns true, call process_task() for it also. 1856 * and if it returns true, call process_task() for it also.
1869 * The test_task pointer may be NULL, meaning always true (select all tasks). 1857 * The test_task pointer may be NULL, meaning always true (select all tasks).
1870 * Effectively duplicates cgroup_iter_{start,next,end}() 1858 * Effectively duplicates cgroup_iter_{start,next,end}()
1871 * but does not lock css_set_lock for the call to process_task(). 1859 * but does not lock css_set_lock for the call to process_task().
1872 * The struct cgroup_scanner may be embedded in any structure of the caller's 1860 * The struct cgroup_scanner may be embedded in any structure of the caller's
1873 * creation. 1861 * creation.
1874 * It is guaranteed that process_task() will act on every task that 1862 * It is guaranteed that process_task() will act on every task that
1875 * is a member of the cgroup for the duration of this call. This 1863 * is a member of the cgroup for the duration of this call. This
1876 * function may or may not call process_task() for tasks that exit 1864 * function may or may not call process_task() for tasks that exit
1877 * or move to a different cgroup during the call, or are forked or 1865 * or move to a different cgroup during the call, or are forked or
1878 * move into the cgroup during the call. 1866 * move into the cgroup during the call.
1879 * 1867 *
1880 * Note that test_task() may be called with locks held, and may in some 1868 * Note that test_task() may be called with locks held, and may in some
1881 * situations be called multiple times for the same task, so it should 1869 * situations be called multiple times for the same task, so it should
1882 * be cheap. 1870 * be cheap.
1883 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been 1871 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
1884 * pre-allocated and will be used for heap operations (and its "gt" member will 1872 * pre-allocated and will be used for heap operations (and its "gt" member will
1885 * be overwritten), else a temporary heap will be used (allocation of which 1873 * be overwritten), else a temporary heap will be used (allocation of which
1886 * may cause this function to fail). 1874 * may cause this function to fail).
1887 */ 1875 */
1888 int cgroup_scan_tasks(struct cgroup_scanner *scan) 1876 int cgroup_scan_tasks(struct cgroup_scanner *scan)
1889 { 1877 {
1890 int retval, i; 1878 int retval, i;
1891 struct cgroup_iter it; 1879 struct cgroup_iter it;
1892 struct task_struct *p, *dropped; 1880 struct task_struct *p, *dropped;
1893 /* Never dereference latest_task, since it's not refcounted */ 1881 /* Never dereference latest_task, since it's not refcounted */
1894 struct task_struct *latest_task = NULL; 1882 struct task_struct *latest_task = NULL;
1895 struct ptr_heap tmp_heap; 1883 struct ptr_heap tmp_heap;
1896 struct ptr_heap *heap; 1884 struct ptr_heap *heap;
1897 struct timespec latest_time = { 0, 0 }; 1885 struct timespec latest_time = { 0, 0 };
1898 1886
1899 if (scan->heap) { 1887 if (scan->heap) {
1900 /* The caller supplied our heap and pre-allocated its memory */ 1888 /* The caller supplied our heap and pre-allocated its memory */
1901 heap = scan->heap; 1889 heap = scan->heap;
1902 heap->gt = &started_after; 1890 heap->gt = &started_after;
1903 } else { 1891 } else {
1904 /* We need to allocate our own heap memory */ 1892 /* We need to allocate our own heap memory */
1905 heap = &tmp_heap; 1893 heap = &tmp_heap;
1906 retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); 1894 retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
1907 if (retval) 1895 if (retval)
1908 /* cannot allocate the heap */ 1896 /* cannot allocate the heap */
1909 return retval; 1897 return retval;
1910 } 1898 }
1911 1899
1912 again: 1900 again:
1913 /* 1901 /*
1914 * Scan tasks in the cgroup, using the scanner's "test_task" callback 1902 * Scan tasks in the cgroup, using the scanner's "test_task" callback
1915 * to determine which are of interest, and using the scanner's 1903 * to determine which are of interest, and using the scanner's
1916 * "process_task" callback to process any of them that need an update. 1904 * "process_task" callback to process any of them that need an update.
1917 * Since we don't want to hold any locks during the task updates, 1905 * Since we don't want to hold any locks during the task updates,
1918 * gather tasks to be processed in a heap structure. 1906 * gather tasks to be processed in a heap structure.
1919 * The heap is sorted by descending task start time. 1907 * The heap is sorted by descending task start time.
1920 * If the statically-sized heap fills up, we overflow tasks that 1908 * If the statically-sized heap fills up, we overflow tasks that
1921 * started later, and in future iterations only consider tasks that 1909 * started later, and in future iterations only consider tasks that
1922 * started after the latest task in the previous pass. This 1910 * started after the latest task in the previous pass. This
1923 * guarantees forward progress and that we don't miss any tasks. 1911 * guarantees forward progress and that we don't miss any tasks.
1924 */ 1912 */
1925 heap->size = 0; 1913 heap->size = 0;
1926 cgroup_iter_start(scan->cg, &it); 1914 cgroup_iter_start(scan->cg, &it);
1927 while ((p = cgroup_iter_next(scan->cg, &it))) { 1915 while ((p = cgroup_iter_next(scan->cg, &it))) {
1928 /* 1916 /*
1929 * Only affect tasks that qualify per the caller's callback, 1917 * Only affect tasks that qualify per the caller's callback,
 1930 * if the caller provided one 1918 * if the caller provided one
1931 */ 1919 */
1932 if (scan->test_task && !scan->test_task(p, scan)) 1920 if (scan->test_task && !scan->test_task(p, scan))
1933 continue; 1921 continue;
1934 /* 1922 /*
1935 * Only process tasks that started after the last task 1923 * Only process tasks that started after the last task
1936 * we processed 1924 * we processed
1937 */ 1925 */
1938 if (!started_after_time(p, &latest_time, latest_task)) 1926 if (!started_after_time(p, &latest_time, latest_task))
1939 continue; 1927 continue;
1940 dropped = heap_insert(heap, p); 1928 dropped = heap_insert(heap, p);
1941 if (dropped == NULL) { 1929 if (dropped == NULL) {
1942 /* 1930 /*
1943 * The new task was inserted; the heap wasn't 1931 * The new task was inserted; the heap wasn't
1944 * previously full 1932 * previously full
1945 */ 1933 */
1946 get_task_struct(p); 1934 get_task_struct(p);
1947 } else if (dropped != p) { 1935 } else if (dropped != p) {
1948 /* 1936 /*
1949 * The new task was inserted, and pushed out a 1937 * The new task was inserted, and pushed out a
1950 * different task 1938 * different task
1951 */ 1939 */
1952 get_task_struct(p); 1940 get_task_struct(p);
1953 put_task_struct(dropped); 1941 put_task_struct(dropped);
1954 } 1942 }
1955 /* 1943 /*
1956 * Else the new task was newer than anything already in 1944 * Else the new task was newer than anything already in
1957 * the heap and wasn't inserted 1945 * the heap and wasn't inserted
1958 */ 1946 */
1959 } 1947 }
1960 cgroup_iter_end(scan->cg, &it); 1948 cgroup_iter_end(scan->cg, &it);
1961 1949
1962 if (heap->size) { 1950 if (heap->size) {
1963 for (i = 0; i < heap->size; i++) { 1951 for (i = 0; i < heap->size; i++) {
1964 struct task_struct *q = heap->ptrs[i]; 1952 struct task_struct *q = heap->ptrs[i];
1965 if (i == 0) { 1953 if (i == 0) {
1966 latest_time = q->start_time; 1954 latest_time = q->start_time;
1967 latest_task = q; 1955 latest_task = q;
1968 } 1956 }
1969 /* Process the task per the caller's callback */ 1957 /* Process the task per the caller's callback */
1970 scan->process_task(q, scan); 1958 scan->process_task(q, scan);
1971 put_task_struct(q); 1959 put_task_struct(q);
1972 } 1960 }
1973 /* 1961 /*
1974 * If we had to process any tasks at all, scan again 1962 * If we had to process any tasks at all, scan again
1975 * in case some of them were in the middle of forking 1963 * in case some of them were in the middle of forking
1976 * children that didn't get processed. 1964 * children that didn't get processed.
1977 * Not the most efficient way to do it, but it avoids 1965 * Not the most efficient way to do it, but it avoids
1978 * having to take callback_mutex in the fork path 1966 * having to take callback_mutex in the fork path
1979 */ 1967 */
1980 goto again; 1968 goto again;
1981 } 1969 }
1982 if (heap == &tmp_heap) 1970 if (heap == &tmp_heap)
1983 heap_free(&tmp_heap); 1971 heap_free(&tmp_heap);
1984 return 0; 1972 return 0;
1985 } 1973 }
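
A sketch of how a caller might use the scanner, embedding the struct cgroup_scanner in a context structure of its own creation as the comment above suggests (all "ex_" names are hypothetical):

	struct ex_scan_ctx {
		struct cgroup_scanner scan;
		int count;
	};

	static void ex_count_one(struct task_struct *p,
				 struct cgroup_scanner *scan)
	{
		struct ex_scan_ctx *ctx =
			container_of(scan, struct ex_scan_ctx, scan);
		ctx->count++;
	}

	static int ex_count_tasks(struct cgroup *cgrp)
	{
		struct ex_scan_ctx ctx = {
			.scan = {
				.cg = cgrp,
				.test_task = NULL,	/* select all tasks */
				.process_task = ex_count_one,
				.heap = NULL,	/* use a temporary heap */
			},
		};
		int ret = cgroup_scan_tasks(&ctx.scan);
		return ret ? ret : ctx.count;
	}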
1986 1974
1987 /* 1975 /*
1988 * Stuff for reading the 'tasks' file. 1976 * Stuff for reading the 'tasks' file.
1989 * 1977 *
1990 * Reading this file can return large amounts of data if a cgroup has 1978 * Reading this file can return large amounts of data if a cgroup has
1991 * *lots* of attached tasks. So it may need several calls to read(), 1979 * *lots* of attached tasks. So it may need several calls to read(),
1992 * but we cannot guarantee that the information we produce is correct 1980 * but we cannot guarantee that the information we produce is correct
1993 * unless we produce it entirely atomically. 1981 * unless we produce it entirely atomically.
1994 * 1982 *
 1995 * Upon tasks file open(), a struct ctr_struct is allocated; it 1983 * Upon tasks file open(), a struct ctr_struct is allocated; it
 1996 * holds a pointer to an array (also allocated here). The struct 1984 * holds a pointer to an array (also allocated here). The struct
1997 * ctr_struct * is stored in file->private_data. Its resources will 1985 * ctr_struct * is stored in file->private_data. Its resources will
1998 * be freed by release() when the file is closed. The array is used 1986 * be freed by release() when the file is closed. The array is used
1999 * to sprintf the PIDs and then used by read(). 1987 * to sprintf the PIDs and then used by read().
2000 */ 1988 */
2001 struct ctr_struct { 1989 struct ctr_struct {
2002 char *buf; 1990 char *buf;
2003 int bufsz; 1991 int bufsz;
2004 }; 1992 };
2005 1993
2006 /* 1994 /*
2007 * Load into 'pidarray' up to 'npids' of the tasks using cgroup 1995 * Load into 'pidarray' up to 'npids' of the tasks using cgroup
2008 * 'cgrp'. Return actual number of pids loaded. No need to 1996 * 'cgrp'. Return actual number of pids loaded. No need to
2009 * task_lock(p) when reading out p->cgroup, since we're in an RCU 1997 * task_lock(p) when reading out p->cgroup, since we're in an RCU
2010 * read section, so the css_set can't go away, and is 1998 * read section, so the css_set can't go away, and is
2011 * immutable after creation. 1999 * immutable after creation.
2012 */ 2000 */
2013 static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) 2001 static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
2014 { 2002 {
2015 int n = 0; 2003 int n = 0;
2016 struct cgroup_iter it; 2004 struct cgroup_iter it;
2017 struct task_struct *tsk; 2005 struct task_struct *tsk;
2018 cgroup_iter_start(cgrp, &it); 2006 cgroup_iter_start(cgrp, &it);
2019 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2007 while ((tsk = cgroup_iter_next(cgrp, &it))) {
2020 if (unlikely(n == npids)) 2008 if (unlikely(n == npids))
2021 break; 2009 break;
2022 pidarray[n++] = task_pid_vnr(tsk); 2010 pidarray[n++] = task_pid_vnr(tsk);
2023 } 2011 }
2024 cgroup_iter_end(cgrp, &it); 2012 cgroup_iter_end(cgrp, &it);
2025 return n; 2013 return n;
2026 } 2014 }
2027 2015
2028 /** 2016 /**
2029 * cgroupstats_build - build and fill cgroupstats 2017 * cgroupstats_build - build and fill cgroupstats
2030 * @stats: cgroupstats to fill information into 2018 * @stats: cgroupstats to fill information into
2031 * @dentry: A dentry entry belonging to the cgroup for which stats have 2019 * @dentry: A dentry entry belonging to the cgroup for which stats have
2032 * been requested. 2020 * been requested.
2033 * 2021 *
2034 * Build and fill cgroupstats so that taskstats can export it to user 2022 * Build and fill cgroupstats so that taskstats can export it to user
2035 * space. 2023 * space.
2036 */ 2024 */
2037 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) 2025 int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
2038 { 2026 {
2039 int ret = -EINVAL; 2027 int ret = -EINVAL;
2040 struct cgroup *cgrp; 2028 struct cgroup *cgrp;
2041 struct cgroup_iter it; 2029 struct cgroup_iter it;
2042 struct task_struct *tsk; 2030 struct task_struct *tsk;
2043 /* 2031 /*
2044 * Validate dentry by checking the superblock operations 2032 * Validate dentry by checking the superblock operations
2045 */ 2033 */
2046 if (dentry->d_sb->s_op != &cgroup_ops) 2034 if (dentry->d_sb->s_op != &cgroup_ops)
2047 goto err; 2035 goto err;
2048 2036
2049 ret = 0; 2037 ret = 0;
2050 cgrp = dentry->d_fsdata; 2038 cgrp = dentry->d_fsdata;
2051 rcu_read_lock(); 2039 rcu_read_lock();
2052 2040
2053 cgroup_iter_start(cgrp, &it); 2041 cgroup_iter_start(cgrp, &it);
2054 while ((tsk = cgroup_iter_next(cgrp, &it))) { 2042 while ((tsk = cgroup_iter_next(cgrp, &it))) {
2055 switch (tsk->state) { 2043 switch (tsk->state) {
2056 case TASK_RUNNING: 2044 case TASK_RUNNING:
2057 stats->nr_running++; 2045 stats->nr_running++;
2058 break; 2046 break;
2059 case TASK_INTERRUPTIBLE: 2047 case TASK_INTERRUPTIBLE:
2060 stats->nr_sleeping++; 2048 stats->nr_sleeping++;
2061 break; 2049 break;
2062 case TASK_UNINTERRUPTIBLE: 2050 case TASK_UNINTERRUPTIBLE:
2063 stats->nr_uninterruptible++; 2051 stats->nr_uninterruptible++;
2064 break; 2052 break;
2065 case TASK_STOPPED: 2053 case TASK_STOPPED:
2066 stats->nr_stopped++; 2054 stats->nr_stopped++;
2067 break; 2055 break;
2068 default: 2056 default:
2069 if (delayacct_is_task_waiting_on_io(tsk)) 2057 if (delayacct_is_task_waiting_on_io(tsk))
2070 stats->nr_io_wait++; 2058 stats->nr_io_wait++;
2071 break; 2059 break;
2072 } 2060 }
2073 } 2061 }
2074 cgroup_iter_end(cgrp, &it); 2062 cgroup_iter_end(cgrp, &it);
2075 2063
2076 rcu_read_unlock(); 2064 rcu_read_unlock();
2077 err: 2065 err:
2078 return ret; 2066 return ret;
2079 } 2067 }
2080 2068
2081 static int cmppid(const void *a, const void *b) 2069 static int cmppid(const void *a, const void *b)
2082 { 2070 {
2083 return *(pid_t *)a - *(pid_t *)b; 2071 return *(pid_t *)a - *(pid_t *)b;
2084 } 2072 }
2085 2073
2086 /* 2074 /*
2087 * Convert array 'a' of 'npids' pid_t's to a string of newline separated 2075 * Convert array 'a' of 'npids' pid_t's to a string of newline separated
2088 * decimal pids in 'buf'. Don't write more than 'sz' chars, but return 2076 * decimal pids in 'buf'. Don't write more than 'sz' chars, but return
2089 * count 'cnt' of how many chars would be written if buf were large enough. 2077 * count 'cnt' of how many chars would be written if buf were large enough.
2090 */ 2078 */
2091 static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) 2079 static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
2092 { 2080 {
2093 int cnt = 0; 2081 int cnt = 0;
2094 int i; 2082 int i;
2095 2083
2096 for (i = 0; i < npids; i++) 2084 for (i = 0; i < npids; i++)
2097 cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]); 2085 cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
2098 return cnt; 2086 return cnt;
2099 } 2087 }
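
This relies on the snprintf() contract: the return value is the length the output would have needed, even when it was truncated to fit 'sz'. That is what lets cgroup_tasks_open() below size the real buffer with a first pass into a one-byte scratch variable. Illustrated in isolation:

	char tiny[1];
	/* returns 6 ("31337\n"), although nothing useful fits in tiny */
	int need = snprintf(tiny, sizeof(tiny), "%d\n", 31337);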
2100 2088
2101 /* 2089 /*
2102 * Handle an open on 'tasks' file. Prepare a buffer listing the 2090 * Handle an open on 'tasks' file. Prepare a buffer listing the
 2103 * process IDs of tasks currently attached to the cgroup being opened. 2091 * process IDs of tasks currently attached to the cgroup being opened.
2104 * 2092 *
2105 * Does not require any specific cgroup mutexes, and does not take any. 2093 * Does not require any specific cgroup mutexes, and does not take any.
2106 */ 2094 */
2107 static int cgroup_tasks_open(struct inode *unused, struct file *file) 2095 static int cgroup_tasks_open(struct inode *unused, struct file *file)
2108 { 2096 {
2109 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2097 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2110 struct ctr_struct *ctr; 2098 struct ctr_struct *ctr;
2111 pid_t *pidarray; 2099 pid_t *pidarray;
2112 int npids; 2100 int npids;
2113 char c; 2101 char c;
2114 2102
2115 if (!(file->f_mode & FMODE_READ)) 2103 if (!(file->f_mode & FMODE_READ))
2116 return 0; 2104 return 0;
2117 2105
2118 ctr = kmalloc(sizeof(*ctr), GFP_KERNEL); 2106 ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
2119 if (!ctr) 2107 if (!ctr)
2120 goto err0; 2108 goto err0;
2121 2109
2122 /* 2110 /*
2123 * If cgroup gets more users after we read count, we won't have 2111 * If cgroup gets more users after we read count, we won't have
2124 * enough space - tough. This race is indistinguishable to the 2112 * enough space - tough. This race is indistinguishable to the
2125 * caller from the case that the additional cgroup users didn't 2113 * caller from the case that the additional cgroup users didn't
2126 * show up until sometime later on. 2114 * show up until sometime later on.
2127 */ 2115 */
2128 npids = cgroup_task_count(cgrp); 2116 npids = cgroup_task_count(cgrp);
2129 if (npids) { 2117 if (npids) {
2130 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); 2118 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
2131 if (!pidarray) 2119 if (!pidarray)
2132 goto err1; 2120 goto err1;
2133 2121
2134 npids = pid_array_load(pidarray, npids, cgrp); 2122 npids = pid_array_load(pidarray, npids, cgrp);
2135 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); 2123 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
2136 2124
2137 /* Call pid_array_to_buf() twice, first just to get bufsz */ 2125 /* Call pid_array_to_buf() twice, first just to get bufsz */
2138 ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1; 2126 ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
2139 ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL); 2127 ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
2140 if (!ctr->buf) 2128 if (!ctr->buf)
2141 goto err2; 2129 goto err2;
2142 ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids); 2130 ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
2143 2131
2144 kfree(pidarray); 2132 kfree(pidarray);
2145 } else { 2133 } else {
2146 ctr->buf = NULL; 2134 ctr->buf = NULL;
2147 ctr->bufsz = 0; 2135 ctr->bufsz = 0;
2148 } 2136 }
2149 file->private_data = ctr; 2137 file->private_data = ctr;
2150 return 0; 2138 return 0;
2151 2139
2152 err2: 2140 err2:
2153 kfree(pidarray); 2141 kfree(pidarray);
2154 err1: 2142 err1:
2155 kfree(ctr); 2143 kfree(ctr);
2156 err0: 2144 err0:
2157 return -ENOMEM; 2145 return -ENOMEM;
2158 } 2146 }
2159 2147
2160 static ssize_t cgroup_tasks_read(struct cgroup *cgrp, 2148 static ssize_t cgroup_tasks_read(struct cgroup *cgrp,
2161 struct cftype *cft, 2149 struct cftype *cft,
2162 struct file *file, char __user *buf, 2150 struct file *file, char __user *buf,
2163 size_t nbytes, loff_t *ppos) 2151 size_t nbytes, loff_t *ppos)
2164 { 2152 {
2165 struct ctr_struct *ctr = file->private_data; 2153 struct ctr_struct *ctr = file->private_data;
2166 2154
2167 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); 2155 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
2168 } 2156 }
2169 2157
2170 static int cgroup_tasks_release(struct inode *unused_inode, 2158 static int cgroup_tasks_release(struct inode *unused_inode,
2171 struct file *file) 2159 struct file *file)
2172 { 2160 {
2173 struct ctr_struct *ctr; 2161 struct ctr_struct *ctr;
2174 2162
2175 if (file->f_mode & FMODE_READ) { 2163 if (file->f_mode & FMODE_READ) {
2176 ctr = file->private_data; 2164 ctr = file->private_data;
2177 kfree(ctr->buf); 2165 kfree(ctr->buf);
2178 kfree(ctr); 2166 kfree(ctr);
2179 } 2167 }
2180 return 0; 2168 return 0;
2181 } 2169 }
2182 2170
2183 static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 2171 static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
2184 struct cftype *cft) 2172 struct cftype *cft)
2185 { 2173 {
2186 return notify_on_release(cgrp); 2174 return notify_on_release(cgrp);
2187 } 2175 }
2188 2176
2189 static u64 cgroup_read_releasable(struct cgroup *cgrp, struct cftype *cft)
2190 {
2191 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
2192 }
2193
2194 /* 2177 /*
2195 * for the common functions, 'private' gives the type of file 2178 * for the common functions, 'private' gives the type of file
2196 */ 2179 */
2197 static struct cftype files[] = { 2180 static struct cftype files[] = {
2198 { 2181 {
2199 .name = "tasks", 2182 .name = "tasks",
2200 .open = cgroup_tasks_open, 2183 .open = cgroup_tasks_open,
2201 .read = cgroup_tasks_read, 2184 .read = cgroup_tasks_read,
2202 .write = cgroup_common_file_write, 2185 .write = cgroup_common_file_write,
2203 .release = cgroup_tasks_release, 2186 .release = cgroup_tasks_release,
2204 .private = FILE_TASKLIST, 2187 .private = FILE_TASKLIST,
2205 }, 2188 },
2206 2189
2207 { 2190 {
2208 .name = "notify_on_release", 2191 .name = "notify_on_release",
2209 .read_u64 = cgroup_read_notify_on_release, 2192 .read_u64 = cgroup_read_notify_on_release,
2210 .write = cgroup_common_file_write, 2193 .write = cgroup_common_file_write,
2211 .private = FILE_NOTIFY_ON_RELEASE, 2194 .private = FILE_NOTIFY_ON_RELEASE,
2212 }, 2195 },
2213
2214 {
2215 .name = "releasable",
2216 .read_u64 = cgroup_read_releasable,
2217 .private = FILE_RELEASABLE,
2218 }
2219 }; 2196 };
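
This hunk is the point of the patch: cgroup_read_releasable() and the "releasable" cftype entry (the single-column, deleted lines above) drop out of the core file list, so base cgroup directories no longer expose this debugging flag. A sketch of the counterpart the patch adds in the cgroup_debug subsystem; the exact identifiers in kernel/cgroup_debug.c are not visible in this hunk, so treat the names as assumptions:

	static u64 releasable_read(struct cgroup *cgroup, struct cftype *cft)
	{
		return test_bit(CGRP_RELEASABLE, &cgroup->flags);
	}

	/* registered as "releasable" from the debug subsystem's
	 * populate() callback, alongside its other debug files */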
2220 2197
2221 static struct cftype cft_release_agent = { 2198 static struct cftype cft_release_agent = {
2222 .name = "release_agent", 2199 .name = "release_agent",
2223 .read = cgroup_common_file_read, 2200 .read = cgroup_common_file_read,
2224 .write = cgroup_common_file_write, 2201 .write = cgroup_common_file_write,
2225 .private = FILE_RELEASE_AGENT, 2202 .private = FILE_RELEASE_AGENT,
2226 }; 2203 };
2227 2204
2228 static int cgroup_populate_dir(struct cgroup *cgrp) 2205 static int cgroup_populate_dir(struct cgroup *cgrp)
2229 { 2206 {
2230 int err; 2207 int err;
2231 struct cgroup_subsys *ss; 2208 struct cgroup_subsys *ss;
2232 2209
2233 /* First clear out any existing files */ 2210 /* First clear out any existing files */
2234 cgroup_clear_directory(cgrp->dentry); 2211 cgroup_clear_directory(cgrp->dentry);
2235 2212
2236 err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files)); 2213 err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
2237 if (err < 0) 2214 if (err < 0)
2238 return err; 2215 return err;
2239 2216
2240 if (cgrp == cgrp->top_cgroup) { 2217 if (cgrp == cgrp->top_cgroup) {
2241 if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0) 2218 if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
2242 return err; 2219 return err;
2243 } 2220 }
2244 2221
2245 for_each_subsys(cgrp->root, ss) { 2222 for_each_subsys(cgrp->root, ss) {
2246 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) 2223 if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
2247 return err; 2224 return err;
2248 } 2225 }
2249 2226
2250 return 0; 2227 return 0;
2251 } 2228 }
2252 2229
2253 static void init_cgroup_css(struct cgroup_subsys_state *css, 2230 static void init_cgroup_css(struct cgroup_subsys_state *css,
2254 struct cgroup_subsys *ss, 2231 struct cgroup_subsys *ss,
2255 struct cgroup *cgrp) 2232 struct cgroup *cgrp)
2256 { 2233 {
2257 css->cgroup = cgrp; 2234 css->cgroup = cgrp;
2258 atomic_set(&css->refcnt, 0); 2235 atomic_set(&css->refcnt, 0);
2259 css->flags = 0; 2236 css->flags = 0;
2260 if (cgrp == dummytop) 2237 if (cgrp == dummytop)
2261 set_bit(CSS_ROOT, &css->flags); 2238 set_bit(CSS_ROOT, &css->flags);
2262 BUG_ON(cgrp->subsys[ss->subsys_id]); 2239 BUG_ON(cgrp->subsys[ss->subsys_id]);
2263 cgrp->subsys[ss->subsys_id] = css; 2240 cgrp->subsys[ss->subsys_id] = css;
2264 } 2241 }
2265 2242
2266 /* 2243 /*
2267 * cgroup_create - create a cgroup 2244 * cgroup_create - create a cgroup
2268 * @parent: cgroup that will be parent of the new cgroup 2245 * @parent: cgroup that will be parent of the new cgroup
2269 * @dentry: dentry of the new cgroup 2246 * @dentry: dentry of the new cgroup
2270 * @mode: mode to set on new inode 2247 * @mode: mode to set on new inode
2271 * 2248 *
2272 * Must be called with the mutex on the parent inode held 2249 * Must be called with the mutex on the parent inode held
2273 */ 2250 */
2274 static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 2251 static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2275 int mode) 2252 int mode)
2276 { 2253 {
2277 struct cgroup *cgrp; 2254 struct cgroup *cgrp;
2278 struct cgroupfs_root *root = parent->root; 2255 struct cgroupfs_root *root = parent->root;
2279 int err = 0; 2256 int err = 0;
2280 struct cgroup_subsys *ss; 2257 struct cgroup_subsys *ss;
2281 struct super_block *sb = root->sb; 2258 struct super_block *sb = root->sb;
2282 2259
2283 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 2260 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
2284 if (!cgrp) 2261 if (!cgrp)
2285 return -ENOMEM; 2262 return -ENOMEM;
2286 2263
2287 /* Grab a reference on the superblock so the hierarchy doesn't 2264 /* Grab a reference on the superblock so the hierarchy doesn't
2288 * get deleted on unmount if there are child cgroups. This 2265 * get deleted on unmount if there are child cgroups. This
2289 * can be done outside cgroup_mutex, since the sb can't 2266 * can be done outside cgroup_mutex, since the sb can't
2290 * disappear while someone has an open control file on the 2267 * disappear while someone has an open control file on the
2291 * fs */ 2268 * fs */
2292 atomic_inc(&sb->s_active); 2269 atomic_inc(&sb->s_active);
2293 2270
2294 mutex_lock(&cgroup_mutex); 2271 mutex_lock(&cgroup_mutex);
2295 2272
2296 INIT_LIST_HEAD(&cgrp->sibling); 2273 INIT_LIST_HEAD(&cgrp->sibling);
2297 INIT_LIST_HEAD(&cgrp->children); 2274 INIT_LIST_HEAD(&cgrp->children);
2298 INIT_LIST_HEAD(&cgrp->css_sets); 2275 INIT_LIST_HEAD(&cgrp->css_sets);
2299 INIT_LIST_HEAD(&cgrp->release_list); 2276 INIT_LIST_HEAD(&cgrp->release_list);
2300 2277
2301 cgrp->parent = parent; 2278 cgrp->parent = parent;
2302 cgrp->root = parent->root; 2279 cgrp->root = parent->root;
2303 cgrp->top_cgroup = parent->top_cgroup; 2280 cgrp->top_cgroup = parent->top_cgroup;
2304 2281
2305 if (notify_on_release(parent)) 2282 if (notify_on_release(parent))
2306 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 2283 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
2307 2284
2308 for_each_subsys(root, ss) { 2285 for_each_subsys(root, ss) {
2309 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 2286 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
2310 if (IS_ERR(css)) { 2287 if (IS_ERR(css)) {
2311 err = PTR_ERR(css); 2288 err = PTR_ERR(css);
2312 goto err_destroy; 2289 goto err_destroy;
2313 } 2290 }
2314 init_cgroup_css(css, ss, cgrp); 2291 init_cgroup_css(css, ss, cgrp);
2315 } 2292 }
2316 2293
2317 list_add(&cgrp->sibling, &cgrp->parent->children); 2294 list_add(&cgrp->sibling, &cgrp->parent->children);
2318 root->number_of_cgroups++; 2295 root->number_of_cgroups++;
2319 2296
2320 err = cgroup_create_dir(cgrp, dentry, mode); 2297 err = cgroup_create_dir(cgrp, dentry, mode);
2321 if (err < 0) 2298 if (err < 0)
2322 goto err_remove; 2299 goto err_remove;
2323 2300
2324 /* The cgroup directory was pre-locked for us */ 2301 /* The cgroup directory was pre-locked for us */
2325 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); 2302 BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
2326 2303
2327 err = cgroup_populate_dir(cgrp); 2304 err = cgroup_populate_dir(cgrp);
2328 /* If err < 0, we have a half-filled directory - oh well ;) */ 2305 /* If err < 0, we have a half-filled directory - oh well ;) */
2329 2306
2330 mutex_unlock(&cgroup_mutex); 2307 mutex_unlock(&cgroup_mutex);
2331 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 2308 mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
2332 2309
2333 return 0; 2310 return 0;
2334 2311
2335 err_remove: 2312 err_remove:
2336 2313
2337 list_del(&cgrp->sibling); 2314 list_del(&cgrp->sibling);
2338 root->number_of_cgroups--; 2315 root->number_of_cgroups--;
2339 2316
2340 err_destroy: 2317 err_destroy:
2341 2318
2342 for_each_subsys(root, ss) { 2319 for_each_subsys(root, ss) {
2343 if (cgrp->subsys[ss->subsys_id]) 2320 if (cgrp->subsys[ss->subsys_id])
2344 ss->destroy(ss, cgrp); 2321 ss->destroy(ss, cgrp);
2345 } 2322 }
2346 2323
2347 mutex_unlock(&cgroup_mutex); 2324 mutex_unlock(&cgroup_mutex);
2348 2325
2349 /* Release the reference count that we took on the superblock */ 2326 /* Release the reference count that we took on the superblock */
2350 deactivate_super(sb); 2327 deactivate_super(sb);
2351 2328
2352 kfree(cgrp); 2329 kfree(cgrp);
2353 return err; 2330 return err;
2354 } 2331 }
2355 2332
2356 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) 2333 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2357 { 2334 {
2358 struct cgroup *c_parent = dentry->d_parent->d_fsdata; 2335 struct cgroup *c_parent = dentry->d_parent->d_fsdata;
2359 2336
2360 /* the vfs holds inode->i_mutex already */ 2337 /* the vfs holds inode->i_mutex already */
2361 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 2338 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
2362 } 2339 }
2363 2340
2364 static inline int cgroup_has_css_refs(struct cgroup *cgrp) 2341 static inline int cgroup_has_css_refs(struct cgroup *cgrp)
2365 { 2342 {
2366 /* Check the reference count on each subsystem. Since we 2343 /* Check the reference count on each subsystem. Since we
2367 * already established that there are no tasks in the 2344 * already established that there are no tasks in the
2368 * cgroup, if the css refcount is also 0, then there should 2345 * cgroup, if the css refcount is also 0, then there should
2369 * be no outstanding references, so the subsystem is safe to 2346 * be no outstanding references, so the subsystem is safe to
2370 * destroy. We scan across all subsystems rather than using 2347 * destroy. We scan across all subsystems rather than using
2371 * the per-hierarchy linked list of mounted subsystems since 2348 * the per-hierarchy linked list of mounted subsystems since
2372 * we can be called via check_for_release() with no 2349 * we can be called via check_for_release() with no
2373 * synchronization other than RCU, and the subsystem linked 2350 * synchronization other than RCU, and the subsystem linked
2374 * list isn't RCU-safe */ 2351 * list isn't RCU-safe */
2375 int i; 2352 int i;
2376 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2353 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2377 struct cgroup_subsys *ss = subsys[i]; 2354 struct cgroup_subsys *ss = subsys[i];
2378 struct cgroup_subsys_state *css; 2355 struct cgroup_subsys_state *css;
2379 /* Skip subsystems not in this hierarchy */ 2356 /* Skip subsystems not in this hierarchy */
2380 if (ss->root != cgrp->root) 2357 if (ss->root != cgrp->root)
2381 continue; 2358 continue;
2382 css = cgrp->subsys[ss->subsys_id]; 2359 css = cgrp->subsys[ss->subsys_id];
2383 /* When called from check_for_release() it's possible 2360 /* When called from check_for_release() it's possible
2384 * that by this point the cgroup has been removed 2361 * that by this point the cgroup has been removed
2385 * and the css deleted. But a false-positive doesn't 2362 * and the css deleted. But a false-positive doesn't
2386 * matter, since it can only happen if the cgroup 2363 * matter, since it can only happen if the cgroup
2387 * has been deleted and hence no longer needs the 2364 * has been deleted and hence no longer needs the
2388 * release agent to be called anyway. */ 2365 * release agent to be called anyway. */
2389 if (css && atomic_read(&css->refcnt)) 2366 if (css && atomic_read(&css->refcnt))
2390 return 1; 2367 return 1;
2391 } 2368 }
2392 return 0; 2369 return 0;
2393 } 2370 }
2394 2371
2395 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 2372 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2396 { 2373 {
2397 struct cgroup *cgrp = dentry->d_fsdata; 2374 struct cgroup *cgrp = dentry->d_fsdata;
2398 struct dentry *d; 2375 struct dentry *d;
2399 struct cgroup *parent; 2376 struct cgroup *parent;
2400 struct super_block *sb; 2377 struct super_block *sb;
2401 struct cgroupfs_root *root; 2378 struct cgroupfs_root *root;
2402 2379
 2403 /* the vfs already holds i_mutex on both the parent and the victim */ 2380 /* the vfs already holds i_mutex on both the parent and the victim */
2404 2381
2405 mutex_lock(&cgroup_mutex); 2382 mutex_lock(&cgroup_mutex);
2406 if (atomic_read(&cgrp->count) != 0) { 2383 if (atomic_read(&cgrp->count) != 0) {
2407 mutex_unlock(&cgroup_mutex); 2384 mutex_unlock(&cgroup_mutex);
2408 return -EBUSY; 2385 return -EBUSY;
2409 } 2386 }
2410 if (!list_empty(&cgrp->children)) { 2387 if (!list_empty(&cgrp->children)) {
2411 mutex_unlock(&cgroup_mutex); 2388 mutex_unlock(&cgroup_mutex);
2412 return -EBUSY; 2389 return -EBUSY;
2413 } 2390 }
2414 2391
2415 parent = cgrp->parent; 2392 parent = cgrp->parent;
2416 root = cgrp->root; 2393 root = cgrp->root;
2417 sb = root->sb; 2394 sb = root->sb;
2418 2395
2419 /* 2396 /*
 2420 * Call the pre_destroy handlers of the subsystems, notifying 2397 * Call the pre_destroy handlers of the subsystems, notifying
 2421 * them that an rmdir() request has arrived. 2398 * them that an rmdir() request has arrived.
2422 */ 2399 */
2423 cgroup_call_pre_destroy(cgrp); 2400 cgroup_call_pre_destroy(cgrp);
2424 2401
2425 if (cgroup_has_css_refs(cgrp)) { 2402 if (cgroup_has_css_refs(cgrp)) {
2426 mutex_unlock(&cgroup_mutex); 2403 mutex_unlock(&cgroup_mutex);
2427 return -EBUSY; 2404 return -EBUSY;
2428 } 2405 }
2429 2406
2430 spin_lock(&release_list_lock); 2407 spin_lock(&release_list_lock);
2431 set_bit(CGRP_REMOVED, &cgrp->flags); 2408 set_bit(CGRP_REMOVED, &cgrp->flags);
2432 if (!list_empty(&cgrp->release_list)) 2409 if (!list_empty(&cgrp->release_list))
2433 list_del(&cgrp->release_list); 2410 list_del(&cgrp->release_list);
2434 spin_unlock(&release_list_lock); 2411 spin_unlock(&release_list_lock);
2435 /* delete my sibling from parent->children */ 2412 /* delete my sibling from parent->children */
2436 list_del(&cgrp->sibling); 2413 list_del(&cgrp->sibling);
2437 spin_lock(&cgrp->dentry->d_lock); 2414 spin_lock(&cgrp->dentry->d_lock);
2438 d = dget(cgrp->dentry); 2415 d = dget(cgrp->dentry);
2439 cgrp->dentry = NULL; 2416 cgrp->dentry = NULL;
2440 spin_unlock(&d->d_lock); 2417 spin_unlock(&d->d_lock);
2441 2418
2442 cgroup_d_remove_dir(d); 2419 cgroup_d_remove_dir(d);
2443 dput(d); 2420 dput(d);
2444 2421
2445 set_bit(CGRP_RELEASABLE, &parent->flags); 2422 set_bit(CGRP_RELEASABLE, &parent->flags);
2446 check_for_release(parent); 2423 check_for_release(parent);
2447 2424
2448 mutex_unlock(&cgroup_mutex); 2425 mutex_unlock(&cgroup_mutex);
2449 return 0; 2426 return 0;
2450 } 2427 }
2451 2428
2452 static void cgroup_init_subsys(struct cgroup_subsys *ss) 2429 static void cgroup_init_subsys(struct cgroup_subsys *ss)
2453 { 2430 {
2454 struct cgroup_subsys_state *css; 2431 struct cgroup_subsys_state *css;
2455 struct list_head *l; 2432 struct list_head *l;
2456 2433
2457 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 2434 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
2458 2435
2459 /* Create the top cgroup state for this subsystem */ 2436 /* Create the top cgroup state for this subsystem */
2460 ss->root = &rootnode; 2437 ss->root = &rootnode;
2461 css = ss->create(ss, dummytop); 2438 css = ss->create(ss, dummytop);
2462 /* We don't handle early failures gracefully */ 2439 /* We don't handle early failures gracefully */
2463 BUG_ON(IS_ERR(css)); 2440 BUG_ON(IS_ERR(css));
2464 init_cgroup_css(css, ss, dummytop); 2441 init_cgroup_css(css, ss, dummytop);
2465 2442
2466 /* Update all cgroup groups to contain a subsys 2443 /* Update all cgroup groups to contain a subsys
2467 * pointer to this state - since the subsystem is 2444 * pointer to this state - since the subsystem is
2468 * newly registered, all tasks and hence all cgroup 2445 * newly registered, all tasks and hence all cgroup
2469 * groups are in the subsystem's top cgroup. */ 2446 * groups are in the subsystem's top cgroup. */
2470 write_lock(&css_set_lock); 2447 write_lock(&css_set_lock);
2471 l = &init_css_set.list; 2448 l = &init_css_set.list;
2472 do { 2449 do {
2473 struct css_set *cg = 2450 struct css_set *cg =
2474 list_entry(l, struct css_set, list); 2451 list_entry(l, struct css_set, list);
2475 cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; 2452 cg->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
2476 l = l->next; 2453 l = l->next;
2477 } while (l != &init_css_set.list); 2454 } while (l != &init_css_set.list);
2478 write_unlock(&css_set_lock); 2455 write_unlock(&css_set_lock);
2479 2456
2480 /* If this subsystem requested that it be notified with fork 2457 /* If this subsystem requested that it be notified with fork
2481 * events, we should send it one now for every process in the 2458 * events, we should send it one now for every process in the
2482 * system */ 2459 * system */
2483 if (ss->fork) { 2460 if (ss->fork) {
2484 struct task_struct *g, *p; 2461 struct task_struct *g, *p;
2485 2462
2486 read_lock(&tasklist_lock); 2463 read_lock(&tasklist_lock);
2487 do_each_thread(g, p) { 2464 do_each_thread(g, p) {
2488 ss->fork(ss, p); 2465 ss->fork(ss, p);
2489 } while_each_thread(g, p); 2466 } while_each_thread(g, p);
2490 read_unlock(&tasklist_lock); 2467 read_unlock(&tasklist_lock);
2491 } 2468 }
2492 2469
2493 need_forkexit_callback |= ss->fork || ss->exit; 2470 need_forkexit_callback |= ss->fork || ss->exit;
2494 2471
2495 ss->active = 1; 2472 ss->active = 1;
2496 } 2473 }
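
For context, a hedged sketch (editorial, not part of the patch) of the minimal subsystem shape that cgroup_init_subsys() brings up. The names my_subsys and my_subsys_id are hypothetical; a real subsystem also needs an entry in include/linux/cgroup_subsys.h so that its subsys_id matches its slot in the subsys[] array, which cgroup_init_early() BUG_ONs below.

/*
 * Hypothetical subsystem sketch; my_subsys_id is assumed to come
 * from an entry in include/linux/cgroup_subsys.h.
 */
#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/slab.h>

static struct cgroup_subsys_state *my_create(struct cgroup_subsys *ss,
                                             struct cgroup *cgrp)
{
        struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

        if (!css)
                return ERR_PTR(-ENOMEM);
        return css;             /* init_cgroup_css() links it to cgrp */
}

static void my_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
        kfree(cgrp->subsys[my_subsys_id]);
}

static void my_fork(struct cgroup_subsys *ss, struct task_struct *task)
{
        /* Because .fork is set, cgroup_init_subsys() replays this
         * callback for every task already in the system at
         * registration time. */
}

struct cgroup_subsys my_subsys = {
        .name = "my",
        .create = my_create,
        .destroy = my_destroy,
        .fork = my_fork,
        .subsys_id = my_subsys_id,
};
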
2497 2474
2498 /** 2475 /**
2499 * cgroup_init_early - cgroup initialization at system boot 2476 * cgroup_init_early - cgroup initialization at system boot
2500 * 2477 *
2501 * Initialize cgroups at system boot, and initialize any 2478 * Initialize cgroups at system boot, and initialize any
2502 * subsystems that request early init. 2479 * subsystems that request early init.
2503 */ 2480 */
2504 int __init cgroup_init_early(void) 2481 int __init cgroup_init_early(void)
2505 { 2482 {
2506 int i; 2483 int i;
2507 kref_init(&init_css_set.ref); 2484 kref_init(&init_css_set.ref);
2508 kref_get(&init_css_set.ref); 2485 kref_get(&init_css_set.ref);
2509 INIT_LIST_HEAD(&init_css_set.list); 2486 INIT_LIST_HEAD(&init_css_set.list);
2510 INIT_LIST_HEAD(&init_css_set.cg_links); 2487 INIT_LIST_HEAD(&init_css_set.cg_links);
2511 INIT_LIST_HEAD(&init_css_set.tasks); 2488 INIT_LIST_HEAD(&init_css_set.tasks);
2512 css_set_count = 1; 2489 css_set_count = 1;
2513 init_cgroup_root(&rootnode); 2490 init_cgroup_root(&rootnode);
2514 list_add(&rootnode.root_list, &roots); 2491 list_add(&rootnode.root_list, &roots);
2515 root_count = 1; 2492 root_count = 1;
2516 init_task.cgroups = &init_css_set; 2493 init_task.cgroups = &init_css_set;
2517 2494
2518 init_css_set_link.cg = &init_css_set; 2495 init_css_set_link.cg = &init_css_set;
2519 list_add(&init_css_set_link.cgrp_link_list, 2496 list_add(&init_css_set_link.cgrp_link_list,
2520 &rootnode.top_cgroup.css_sets); 2497 &rootnode.top_cgroup.css_sets);
2521 list_add(&init_css_set_link.cg_link_list, 2498 list_add(&init_css_set_link.cg_link_list,
2522 &init_css_set.cg_links); 2499 &init_css_set.cg_links);
2523 2500
2524 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2501 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2525 struct cgroup_subsys *ss = subsys[i]; 2502 struct cgroup_subsys *ss = subsys[i];
2526 2503
2527 BUG_ON(!ss->name); 2504 BUG_ON(!ss->name);
2528 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 2505 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
2529 BUG_ON(!ss->create); 2506 BUG_ON(!ss->create);
2530 BUG_ON(!ss->destroy); 2507 BUG_ON(!ss->destroy);
2531 if (ss->subsys_id != i) { 2508 if (ss->subsys_id != i) {
2532 printk(KERN_ERR "cgroup: Subsys %s id == %d\n", 2509 printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
2533 ss->name, ss->subsys_id); 2510 ss->name, ss->subsys_id);
2534 BUG(); 2511 BUG();
2535 } 2512 }
2536 2513
2537 if (ss->early_init) 2514 if (ss->early_init)
2538 cgroup_init_subsys(ss); 2515 cgroup_init_subsys(ss);
2539 } 2516 }
2540 return 0; 2517 return 0;
2541 } 2518 }
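
Only subsystems that set .early_init take the cgroup_init_subsys() path this early; a hedged fragment of what that opt-in looks like (subsystem name hypothetical, my_create/my_destroy as in the sketch above):

struct cgroup_subsys my_early_subsys = {
        .name = "my_early",
        .create = my_create,            /* runs before cgroup_init() */
        .destroy = my_destroy,
        .subsys_id = my_early_subsys_id,
        .early_init = 1,                /* picked up by cgroup_init_early() */
};
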
2542 2519
2543 /** 2520 /**
2544 * cgroup_init - cgroup initialization 2521 * cgroup_init - cgroup initialization
2545 * 2522 *
2546 * Register cgroup filesystem and /proc file, and initialize 2523 * Register cgroup filesystem and /proc file, and initialize
2547 * any subsystems that didn't request early init. 2524 * any subsystems that didn't request early init.
2548 */ 2525 */
2549 int __init cgroup_init(void) 2526 int __init cgroup_init(void)
2550 { 2527 {
2551 int err; 2528 int err;
2552 int i; 2529 int i;
2553 struct proc_dir_entry *entry; 2530 struct proc_dir_entry *entry;
2554 2531
2555 err = bdi_init(&cgroup_backing_dev_info); 2532 err = bdi_init(&cgroup_backing_dev_info);
2556 if (err) 2533 if (err)
2557 return err; 2534 return err;
2558 2535
2559 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2536 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2560 struct cgroup_subsys *ss = subsys[i]; 2537 struct cgroup_subsys *ss = subsys[i];
2561 if (!ss->early_init) 2538 if (!ss->early_init)
2562 cgroup_init_subsys(ss); 2539 cgroup_init_subsys(ss);
2563 } 2540 }
2564 2541
2565 err = register_filesystem(&cgroup_fs_type); 2542 err = register_filesystem(&cgroup_fs_type);
2566 if (err < 0) 2543 if (err < 0)
2567 goto out; 2544 goto out;
2568 2545
2569 entry = create_proc_entry("cgroups", 0, NULL); 2546 entry = create_proc_entry("cgroups", 0, NULL);
2570 if (entry) 2547 if (entry)
2571 entry->proc_fops = &proc_cgroupstats_operations; 2548 entry->proc_fops = &proc_cgroupstats_operations;
2572 2549
2573 out: 2550 out:
2574 if (err) 2551 if (err)
2575 bdi_destroy(&cgroup_backing_dev_info); 2552 bdi_destroy(&cgroup_backing_dev_info);
2576 2553
2577 return err; 2554 return err;
2578 } 2555 }
2579 2556
2580 /* 2557 /*
2581 * proc_cgroup_show() 2558 * proc_cgroup_show()
2582 * - Print task's cgroup paths into seq_file, one line for each hierarchy 2559 * - Print task's cgroup paths into seq_file, one line for each hierarchy
2583 * - Used for /proc/<pid>/cgroup. 2560 * - Used for /proc/<pid>/cgroup.
2584 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it 2561 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
2585 * doesn't really matter if tsk->cgroup changes after we read it, 2562 * doesn't really matter if tsk->cgroup changes after we read it,
2586 * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it 2563 * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
2587 * anyway. No need to check that tsk->cgroup != NULL, thanks to 2564 * anyway. No need to check that tsk->cgroup != NULL, thanks to
2588 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's 2565 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
2589 * cgroup to top_cgroup. 2566 * cgroup to top_cgroup.
2590 */ 2567 */
2591 2568
2592 /* TODO: Use a proper seq_file iterator */ 2569 /* TODO: Use a proper seq_file iterator */
2593 static int proc_cgroup_show(struct seq_file *m, void *v) 2570 static int proc_cgroup_show(struct seq_file *m, void *v)
2594 { 2571 {
2595 struct pid *pid; 2572 struct pid *pid;
2596 struct task_struct *tsk; 2573 struct task_struct *tsk;
2597 char *buf; 2574 char *buf;
2598 int retval; 2575 int retval;
2599 struct cgroupfs_root *root; 2576 struct cgroupfs_root *root;
2600 2577
2601 retval = -ENOMEM; 2578 retval = -ENOMEM;
2602 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 2579 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2603 if (!buf) 2580 if (!buf)
2604 goto out; 2581 goto out;
2605 2582
2606 retval = -ESRCH; 2583 retval = -ESRCH;
2607 pid = m->private; 2584 pid = m->private;
2608 tsk = get_pid_task(pid, PIDTYPE_PID); 2585 tsk = get_pid_task(pid, PIDTYPE_PID);
2609 if (!tsk) 2586 if (!tsk)
2610 goto out_free; 2587 goto out_free;
2611 2588
2612 retval = 0; 2589 retval = 0;
2613 2590
2614 mutex_lock(&cgroup_mutex); 2591 mutex_lock(&cgroup_mutex);
2615 2592
2616 for_each_root(root) { 2593 for_each_root(root) {
2617 struct cgroup_subsys *ss; 2594 struct cgroup_subsys *ss;
2618 struct cgroup *cgrp; 2595 struct cgroup *cgrp;
2619 int subsys_id; 2596 int subsys_id;
2620 int count = 0; 2597 int count = 0;
2621 2598
2622 /* Skip this hierarchy if it has no active subsystems */ 2599 /* Skip this hierarchy if it has no active subsystems */
2623 if (!root->actual_subsys_bits) 2600 if (!root->actual_subsys_bits)
2624 continue; 2601 continue;
2625 seq_printf(m, "%lu:", root->subsys_bits); 2602 seq_printf(m, "%lu:", root->subsys_bits);
2626 for_each_subsys(root, ss) 2603 for_each_subsys(root, ss)
2627 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 2604 seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
2628 seq_putc(m, ':'); 2605 seq_putc(m, ':');
2629 get_first_subsys(&root->top_cgroup, NULL, &subsys_id); 2606 get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
2630 cgrp = task_cgroup(tsk, subsys_id); 2607 cgrp = task_cgroup(tsk, subsys_id);
2631 retval = cgroup_path(cgrp, buf, PAGE_SIZE); 2608 retval = cgroup_path(cgrp, buf, PAGE_SIZE);
2632 if (retval < 0) 2609 if (retval < 0)
2633 goto out_unlock; 2610 goto out_unlock;
2634 seq_puts(m, buf); 2611 seq_puts(m, buf);
2635 seq_putc(m, '\n'); 2612 seq_putc(m, '\n');
2636 } 2613 }
2637 2614
2638 out_unlock: 2615 out_unlock:
2639 mutex_unlock(&cgroup_mutex); 2616 mutex_unlock(&cgroup_mutex);
2640 put_task_struct(tsk); 2617 put_task_struct(tsk);
2641 out_free: 2618 out_free:
2642 kfree(buf); 2619 kfree(buf);
2643 out: 2620 out:
2644 return retval; 2621 return retval;
2645 } 2622 }
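
As a usage illustration (userspace, editorial): each line this handler emits has the form <subsys_bits>:<comma-separated subsystems>:<cgroup path>, so a trivial reader is:

#include <stdio.h>

int main(void)
{
        /* Echoes /proc/self/cgroup; with this kernel the first field
         * is the hierarchy's subsys_bits mask, e.g. "2:cpuset:/". */
        char line[4096];
        FILE *f = fopen("/proc/self/cgroup", "r");

        if (!f) {
                perror("/proc/self/cgroup");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}
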
2646 2623
2647 static int cgroup_open(struct inode *inode, struct file *file) 2624 static int cgroup_open(struct inode *inode, struct file *file)
2648 { 2625 {
2649 struct pid *pid = PROC_I(inode)->pid; 2626 struct pid *pid = PROC_I(inode)->pid;
2650 return single_open(file, proc_cgroup_show, pid); 2627 return single_open(file, proc_cgroup_show, pid);
2651 } 2628 }
2652 2629
2653 struct file_operations proc_cgroup_operations = { 2630 struct file_operations proc_cgroup_operations = {
2654 .open = cgroup_open, 2631 .open = cgroup_open,
2655 .read = seq_read, 2632 .read = seq_read,
2656 .llseek = seq_lseek, 2633 .llseek = seq_lseek,
2657 .release = single_release, 2634 .release = single_release,
2658 }; 2635 };
2659 2636
2660 /* Display information about each subsystem and each hierarchy */ 2637 /* Display information about each subsystem and each hierarchy */
2661 static int proc_cgroupstats_show(struct seq_file *m, void *v) 2638 static int proc_cgroupstats_show(struct seq_file *m, void *v)
2662 { 2639 {
2663 int i; 2640 int i;
2664 2641
2665 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 2642 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
2666 mutex_lock(&cgroup_mutex); 2643 mutex_lock(&cgroup_mutex);
2667 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2644 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2668 struct cgroup_subsys *ss = subsys[i]; 2645 struct cgroup_subsys *ss = subsys[i];
2669 seq_printf(m, "%s\t%lu\t%d\t%d\n", 2646 seq_printf(m, "%s\t%lu\t%d\t%d\n",
2670 ss->name, ss->root->subsys_bits, 2647 ss->name, ss->root->subsys_bits,
2671 ss->root->number_of_cgroups, !ss->disabled); 2648 ss->root->number_of_cgroups, !ss->disabled);
2672 } 2649 }
2673 mutex_unlock(&cgroup_mutex); 2650 mutex_unlock(&cgroup_mutex);
2674 return 0; 2651 return 0;
2675 } 2652 }
2676 2653
2677 static int cgroupstats_open(struct inode *inode, struct file *file) 2654 static int cgroupstats_open(struct inode *inode, struct file *file)
2678 { 2655 {
2679 return single_open(file, proc_cgroupstats_show, NULL); 2656 return single_open(file, proc_cgroupstats_show, NULL);
2680 } 2657 }
2681 2658
2682 static struct file_operations proc_cgroupstats_operations = { 2659 static struct file_operations proc_cgroupstats_operations = {
2683 .open = cgroupstats_open, 2660 .open = cgroupstats_open,
2684 .read = seq_read, 2661 .read = seq_read,
2685 .llseek = seq_lseek, 2662 .llseek = seq_lseek,
2686 .release = single_release, 2663 .release = single_release,
2687 }; 2664 };
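
The resulting /proc/cgroups output is the header above plus one tab-separated row per compiled-in subsystem; the values below are illustrative only:

#subsys_name	hierarchy	num_cgroups	enabled
cpuset	2	4	1
debug	0	1	1
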
2688 2665
2689 /** 2666 /**
2690 * cgroup_fork - attach newly forked task to its parent's cgroup. 2667 * cgroup_fork - attach newly forked task to its parent's cgroup.
2691 * @child: pointer to task_struct of the newly forked child process. 2668 * @child: pointer to task_struct of the newly forked child process.
2692 * 2669 *
2693 * Description: A task inherits its parent's cgroup at fork(). 2670 * Description: A task inherits its parent's cgroup at fork().
2694 * 2671 *
2695 * A pointer to the shared css_set was automatically copied in 2672 * A pointer to the shared css_set was automatically copied in
2696 * fork.c by dup_task_struct(). However, we ignore that copy, since 2673 * fork.c by dup_task_struct(). However, we ignore that copy, since
2697 * it was not made under the protection of RCU or cgroup_mutex, so 2674 * it was not made under the protection of RCU or cgroup_mutex, so
2698 * might no longer be a valid cgroup pointer. cgroup_attach_task() might 2675 * might no longer be a valid cgroup pointer. cgroup_attach_task() might
2699 * have already changed current->cgroups, allowing the previously 2676 * have already changed current->cgroups, allowing the previously
2700 * referenced cgroup group to be removed and freed. 2677 * referenced cgroup group to be removed and freed.
2701 * 2678 *
2702 * At the point that cgroup_fork() is called, 'current' is the parent 2679 * At the point that cgroup_fork() is called, 'current' is the parent
2703 * task, and the passed argument 'child' points to the child task. 2680 * task, and the passed argument 'child' points to the child task.
2704 */ 2681 */
2705 void cgroup_fork(struct task_struct *child) 2682 void cgroup_fork(struct task_struct *child)
2706 { 2683 {
2707 task_lock(current); 2684 task_lock(current);
2708 child->cgroups = current->cgroups; 2685 child->cgroups = current->cgroups;
2709 get_css_set(child->cgroups); 2686 get_css_set(child->cgroups);
2710 task_unlock(current); 2687 task_unlock(current);
2711 INIT_LIST_HEAD(&child->cg_list); 2688 INIT_LIST_HEAD(&child->cg_list);
2712 } 2689 }
2713 2690
2714 /** 2691 /**
2715 * cgroup_fork_callbacks - run fork callbacks 2692 * cgroup_fork_callbacks - run fork callbacks
2716 * @child: the new task 2693 * @child: the new task
2717 * 2694 *
2718 * Called on a new task very soon before adding it to the 2695 * Called on a new task very soon before adding it to the
2719 * tasklist. No need to take any locks since no-one can 2696 * tasklist. No need to take any locks since no-one can
2720 * be operating on this task. 2697 * be operating on this task.
2721 */ 2698 */
2722 void cgroup_fork_callbacks(struct task_struct *child) 2699 void cgroup_fork_callbacks(struct task_struct *child)
2723 { 2700 {
2724 if (need_forkexit_callback) { 2701 if (need_forkexit_callback) {
2725 int i; 2702 int i;
2726 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2703 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2727 struct cgroup_subsys *ss = subsys[i]; 2704 struct cgroup_subsys *ss = subsys[i];
2728 if (ss->fork) 2705 if (ss->fork)
2729 ss->fork(ss, child); 2706 ss->fork(ss, child);
2730 } 2707 }
2731 } 2708 }
2732 } 2709 }
2733 2710
2734 /** 2711 /**
2735 * cgroup_post_fork - called on a new task after adding it to the task list 2712 * cgroup_post_fork - called on a new task after adding it to the task list
2736 * @child: the task in question 2713 * @child: the task in question
2737 * 2714 *
2738 * Adds the task to the list running through its css_set if necessary. 2715 * Adds the task to the list running through its css_set if necessary.
2739 * Has to be after the task is visible on the task list in case we race 2716 * Has to be after the task is visible on the task list in case we race
2740 * with the first call to cgroup_iter_start() - to guarantee that the 2717 * with the first call to cgroup_iter_start() - to guarantee that the
2741 * new task ends up on its list. 2718 * new task ends up on its list.
2742 */ 2719 */
2743 void cgroup_post_fork(struct task_struct *child) 2720 void cgroup_post_fork(struct task_struct *child)
2744 { 2721 {
2745 if (use_task_css_set_links) { 2722 if (use_task_css_set_links) {
2746 write_lock(&css_set_lock); 2723 write_lock(&css_set_lock);
2747 if (list_empty(&child->cg_list)) 2724 if (list_empty(&child->cg_list))
2748 list_add(&child->cg_list, &child->cgroups->tasks); 2725 list_add(&child->cg_list, &child->cgroups->tasks);
2749 write_unlock(&css_set_lock); 2726 write_unlock(&css_set_lock);
2750 } 2727 }
2751 } 2728 }
2752 /** 2729 /**
2753 * cgroup_exit - detach cgroup from exiting task 2730 * cgroup_exit - detach cgroup from exiting task
2754 * @tsk: pointer to task_struct of exiting process 2731 * @tsk: pointer to task_struct of exiting process
2755 * @run_callbacks: run exit callbacks? 2732 * @run_callbacks: run exit callbacks?
2756 * 2733 *
2757 * Description: Detach cgroup from @tsk and release it. 2734 * Description: Detach cgroup from @tsk and release it.
2758 * 2735 *
2759 * Note that cgroups marked notify_on_release force every task in 2736 * Note that cgroups marked notify_on_release force every task in
2760 * them to take the global cgroup_mutex when exiting. 2737 * them to take the global cgroup_mutex when exiting.
2761 * This could impact scaling on very large systems. Be reluctant to 2738 * This could impact scaling on very large systems. Be reluctant to
2762 * use notify_on_release cgroups where very high task exit scaling 2739 * use notify_on_release cgroups where very high task exit scaling
2763 * is required on large systems. 2740 * is required on large systems.
2764 * 2741 *
2765 * the_top_cgroup_hack: 2742 * the_top_cgroup_hack:
2766 * 2743 *
2767 * Set the exiting task's cgroup to the root cgroup (top_cgroup). 2744 * Set the exiting task's cgroup to the root cgroup (top_cgroup).
2768 * 2745 *
2769 * We call cgroup_exit() while the task is still competent to 2746 * We call cgroup_exit() while the task is still competent to
2770 * handle notify_on_release(), then leave the task attached to the 2747 * handle notify_on_release(), then leave the task attached to the
2771 * root cgroup in each hierarchy for the remainder of its exit. 2748 * root cgroup in each hierarchy for the remainder of its exit.
2772 * 2749 *
2773 * To do this properly, we would increment the reference count on 2750 * To do this properly, we would increment the reference count on
2774 * top_cgroup, and near the very end of the kernel/exit.c do_exit() 2751 * top_cgroup, and near the very end of the kernel/exit.c do_exit()
2775 * code we would add a second cgroup function call, to drop that 2752 * code we would add a second cgroup function call, to drop that
2776 * reference. This would just create an unnecessary hot spot on 2753 * reference. This would just create an unnecessary hot spot on
2777 * the top_cgroup reference count, to no avail. 2754 * the top_cgroup reference count, to no avail.
2778 * 2755 *
2779 * Normally, holding a reference to a cgroup without bumping its 2756 * Normally, holding a reference to a cgroup without bumping its
2780 * count is unsafe. The cgroup could go away, or someone could 2757 * count is unsafe. The cgroup could go away, or someone could
2781 * attach us to a different cgroup, decrementing the count on 2758 * attach us to a different cgroup, decrementing the count on
2782 * the first cgroup that we never incremented. But in this case, 2759 * the first cgroup that we never incremented. But in this case,
2783 * top_cgroup isn't going away, and either the task has PF_EXITING set, 2760 * top_cgroup isn't going away, and either the task has PF_EXITING set,
2784 * which wards off any cgroup_attach_task() attempts, or the task is a failed 2761 * which wards off any cgroup_attach_task() attempts, or the task is a failed
2785 * fork, never visible to cgroup_attach_task. 2762 * fork, never visible to cgroup_attach_task.
2786 */ 2763 */
2787 void cgroup_exit(struct task_struct *tsk, int run_callbacks) 2764 void cgroup_exit(struct task_struct *tsk, int run_callbacks)
2788 { 2765 {
2789 int i; 2766 int i;
2790 struct css_set *cg; 2767 struct css_set *cg;
2791 2768
2792 if (run_callbacks && need_forkexit_callback) { 2769 if (run_callbacks && need_forkexit_callback) {
2793 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2770 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2794 struct cgroup_subsys *ss = subsys[i]; 2771 struct cgroup_subsys *ss = subsys[i];
2795 if (ss->exit) 2772 if (ss->exit)
2796 ss->exit(ss, tsk); 2773 ss->exit(ss, tsk);
2797 } 2774 }
2798 } 2775 }
2799 2776
2800 /* 2777 /*
2801 * Unlink from the css_set task list if necessary. 2778 * Unlink from the css_set task list if necessary.
2802 * Optimistically check cg_list before taking 2779 * Optimistically check cg_list before taking
2803 * css_set_lock 2780 * css_set_lock
2804 */ 2781 */
2805 if (!list_empty(&tsk->cg_list)) { 2782 if (!list_empty(&tsk->cg_list)) {
2806 write_lock(&css_set_lock); 2783 write_lock(&css_set_lock);
2807 if (!list_empty(&tsk->cg_list)) 2784 if (!list_empty(&tsk->cg_list))
2808 list_del(&tsk->cg_list); 2785 list_del(&tsk->cg_list);
2809 write_unlock(&css_set_lock); 2786 write_unlock(&css_set_lock);
2810 } 2787 }
2811 2788
2812 /* Reassign the task to the init_css_set. */ 2789 /* Reassign the task to the init_css_set. */
2813 task_lock(tsk); 2790 task_lock(tsk);
2814 cg = tsk->cgroups; 2791 cg = tsk->cgroups;
2815 tsk->cgroups = &init_css_set; 2792 tsk->cgroups = &init_css_set;
2816 task_unlock(tsk); 2793 task_unlock(tsk);
2817 if (cg) 2794 if (cg)
2818 put_css_set_taskexit(cg); 2795 put_css_set_taskexit(cg);
2819 } 2796 }
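
For orientation, a hedged sketch of the caller's ordering; do_exit() in kernel/exit.c passes run_callbacks = 1, and this is illustrative, not verbatim exit.c:

/* Illustrative only: the relative ordering that makes the comment's
 * PF_EXITING argument work. */
static void example_exit_ordering(struct task_struct *tsk)
{
        tsk->flags |= PF_EXITING;       /* set earlier in the exit path;
                                         * wards off cgroup_attach_task() */
        /* ... other teardown ... */
        cgroup_exit(tsk, 1);            /* run ss->exit() hooks, then park
                                         * tsk on init_css_set */
}
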
2820 2797
2821 /** 2798 /**
2822 * cgroup_clone - clone the cgroup the given subsystem is attached to 2799 * cgroup_clone - clone the cgroup the given subsystem is attached to
2823 * @tsk: the task to be moved 2800 * @tsk: the task to be moved
2824 * @subsys: the given subsystem 2801 * @subsys: the given subsystem
2825 * 2802 *
2826 * Duplicate the current cgroup in the hierarchy that the given 2803 * Duplicate the current cgroup in the hierarchy that the given
2827 * subsystem is attached to, and move this task into the new 2804 * subsystem is attached to, and move this task into the new
2828 * child. 2805 * child.
2829 */ 2806 */
2830 int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) 2807 int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
2831 { 2808 {
2832 struct dentry *dentry; 2809 struct dentry *dentry;
2833 int ret = 0; 2810 int ret = 0;
2834 char nodename[MAX_CGROUP_TYPE_NAMELEN]; 2811 char nodename[MAX_CGROUP_TYPE_NAMELEN];
2835 struct cgroup *parent, *child; 2812 struct cgroup *parent, *child;
2836 struct inode *inode; 2813 struct inode *inode;
2837 struct css_set *cg; 2814 struct css_set *cg;
2838 struct cgroupfs_root *root; 2815 struct cgroupfs_root *root;
2839 struct cgroup_subsys *ss; 2816 struct cgroup_subsys *ss;
2840 2817
2841 /* We shouldn't be called by an unregistered subsystem */ 2818 /* We shouldn't be called by an unregistered subsystem */
2842 BUG_ON(!subsys->active); 2819 BUG_ON(!subsys->active);
2843 2820
2844 /* First figure out what hierarchy and cgroup we're dealing 2821 /* First figure out what hierarchy and cgroup we're dealing
2845 * with, and pin them so we can drop cgroup_mutex */ 2822 * with, and pin them so we can drop cgroup_mutex */
2846 mutex_lock(&cgroup_mutex); 2823 mutex_lock(&cgroup_mutex);
2847 again: 2824 again:
2848 root = subsys->root; 2825 root = subsys->root;
2849 if (root == &rootnode) { 2826 if (root == &rootnode) {
2850 printk(KERN_INFO 2827 printk(KERN_INFO
2851 "Not cloning cgroup for unused subsystem %s\n", 2828 "Not cloning cgroup for unused subsystem %s\n",
2852 subsys->name); 2829 subsys->name);
2853 mutex_unlock(&cgroup_mutex); 2830 mutex_unlock(&cgroup_mutex);
2854 return 0; 2831 return 0;
2855 } 2832 }
2856 cg = tsk->cgroups; 2833 cg = tsk->cgroups;
2857 parent = task_cgroup(tsk, subsys->subsys_id); 2834 parent = task_cgroup(tsk, subsys->subsys_id);
2858 2835
2859 snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid); 2836 snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid);
2860 2837
2861 /* Pin the hierarchy */ 2838 /* Pin the hierarchy */
2862 atomic_inc(&parent->root->sb->s_active); 2839 atomic_inc(&parent->root->sb->s_active);
2863 2840
2864 /* Keep the cgroup alive */ 2841 /* Keep the cgroup alive */
2865 get_css_set(cg); 2842 get_css_set(cg);
2866 mutex_unlock(&cgroup_mutex); 2843 mutex_unlock(&cgroup_mutex);
2867 2844
2868 /* Now do the VFS work to create a cgroup */ 2845 /* Now do the VFS work to create a cgroup */
2869 inode = parent->dentry->d_inode; 2846 inode = parent->dentry->d_inode;
2870 2847
2871 /* Hold the parent directory mutex across this operation to 2848 /* Hold the parent directory mutex across this operation to
2872 * stop anyone else deleting the new cgroup */ 2849 * stop anyone else deleting the new cgroup */
2873 mutex_lock(&inode->i_mutex); 2850 mutex_lock(&inode->i_mutex);
2874 dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename)); 2851 dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
2875 if (IS_ERR(dentry)) { 2852 if (IS_ERR(dentry)) {
2876 printk(KERN_INFO 2853 printk(KERN_INFO
2877 "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename, 2854 "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
2878 PTR_ERR(dentry)); 2855 PTR_ERR(dentry));
2879 ret = PTR_ERR(dentry); 2856 ret = PTR_ERR(dentry);
2880 goto out_release; 2857 goto out_release;
2881 } 2858 }
2882 2859
2883 /* Create the cgroup directory, which also creates the cgroup */ 2860 /* Create the cgroup directory, which also creates the cgroup */
2884 ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); 2861 ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755);
2885 child = __d_cgrp(dentry); 2862 child = __d_cgrp(dentry);
2886 dput(dentry); 2863 dput(dentry);
2887 if (ret) { 2864 if (ret) {
2888 printk(KERN_INFO 2865 printk(KERN_INFO
2889 "Failed to create cgroup %s: %d\n", nodename, 2866 "Failed to create cgroup %s: %d\n", nodename,
2890 ret); 2867 ret);
2891 goto out_release; 2868 goto out_release;
2892 } 2869 }
2893 2870
2894 if (!child) { 2871 if (!child) {
2895 printk(KERN_INFO 2872 printk(KERN_INFO
2896 "Couldn't find new cgroup %s\n", nodename); 2873 "Couldn't find new cgroup %s\n", nodename);
2897 ret = -ENOMEM; 2874 ret = -ENOMEM;
2898 goto out_release; 2875 goto out_release;
2899 } 2876 }
2900 2877
2901 /* The cgroup now exists. Retake cgroup_mutex and check 2878 /* The cgroup now exists. Retake cgroup_mutex and check
2902 * that we're still in the same state that we thought we 2879 * that we're still in the same state that we thought we
2903 * were. */ 2880 * were. */
2904 mutex_lock(&cgroup_mutex); 2881 mutex_lock(&cgroup_mutex);
2905 if ((root != subsys->root) || 2882 if ((root != subsys->root) ||
2906 (parent != task_cgroup(tsk, subsys->subsys_id))) { 2883 (parent != task_cgroup(tsk, subsys->subsys_id))) {
2907 /* Aargh, we raced ... */ 2884 /* Aargh, we raced ... */
2908 mutex_unlock(&inode->i_mutex); 2885 mutex_unlock(&inode->i_mutex);
2909 put_css_set(cg); 2886 put_css_set(cg);
2910 2887
2911 deactivate_super(parent->root->sb); 2888 deactivate_super(parent->root->sb);
2912 /* The cgroup is still accessible in the VFS, but 2889 /* The cgroup is still accessible in the VFS, but
2913 * we're not going to try to rmdir() it at this 2890 * we're not going to try to rmdir() it at this
2914 * point. */ 2891 * point. */
2915 printk(KERN_INFO 2892 printk(KERN_INFO
2916 "Race in cgroup_clone() - leaking cgroup %s\n", 2893 "Race in cgroup_clone() - leaking cgroup %s\n",
2917 nodename); 2894 nodename);
2918 goto again; 2895 goto again;
2919 } 2896 }
2920 2897
2921 /* do any required auto-setup */ 2898 /* do any required auto-setup */
2922 for_each_subsys(root, ss) { 2899 for_each_subsys(root, ss) {
2923 if (ss->post_clone) 2900 if (ss->post_clone)
2924 ss->post_clone(ss, child); 2901 ss->post_clone(ss, child);
2925 } 2902 }
2926 2903
2927 /* All seems fine. Finish by moving the task into the new cgroup */ 2904 /* All seems fine. Finish by moving the task into the new cgroup */
2928 ret = cgroup_attach_task(child, tsk); 2905 ret = cgroup_attach_task(child, tsk);
2929 mutex_unlock(&cgroup_mutex); 2906 mutex_unlock(&cgroup_mutex);
2930 2907
2931 out_release: 2908 out_release:
2932 mutex_unlock(&inode->i_mutex); 2909 mutex_unlock(&inode->i_mutex);
2933 2910
2934 mutex_lock(&cgroup_mutex); 2911 mutex_lock(&cgroup_mutex);
2935 put_css_set(cg); 2912 put_css_set(cg);
2936 mutex_unlock(&cgroup_mutex); 2913 mutex_unlock(&cgroup_mutex);
2937 deactivate_super(parent->root->sb); 2914 deactivate_super(parent->root->sb);
2938 return ret; 2915 return ret;
2939 } 2916 }
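
The intended caller here is the ns (nsproxy) subsystem; a hedged sketch of such a call site, assuming an ns_subsys registered elsewhere (as in kernel/ns_cgroup.c):

/* Hedged sketch: clone the task's cgroup in the ns hierarchy when it
 * unshares a namespace, moving it into a child named "node_<pid>". */
int example_ns_clone(struct task_struct *tsk)
{
        return cgroup_clone(tsk, &ns_subsys);
}
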
2940 2917
2941 /** 2918 /**
2942 * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp 2919 * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp
2943 * @cgrp: the cgroup in question 2920 * @cgrp: the cgroup in question
2944 * 2921 *
2945 * See if @cgrp is a descendant of the current task's cgroup in 2922 * See if @cgrp is a descendant of the current task's cgroup in
2946 * the appropriate hierarchy. 2923 * the appropriate hierarchy.
2947 * 2924 *
2948 * If we are sending in dummytop, then presumably we are creating 2925 * If we are sending in dummytop, then presumably we are creating
2949 * the top cgroup in the subsystem. 2926 * the top cgroup in the subsystem.
2950 * 2927 *
2951 * Called only by the ns (nsproxy) cgroup. 2928 * Called only by the ns (nsproxy) cgroup.
2952 */ 2929 */
2953 int cgroup_is_descendant(const struct cgroup *cgrp) 2930 int cgroup_is_descendant(const struct cgroup *cgrp)
2954 { 2931 {
2955 int ret; 2932 int ret;
2956 struct cgroup *target; 2933 struct cgroup *target;
2957 int subsys_id; 2934 int subsys_id;
2958 2935
2959 if (cgrp == dummytop) 2936 if (cgrp == dummytop)
2960 return 1; 2937 return 1;
2961 2938
2962 get_first_subsys(cgrp, NULL, &subsys_id); 2939 get_first_subsys(cgrp, NULL, &subsys_id);
2963 target = task_cgroup(current, subsys_id); 2940 target = task_cgroup(current, subsys_id);
2964 while (cgrp != target && cgrp != cgrp->top_cgroup) 2941 while (cgrp != target && cgrp != cgrp->top_cgroup)
2965 cgrp = cgrp->parent; 2942 cgrp = cgrp->parent;
2966 ret = (cgrp == target); 2943 ret = (cgrp == target);
2967 return ret; 2944 return ret;
2968 } 2945 }
2969 2946
2970 static void check_for_release(struct cgroup *cgrp) 2947 static void check_for_release(struct cgroup *cgrp)
2971 { 2948 {
2972 /* All of these checks rely on RCU to keep the cgroup 2949 /* All of these checks rely on RCU to keep the cgroup
2973 * structure alive */ 2950 * structure alive */
2974 if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) 2951 if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
2975 && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { 2952 && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
2976 /* Control Group is currently removable. If it's not 2953 /* Control Group is currently removable. If it's not
2977 * already queued for a userspace notification, queue 2954 * already queued for a userspace notification, queue
2978 * it now */ 2955 * it now */
2979 int need_schedule_work = 0; 2956 int need_schedule_work = 0;
2980 spin_lock(&release_list_lock); 2957 spin_lock(&release_list_lock);
2981 if (!cgroup_is_removed(cgrp) && 2958 if (!cgroup_is_removed(cgrp) &&
2982 list_empty(&cgrp->release_list)) { 2959 list_empty(&cgrp->release_list)) {
2983 list_add(&cgrp->release_list, &release_list); 2960 list_add(&cgrp->release_list, &release_list);
2984 need_schedule_work = 1; 2961 need_schedule_work = 1;
2985 } 2962 }
2986 spin_unlock(&release_list_lock); 2963 spin_unlock(&release_list_lock);
2987 if (need_schedule_work) 2964 if (need_schedule_work)
2988 schedule_work(&release_agent_work); 2965 schedule_work(&release_agent_work);
2989 } 2966 }
2990 } 2967 }
2991 2968
2992 void __css_put(struct cgroup_subsys_state *css) 2969 void __css_put(struct cgroup_subsys_state *css)
2993 { 2970 {
2994 struct cgroup *cgrp = css->cgroup; 2971 struct cgroup *cgrp = css->cgroup;
2995 rcu_read_lock(); 2972 rcu_read_lock();
2996 if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) { 2973 if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) {
2997 set_bit(CGRP_RELEASABLE, &cgrp->flags); 2974 set_bit(CGRP_RELEASABLE, &cgrp->flags);
2998 check_for_release(cgrp); 2975 check_for_release(cgrp);
2999 } 2976 }
3000 rcu_read_unlock(); 2977 rcu_read_unlock();
3001 } 2978 }
3002 2979
3003 /* 2980 /*
3004 * Notify userspace when a cgroup is released, by running the 2981 * Notify userspace when a cgroup is released, by running the
3005 * configured release agent with the name of the cgroup (path 2982 * configured release agent with the name of the cgroup (path
3006 * relative to the root of cgroup file system) as the argument. 2983 * relative to the root of cgroup file system) as the argument.
3007 * 2984 *
3008 * Most likely, this user command will try to rmdir this cgroup. 2985 * Most likely, this user command will try to rmdir this cgroup.
3009 * 2986 *
3010 * This races with the possibility that some other task will be 2987 * This races with the possibility that some other task will be
3011 * attached to this cgroup before it is removed, or that some other 2988 * attached to this cgroup before it is removed, or that some other
3012 * user task will 'mkdir' a child cgroup of this cgroup. That's ok. 2989 * user task will 'mkdir' a child cgroup of this cgroup. That's ok.
3013 * The presumed 'rmdir' will fail quietly if this cgroup is no longer 2990 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
3014 * unused, and this cgroup will be reprieved from its death sentence, 2991 * unused, and this cgroup will be reprieved from its death sentence,
3015 * to continue to serve a useful existence. Next time it's released, 2992 * to continue to serve a useful existence. Next time it's released,
3016 * we will get notified again, if it still has 'notify_on_release' set. 2993 * we will get notified again, if it still has 'notify_on_release' set.
3017 * 2994 *
3018 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which 2995 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
3019 * means only wait until the task is successfully execve()'d. The 2996 * means only wait until the task is successfully execve()'d. The
3020 * separate release agent task is forked by call_usermodehelper(), 2997 * separate release agent task is forked by call_usermodehelper(),
3021 * then control in this thread returns here, without waiting for the 2998 * then control in this thread returns here, without waiting for the
3022 * release agent task. We don't bother to wait because the caller of 2999 * release agent task. We don't bother to wait because the caller of
3023 * this routine has no use for the exit status of the release agent 3000 * this routine has no use for the exit status of the release agent
3024 * task, so no sense holding our caller up for that. 3001 * task, so no sense holding our caller up for that.
3025 */ 3002 */
3026 static void cgroup_release_agent(struct work_struct *work) 3003 static void cgroup_release_agent(struct work_struct *work)
3027 { 3004 {
3028 BUG_ON(work != &release_agent_work); 3005 BUG_ON(work != &release_agent_work);
3029 mutex_lock(&cgroup_mutex); 3006 mutex_lock(&cgroup_mutex);
3030 spin_lock(&release_list_lock); 3007 spin_lock(&release_list_lock);
3031 while (!list_empty(&release_list)) { 3008 while (!list_empty(&release_list)) {
3032 char *argv[3], *envp[3]; 3009 char *argv[3], *envp[3];
3033 int i; 3010 int i;
3034 char *pathbuf; 3011 char *pathbuf;
3035 struct cgroup *cgrp = list_entry(release_list.next, 3012 struct cgroup *cgrp = list_entry(release_list.next,
3036 struct cgroup, 3013 struct cgroup,
3037 release_list); 3014 release_list);
3038 list_del_init(&cgrp->release_list); 3015 list_del_init(&cgrp->release_list);
3039 spin_unlock(&release_list_lock); 3016 spin_unlock(&release_list_lock);
3040 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 3017 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
3041 if (!pathbuf) { 3018 if (!pathbuf) {
3042 spin_lock(&release_list_lock); 3019 spin_lock(&release_list_lock);
3043 continue; 3020 continue;
3044 } 3021 }
3045 3022
3046 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) { 3023 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) {
3047 kfree(pathbuf); 3024 kfree(pathbuf);
3048 spin_lock(&release_list_lock); 3025 spin_lock(&release_list_lock);
3049 continue; 3026 continue;
3050 } 3027 }
3051 3028
3052 i = 0; 3029 i = 0;
3053 argv[i++] = cgrp->root->release_agent_path; 3030 argv[i++] = cgrp->root->release_agent_path;
3054 argv[i++] = (char *)pathbuf; 3031 argv[i++] = (char *)pathbuf;
3055 argv[i] = NULL; 3032 argv[i] = NULL;
3056 3033
3057 i = 0; 3034 i = 0;
3058 /* minimal command environment */ 3035 /* minimal command environment */
3059 envp[i++] = "HOME=/"; 3036 envp[i++] = "HOME=/";
3060 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; 3037 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
3061 envp[i] = NULL; 3038 envp[i] = NULL;
3062 3039
3063 /* Drop the lock while we invoke the usermode helper, 3040 /* Drop the lock while we invoke the usermode helper,
3064 * since the exec could involve hitting disk and hence 3041 * since the exec could involve hitting disk and hence
3065 * be a slow process */ 3042 * be a slow process */
3066 mutex_unlock(&cgroup_mutex); 3043 mutex_unlock(&cgroup_mutex);
3067 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); 3044 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
3068 kfree(pathbuf); 3045 kfree(pathbuf);
3069 mutex_lock(&cgroup_mutex); 3046 mutex_lock(&cgroup_mutex);
3070 spin_lock(&release_list_lock); 3047 spin_lock(&release_list_lock);
3071 } 3048 }
3072 spin_unlock(&release_list_lock); 3049 spin_unlock(&release_list_lock);
3073 mutex_unlock(&cgroup_mutex); 3050 mutex_unlock(&cgroup_mutex);
3074 } 3051 }
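
For illustration, a minimal release agent in C. The kernel passes the cgroup's path relative to the hierarchy root as argv[1] (see the argv[] setup above); the /dev/cgroup mount point is an assumption, so substitute wherever the hierarchy is actually mounted.

#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        char path[4096];

        if (argc != 2)
                return 1;
        /* argv[1] is root-relative, e.g. "/node_1234" */
        snprintf(path, sizeof(path), "/dev/cgroup%s", argv[1]);
        if (rmdir(path) < 0)
                return 1;       /* fails quietly if the cgroup was
                                 * reprieved; see the comment above */
        return 0;
}
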
3075 3052
3076 static int __init cgroup_disable(char *str) 3053 static int __init cgroup_disable(char *str)
3077 { 3054 {
3078 int i; 3055 int i;
3079 char *token; 3056 char *token;
3080 3057
3081 while ((token = strsep(&str, ",")) != NULL) { 3058 while ((token = strsep(&str, ",")) != NULL) {
3082 if (!*token) 3059 if (!*token)
3083 continue; 3060 continue;
3084 3061
3085 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 3062 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
3086 struct cgroup_subsys *ss = subsys[i]; 3063 struct cgroup_subsys *ss = subsys[i];
3087 3064
3088 if (!strcmp(token, ss->name)) { 3065 if (!strcmp(token, ss->name)) {
3089 ss->disabled = 1; 3066 ss->disabled = 1;
3090 printk(KERN_INFO "Disabling %s control group" 3067 printk(KERN_INFO "Disabling %s control group"
3091 " subsystem\n", ss->name); 3068 " subsystem\n", ss->name);
3092 break; 3069 break;
3093 } 3070 }
3094 } 3071 }
3095 } 3072 }
3096 return 1; 3073 return 1;
3097 } 3074 }
3098 __setup("cgroup_disable=", cgroup_disable); 3075 __setup("cgroup_disable=", cgroup_disable);
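
Usage note: this is a boot-time parameter, so passing e.g. cgroup_disable=memory on the kernel command line (for a subsystem compiled into the kernel) marks that subsystem disabled before any hierarchy can be mounted; the !ss->disabled expression in proc_cgroupstats_show() above is what surfaces the result in the "enabled" column of /proc/cgroups.
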
3099 3076
kernel/cgroup_debug.c
1 /* 1 /*
2 * kernel/ccontainer_debug.c - Example cgroup subsystem that 2 * kernel/cgroup_debug.c - Example cgroup subsystem that
3 * exposes debug info 3 * exposes debug info
4 * 4 *
5 * Copyright (C) Google Inc, 2007 5 * Copyright (C) Google Inc, 2007
6 * 6 *
7 * Developed by Paul Menage (menage@google.com) 7 * Developed by Paul Menage (menage@google.com)
8 * 8 *
9 */ 9 */
10 10
11 #include <linux/cgroup.h> 11 #include <linux/cgroup.h>
12 #include <linux/fs.h> 12 #include <linux/fs.h>
13 #include <linux/slab.h> 13 #include <linux/slab.h>
14 #include <linux/rcupdate.h> 14 #include <linux/rcupdate.h>
15 15
16 #include <asm/atomic.h> 16 #include <asm/atomic.h>
17 17
18 static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, 18 static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
19 struct cgroup *cont) 19 struct cgroup *cont)
20 { 20 {
21 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 21 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
22 22
23 if (!css) 23 if (!css)
24 return ERR_PTR(-ENOMEM); 24 return ERR_PTR(-ENOMEM);
25 25
26 return css; 26 return css;
27 } 27 }
28 28
29 static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 29 static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
30 { 30 {
31 kfree(cont->subsys[debug_subsys_id]); 31 kfree(cont->subsys[debug_subsys_id]);
32 } 32 }
33 33
34 static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) 34 static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
35 { 35 {
36 return atomic_read(&cont->count); 36 return atomic_read(&cont->count);
37 } 37 }
38 38
39 static u64 taskcount_read(struct cgroup *cont, struct cftype *cft) 39 static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
40 { 40 {
41 u64 count; 41 u64 count;
42 42
43 cgroup_lock(); 43 cgroup_lock();
44 count = cgroup_task_count(cont); 44 count = cgroup_task_count(cont);
45 cgroup_unlock(); 45 cgroup_unlock();
46 return count; 46 return count;
47 } 47 }
48 48
49 static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) 49 static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
50 { 50 {
51 return (u64)(long)current->cgroups; 51 return (u64)(long)current->cgroups;
52 } 52 }
53 53
54 static u64 current_css_set_refcount_read(struct cgroup *cont, 54 static u64 current_css_set_refcount_read(struct cgroup *cont,
55 struct cftype *cft) 55 struct cftype *cft)
56 { 56 {
57 u64 count; 57 u64 count;
58 58
59 rcu_read_lock(); 59 rcu_read_lock();
60 count = atomic_read(&current->cgroups->ref.refcount); 60 count = atomic_read(&current->cgroups->ref.refcount);
61 rcu_read_unlock(); 61 rcu_read_unlock();
62 return count; 62 return count;
63 } 63 }
64 64
65 static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
66 {
67 return test_bit(CGRP_RELEASABLE, &cgrp->flags);
68 }
69
65 static struct cftype files[] = { 70 static struct cftype files[] = {
66 { 71 {
67 .name = "cgroup_refcount", 72 .name = "cgroup_refcount",
68 .read_u64 = cgroup_refcount_read, 73 .read_u64 = cgroup_refcount_read,
69 }, 74 },
70 { 75 {
71 .name = "taskcount", 76 .name = "taskcount",
72 .read_u64 = taskcount_read, 77 .read_u64 = taskcount_read,
73 }, 78 },
74 79
75 { 80 {
76 .name = "current_css_set", 81 .name = "current_css_set",
77 .read_u64 = current_css_set_read, 82 .read_u64 = current_css_set_read,
78 }, 83 },
79 84
80 { 85 {
81 .name = "current_css_set_refcount", 86 .name = "current_css_set_refcount",
82 .read_u64 = current_css_set_refcount_read, 87 .read_u64 = current_css_set_refcount_read,
83 }, 88 },
89
90 {
91 .name = "releasable",
92 .read_u64 = releasable_read,
93 }
84 }; 94 };
85 95
86 static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) 96 static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
87 { 97 {
88 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); 98 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
89 } 99 }
90 100
91 struct cgroup_subsys debug_subsys = { 101 struct cgroup_subsys debug_subsys = {
92 .name = "debug", 102 .name = "debug",
93 .create = debug_create, 103 .create = debug_create,
94 .destroy = debug_destroy, 104 .destroy = debug_destroy,
95 .populate = debug_populate, 105 .populate = debug_populate,
96 .subsys_id = debug_subsys_id, 106 .subsys_id = debug_subsys_id,
97 }; 107 };
98 108
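
With this change, the flag behind "releasable" is only visible when the debug subsystem is mounted, e.g. via mount -t cgroup -o debug cgroup /mnt (an illustrative mount point); the file then appears in each cgroup directory, as debug.releasable unless the hierarchy was mounted with the noprefix option.
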