Commit 9623e078c1f4692a91531af2f639ec8aff8f0472
Committed by Linus Torvalds
Parent (1): 3ee1062b4e
Exists in master and in 7 other branches
memcg: fix oops in mem_cgroup_shrink_usage
Got an oops in mem_cgroup_shrink_usage() when testing loop over tmpfs: yes, of course, loop0 has no mm: other entry points check but this didn't.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
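For quick reference, this is mem_cgroup_shrink_usage() as it reads with the patch applied, assembled from the new-side column of the diff below (not an independent implementation); the two added lines are the NULL-mm guard that stops the oops when a caller such as the loop driver has no mm:

	int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
	{
		struct mem_cgroup *mem;
		int progress = 0;
		int retry = MEM_CGROUP_RECLAIM_RETRIES;

		if (mem_cgroup_subsys.disabled)
			return 0;
		if (!mm)		/* added by this commit: e.g. loop0 over tmpfs has no mm */
			return 0;	/* added by this commit: nothing to charge against, bail out */

		rcu_read_lock();
		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
		css_get(&mem->css);
		rcu_read_unlock();

		do {
			progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
		} while (!progress && --retry);

		css_put(&mem->css);
		if (!retry)
			return -ENOMEM;
		return 0;
	}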
Showing 1 changed file with 2 additions and 0 deletions (inline diff).
mm/memcontrol.c
1 | /* memcontrol.c - Memory Controller | 1 | /* memcontrol.c - Memory Controller |
2 | * | 2 | * |
3 | * Copyright IBM Corporation, 2007 | 3 | * Copyright IBM Corporation, 2007 |
4 | * Author Balbir Singh <balbir@linux.vnet.ibm.com> | 4 | * Author Balbir Singh <balbir@linux.vnet.ibm.com> |
5 | * | 5 | * |
6 | * Copyright 2007 OpenVZ SWsoft Inc | 6 | * Copyright 2007 OpenVZ SWsoft Inc |
7 | * Author: Pavel Emelianov <xemul@openvz.org> | 7 | * Author: Pavel Emelianov <xemul@openvz.org> |
8 | * | 8 | * |
9 | * This program is free software; you can redistribute it and/or modify | 9 | * This program is free software; you can redistribute it and/or modify |
10 | * it under the terms of the GNU General Public License as published by | 10 | * it under the terms of the GNU General Public License as published by |
11 | * the Free Software Foundation; either version 2 of the License, or | 11 | * the Free Software Foundation; either version 2 of the License, or |
12 | * (at your option) any later version. | 12 | * (at your option) any later version. |
13 | * | 13 | * |
14 | * This program is distributed in the hope that it will be useful, | 14 | * This program is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | * GNU General Public License for more details. | 17 | * GNU General Public License for more details. |
18 | */ | 18 | */ |
19 | 19 | ||
20 | #include <linux/res_counter.h> | 20 | #include <linux/res_counter.h> |
21 | #include <linux/memcontrol.h> | 21 | #include <linux/memcontrol.h> |
22 | #include <linux/cgroup.h> | 22 | #include <linux/cgroup.h> |
23 | #include <linux/mm.h> | 23 | #include <linux/mm.h> |
24 | #include <linux/smp.h> | 24 | #include <linux/smp.h> |
25 | #include <linux/page-flags.h> | 25 | #include <linux/page-flags.h> |
26 | #include <linux/backing-dev.h> | 26 | #include <linux/backing-dev.h> |
27 | #include <linux/bit_spinlock.h> | 27 | #include <linux/bit_spinlock.h> |
28 | #include <linux/rcupdate.h> | 28 | #include <linux/rcupdate.h> |
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/swap.h> | 30 | #include <linux/swap.h> |
31 | #include <linux/spinlock.h> | 31 | #include <linux/spinlock.h> |
32 | #include <linux/fs.h> | 32 | #include <linux/fs.h> |
33 | #include <linux/seq_file.h> | 33 | #include <linux/seq_file.h> |
34 | #include <linux/vmalloc.h> | 34 | #include <linux/vmalloc.h> |
35 | 35 | ||
36 | #include <asm/uaccess.h> | 36 | #include <asm/uaccess.h> |
37 | 37 | ||
38 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 38 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
39 | static struct kmem_cache *page_cgroup_cache __read_mostly; | 39 | static struct kmem_cache *page_cgroup_cache __read_mostly; |
40 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 40 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
41 | 41 | ||
42 | /* | 42 | /* |
43 | * Statistics for memory cgroup. | 43 | * Statistics for memory cgroup. |
44 | */ | 44 | */ |
45 | enum mem_cgroup_stat_index { | 45 | enum mem_cgroup_stat_index { |
46 | /* | 46 | /* |
47 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. | 47 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. |
48 | */ | 48 | */ |
49 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 49 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
50 | MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ | 50 | MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ |
51 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 51 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
52 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 52 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
53 | 53 | ||
54 | MEM_CGROUP_STAT_NSTATS, | 54 | MEM_CGROUP_STAT_NSTATS, |
55 | }; | 55 | }; |
56 | 56 | ||
57 | struct mem_cgroup_stat_cpu { | 57 | struct mem_cgroup_stat_cpu { |
58 | s64 count[MEM_CGROUP_STAT_NSTATS]; | 58 | s64 count[MEM_CGROUP_STAT_NSTATS]; |
59 | } ____cacheline_aligned_in_smp; | 59 | } ____cacheline_aligned_in_smp; |
60 | 60 | ||
61 | struct mem_cgroup_stat { | 61 | struct mem_cgroup_stat { |
62 | struct mem_cgroup_stat_cpu cpustat[NR_CPUS]; | 62 | struct mem_cgroup_stat_cpu cpustat[NR_CPUS]; |
63 | }; | 63 | }; |
64 | 64 | ||
65 | /* | 65 | /* |
66 | * For accounting under irq disable, no need for increment preempt count. | 66 | * For accounting under irq disable, no need for increment preempt count. |
67 | */ | 67 | */ |
68 | static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat, | 68 | static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat, |
69 | enum mem_cgroup_stat_index idx, int val) | 69 | enum mem_cgroup_stat_index idx, int val) |
70 | { | 70 | { |
71 | int cpu = smp_processor_id(); | 71 | int cpu = smp_processor_id(); |
72 | stat->cpustat[cpu].count[idx] += val; | 72 | stat->cpustat[cpu].count[idx] += val; |
73 | } | 73 | } |
74 | 74 | ||
75 | static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, | 75 | static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, |
76 | enum mem_cgroup_stat_index idx) | 76 | enum mem_cgroup_stat_index idx) |
77 | { | 77 | { |
78 | int cpu; | 78 | int cpu; |
79 | s64 ret = 0; | 79 | s64 ret = 0; |
80 | for_each_possible_cpu(cpu) | 80 | for_each_possible_cpu(cpu) |
81 | ret += stat->cpustat[cpu].count[idx]; | 81 | ret += stat->cpustat[cpu].count[idx]; |
82 | return ret; | 82 | return ret; |
83 | } | 83 | } |
84 | 84 | ||
85 | /* | 85 | /* |
86 | * per-zone information in memory controller. | 86 | * per-zone information in memory controller. |
87 | */ | 87 | */ |
88 | 88 | ||
89 | enum mem_cgroup_zstat_index { | 89 | enum mem_cgroup_zstat_index { |
90 | MEM_CGROUP_ZSTAT_ACTIVE, | 90 | MEM_CGROUP_ZSTAT_ACTIVE, |
91 | MEM_CGROUP_ZSTAT_INACTIVE, | 91 | MEM_CGROUP_ZSTAT_INACTIVE, |
92 | 92 | ||
93 | NR_MEM_CGROUP_ZSTAT, | 93 | NR_MEM_CGROUP_ZSTAT, |
94 | }; | 94 | }; |
95 | 95 | ||
96 | struct mem_cgroup_per_zone { | 96 | struct mem_cgroup_per_zone { |
97 | /* | 97 | /* |
98 | * spin_lock to protect the per cgroup LRU | 98 | * spin_lock to protect the per cgroup LRU |
99 | */ | 99 | */ |
100 | spinlock_t lru_lock; | 100 | spinlock_t lru_lock; |
101 | struct list_head active_list; | 101 | struct list_head active_list; |
102 | struct list_head inactive_list; | 102 | struct list_head inactive_list; |
103 | unsigned long count[NR_MEM_CGROUP_ZSTAT]; | 103 | unsigned long count[NR_MEM_CGROUP_ZSTAT]; |
104 | }; | 104 | }; |
105 | /* Macro for accessing counter */ | 105 | /* Macro for accessing counter */ |
106 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) | 106 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) |
107 | 107 | ||
108 | struct mem_cgroup_per_node { | 108 | struct mem_cgroup_per_node { |
109 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; | 109 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; |
110 | }; | 110 | }; |
111 | 111 | ||
112 | struct mem_cgroup_lru_info { | 112 | struct mem_cgroup_lru_info { |
113 | struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; | 113 | struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; |
114 | }; | 114 | }; |
115 | 115 | ||
116 | /* | 116 | /* |
117 | * The memory controller data structure. The memory controller controls both | 117 | * The memory controller data structure. The memory controller controls both |
118 | * page cache and RSS per cgroup. We would eventually like to provide | 118 | * page cache and RSS per cgroup. We would eventually like to provide |
119 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, | 119 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, |
120 | * to help the administrator determine what knobs to tune. | 120 | * to help the administrator determine what knobs to tune. |
121 | * | 121 | * |
122 | * TODO: Add a water mark for the memory controller. Reclaim will begin when | 122 | * TODO: Add a water mark for the memory controller. Reclaim will begin when |
123 | * we hit the water mark. May be even add a low water mark, such that | 123 | * we hit the water mark. May be even add a low water mark, such that |
124 | * no reclaim occurs from a cgroup at it's low water mark, this is | 124 | * no reclaim occurs from a cgroup at it's low water mark, this is |
125 | * a feature that will be implemented much later in the future. | 125 | * a feature that will be implemented much later in the future. |
126 | */ | 126 | */ |
127 | struct mem_cgroup { | 127 | struct mem_cgroup { |
128 | struct cgroup_subsys_state css; | 128 | struct cgroup_subsys_state css; |
129 | /* | 129 | /* |
130 | * the counter to account for memory usage | 130 | * the counter to account for memory usage |
131 | */ | 131 | */ |
132 | struct res_counter res; | 132 | struct res_counter res; |
133 | /* | 133 | /* |
134 | * Per cgroup active and inactive list, similar to the | 134 | * Per cgroup active and inactive list, similar to the |
135 | * per zone LRU lists. | 135 | * per zone LRU lists. |
136 | */ | 136 | */ |
137 | struct mem_cgroup_lru_info info; | 137 | struct mem_cgroup_lru_info info; |
138 | 138 | ||
139 | int prev_priority; /* for recording reclaim priority */ | 139 | int prev_priority; /* for recording reclaim priority */ |
140 | /* | 140 | /* |
141 | * statistics. | 141 | * statistics. |
142 | */ | 142 | */ |
143 | struct mem_cgroup_stat stat; | 143 | struct mem_cgroup_stat stat; |
144 | }; | 144 | }; |
145 | static struct mem_cgroup init_mem_cgroup; | 145 | static struct mem_cgroup init_mem_cgroup; |
146 | 146 | ||
147 | /* | 147 | /* |
148 | * We use the lower bit of the page->page_cgroup pointer as a bit spin | 148 | * We use the lower bit of the page->page_cgroup pointer as a bit spin |
149 | * lock. We need to ensure that page->page_cgroup is at least two | 149 | * lock. We need to ensure that page->page_cgroup is at least two |
150 | * byte aligned (based on comments from Nick Piggin). But since | 150 | * byte aligned (based on comments from Nick Piggin). But since |
151 | * bit_spin_lock doesn't actually set that lock bit in a non-debug | 151 | * bit_spin_lock doesn't actually set that lock bit in a non-debug |
152 | * uniprocessor kernel, we should avoid setting it here too. | 152 | * uniprocessor kernel, we should avoid setting it here too. |
153 | */ | 153 | */ |
154 | #define PAGE_CGROUP_LOCK_BIT 0x0 | 154 | #define PAGE_CGROUP_LOCK_BIT 0x0 |
155 | #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) | 155 | #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) |
156 | #define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) | 156 | #define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) |
157 | #else | 157 | #else |
158 | #define PAGE_CGROUP_LOCK 0x0 | 158 | #define PAGE_CGROUP_LOCK 0x0 |
159 | #endif | 159 | #endif |
160 | 160 | ||
161 | /* | 161 | /* |
162 | * A page_cgroup page is associated with every page descriptor. The | 162 | * A page_cgroup page is associated with every page descriptor. The |
163 | * page_cgroup helps us identify information about the cgroup | 163 | * page_cgroup helps us identify information about the cgroup |
164 | */ | 164 | */ |
165 | struct page_cgroup { | 165 | struct page_cgroup { |
166 | struct list_head lru; /* per cgroup LRU list */ | 166 | struct list_head lru; /* per cgroup LRU list */ |
167 | struct page *page; | 167 | struct page *page; |
168 | struct mem_cgroup *mem_cgroup; | 168 | struct mem_cgroup *mem_cgroup; |
169 | int flags; | 169 | int flags; |
170 | }; | 170 | }; |
171 | #define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ | 171 | #define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ |
172 | #define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ | 172 | #define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ |
173 | 173 | ||
174 | static int page_cgroup_nid(struct page_cgroup *pc) | 174 | static int page_cgroup_nid(struct page_cgroup *pc) |
175 | { | 175 | { |
176 | return page_to_nid(pc->page); | 176 | return page_to_nid(pc->page); |
177 | } | 177 | } |
178 | 178 | ||
179 | static enum zone_type page_cgroup_zid(struct page_cgroup *pc) | 179 | static enum zone_type page_cgroup_zid(struct page_cgroup *pc) |
180 | { | 180 | { |
181 | return page_zonenum(pc->page); | 181 | return page_zonenum(pc->page); |
182 | } | 182 | } |
183 | 183 | ||
184 | enum charge_type { | 184 | enum charge_type { |
185 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 185 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
186 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 186 | MEM_CGROUP_CHARGE_TYPE_MAPPED, |
187 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ | 187 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ |
188 | }; | 188 | }; |
189 | 189 | ||
190 | /* | 190 | /* |
191 | * Always modified under lru lock. Then, not necessary to preempt_disable() | 191 | * Always modified under lru lock. Then, not necessary to preempt_disable() |
192 | */ | 192 | */ |
193 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, | 193 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, |
194 | bool charge) | 194 | bool charge) |
195 | { | 195 | { |
196 | int val = (charge)? 1 : -1; | 196 | int val = (charge)? 1 : -1; |
197 | struct mem_cgroup_stat *stat = &mem->stat; | 197 | struct mem_cgroup_stat *stat = &mem->stat; |
198 | 198 | ||
199 | VM_BUG_ON(!irqs_disabled()); | 199 | VM_BUG_ON(!irqs_disabled()); |
200 | if (flags & PAGE_CGROUP_FLAG_CACHE) | 200 | if (flags & PAGE_CGROUP_FLAG_CACHE) |
201 | __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val); | 201 | __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val); |
202 | else | 202 | else |
203 | __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); | 203 | __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); |
204 | 204 | ||
205 | if (charge) | 205 | if (charge) |
206 | __mem_cgroup_stat_add_safe(stat, | 206 | __mem_cgroup_stat_add_safe(stat, |
207 | MEM_CGROUP_STAT_PGPGIN_COUNT, 1); | 207 | MEM_CGROUP_STAT_PGPGIN_COUNT, 1); |
208 | else | 208 | else |
209 | __mem_cgroup_stat_add_safe(stat, | 209 | __mem_cgroup_stat_add_safe(stat, |
210 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); | 210 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); |
211 | } | 211 | } |
212 | 212 | ||
213 | static struct mem_cgroup_per_zone * | 213 | static struct mem_cgroup_per_zone * |
214 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | 214 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) |
215 | { | 215 | { |
216 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | 216 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; |
217 | } | 217 | } |
218 | 218 | ||
219 | static struct mem_cgroup_per_zone * | 219 | static struct mem_cgroup_per_zone * |
220 | page_cgroup_zoneinfo(struct page_cgroup *pc) | 220 | page_cgroup_zoneinfo(struct page_cgroup *pc) |
221 | { | 221 | { |
222 | struct mem_cgroup *mem = pc->mem_cgroup; | 222 | struct mem_cgroup *mem = pc->mem_cgroup; |
223 | int nid = page_cgroup_nid(pc); | 223 | int nid = page_cgroup_nid(pc); |
224 | int zid = page_cgroup_zid(pc); | 224 | int zid = page_cgroup_zid(pc); |
225 | 225 | ||
226 | return mem_cgroup_zoneinfo(mem, nid, zid); | 226 | return mem_cgroup_zoneinfo(mem, nid, zid); |
227 | } | 227 | } |
228 | 228 | ||
229 | static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, | 229 | static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, |
230 | enum mem_cgroup_zstat_index idx) | 230 | enum mem_cgroup_zstat_index idx) |
231 | { | 231 | { |
232 | int nid, zid; | 232 | int nid, zid; |
233 | struct mem_cgroup_per_zone *mz; | 233 | struct mem_cgroup_per_zone *mz; |
234 | u64 total = 0; | 234 | u64 total = 0; |
235 | 235 | ||
236 | for_each_online_node(nid) | 236 | for_each_online_node(nid) |
237 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 237 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
238 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | 238 | mz = mem_cgroup_zoneinfo(mem, nid, zid); |
239 | total += MEM_CGROUP_ZSTAT(mz, idx); | 239 | total += MEM_CGROUP_ZSTAT(mz, idx); |
240 | } | 240 | } |
241 | return total; | 241 | return total; |
242 | } | 242 | } |
243 | 243 | ||
244 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | 244 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
245 | { | 245 | { |
246 | return container_of(cgroup_subsys_state(cont, | 246 | return container_of(cgroup_subsys_state(cont, |
247 | mem_cgroup_subsys_id), struct mem_cgroup, | 247 | mem_cgroup_subsys_id), struct mem_cgroup, |
248 | css); | 248 | css); |
249 | } | 249 | } |
250 | 250 | ||
251 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | 251 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) |
252 | { | 252 | { |
253 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), | 253 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), |
254 | struct mem_cgroup, css); | 254 | struct mem_cgroup, css); |
255 | } | 255 | } |
256 | 256 | ||
257 | static inline int page_cgroup_locked(struct page *page) | 257 | static inline int page_cgroup_locked(struct page *page) |
258 | { | 258 | { |
259 | return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | 259 | return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); |
260 | } | 260 | } |
261 | 261 | ||
262 | static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) | 262 | static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) |
263 | { | 263 | { |
264 | VM_BUG_ON(!page_cgroup_locked(page)); | 264 | VM_BUG_ON(!page_cgroup_locked(page)); |
265 | page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK); | 265 | page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK); |
266 | } | 266 | } |
267 | 267 | ||
268 | struct page_cgroup *page_get_page_cgroup(struct page *page) | 268 | struct page_cgroup *page_get_page_cgroup(struct page *page) |
269 | { | 269 | { |
270 | return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK); | 270 | return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK); |
271 | } | 271 | } |
272 | 272 | ||
273 | static void lock_page_cgroup(struct page *page) | 273 | static void lock_page_cgroup(struct page *page) |
274 | { | 274 | { |
275 | bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | 275 | bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); |
276 | } | 276 | } |
277 | 277 | ||
278 | static int try_lock_page_cgroup(struct page *page) | 278 | static int try_lock_page_cgroup(struct page *page) |
279 | { | 279 | { |
280 | return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | 280 | return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); |
281 | } | 281 | } |
282 | 282 | ||
283 | static void unlock_page_cgroup(struct page *page) | 283 | static void unlock_page_cgroup(struct page *page) |
284 | { | 284 | { |
285 | bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | 285 | bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); |
286 | } | 286 | } |
287 | 287 | ||
288 | static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, | 288 | static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, |
289 | struct page_cgroup *pc) | 289 | struct page_cgroup *pc) |
290 | { | 290 | { |
291 | int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; | 291 | int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; |
292 | 292 | ||
293 | if (from) | 293 | if (from) |
294 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; | 294 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; |
295 | else | 295 | else |
296 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; | 296 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; |
297 | 297 | ||
298 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); | 298 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); |
299 | list_del(&pc->lru); | 299 | list_del(&pc->lru); |
300 | } | 300 | } |
301 | 301 | ||
302 | static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, | 302 | static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, |
303 | struct page_cgroup *pc) | 303 | struct page_cgroup *pc) |
304 | { | 304 | { |
305 | int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; | 305 | int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; |
306 | 306 | ||
307 | if (!to) { | 307 | if (!to) { |
308 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; | 308 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; |
309 | list_add(&pc->lru, &mz->inactive_list); | 309 | list_add(&pc->lru, &mz->inactive_list); |
310 | } else { | 310 | } else { |
311 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; | 311 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; |
312 | list_add(&pc->lru, &mz->active_list); | 312 | list_add(&pc->lru, &mz->active_list); |
313 | } | 313 | } |
314 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true); | 314 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true); |
315 | } | 315 | } |
316 | 316 | ||
317 | static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) | 317 | static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) |
318 | { | 318 | { |
319 | int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; | 319 | int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; |
320 | struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); | 320 | struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); |
321 | 321 | ||
322 | if (from) | 322 | if (from) |
323 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; | 323 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; |
324 | else | 324 | else |
325 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; | 325 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; |
326 | 326 | ||
327 | if (active) { | 327 | if (active) { |
328 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; | 328 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; |
329 | pc->flags |= PAGE_CGROUP_FLAG_ACTIVE; | 329 | pc->flags |= PAGE_CGROUP_FLAG_ACTIVE; |
330 | list_move(&pc->lru, &mz->active_list); | 330 | list_move(&pc->lru, &mz->active_list); |
331 | } else { | 331 | } else { |
332 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; | 332 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; |
333 | pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE; | 333 | pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE; |
334 | list_move(&pc->lru, &mz->inactive_list); | 334 | list_move(&pc->lru, &mz->inactive_list); |
335 | } | 335 | } |
336 | } | 336 | } |
337 | 337 | ||
338 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | 338 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) |
339 | { | 339 | { |
340 | int ret; | 340 | int ret; |
341 | 341 | ||
342 | task_lock(task); | 342 | task_lock(task); |
343 | ret = task->mm && mm_match_cgroup(task->mm, mem); | 343 | ret = task->mm && mm_match_cgroup(task->mm, mem); |
344 | task_unlock(task); | 344 | task_unlock(task); |
345 | return ret; | 345 | return ret; |
346 | } | 346 | } |
347 | 347 | ||
348 | /* | 348 | /* |
349 | * This routine assumes that the appropriate zone's lru lock is already held | 349 | * This routine assumes that the appropriate zone's lru lock is already held |
350 | */ | 350 | */ |
351 | void mem_cgroup_move_lists(struct page *page, bool active) | 351 | void mem_cgroup_move_lists(struct page *page, bool active) |
352 | { | 352 | { |
353 | struct page_cgroup *pc; | 353 | struct page_cgroup *pc; |
354 | struct mem_cgroup_per_zone *mz; | 354 | struct mem_cgroup_per_zone *mz; |
355 | unsigned long flags; | 355 | unsigned long flags; |
356 | 356 | ||
357 | if (mem_cgroup_subsys.disabled) | 357 | if (mem_cgroup_subsys.disabled) |
358 | return; | 358 | return; |
359 | 359 | ||
360 | /* | 360 | /* |
361 | * We cannot lock_page_cgroup while holding zone's lru_lock, | 361 | * We cannot lock_page_cgroup while holding zone's lru_lock, |
362 | * because other holders of lock_page_cgroup can be interrupted | 362 | * because other holders of lock_page_cgroup can be interrupted |
363 | * with an attempt to rotate_reclaimable_page. But we cannot | 363 | * with an attempt to rotate_reclaimable_page. But we cannot |
364 | * safely get to page_cgroup without it, so just try_lock it: | 364 | * safely get to page_cgroup without it, so just try_lock it: |
365 | * mem_cgroup_isolate_pages allows for page left on wrong list. | 365 | * mem_cgroup_isolate_pages allows for page left on wrong list. |
366 | */ | 366 | */ |
367 | if (!try_lock_page_cgroup(page)) | 367 | if (!try_lock_page_cgroup(page)) |
368 | return; | 368 | return; |
369 | 369 | ||
370 | pc = page_get_page_cgroup(page); | 370 | pc = page_get_page_cgroup(page); |
371 | if (pc) { | 371 | if (pc) { |
372 | mz = page_cgroup_zoneinfo(pc); | 372 | mz = page_cgroup_zoneinfo(pc); |
373 | spin_lock_irqsave(&mz->lru_lock, flags); | 373 | spin_lock_irqsave(&mz->lru_lock, flags); |
374 | __mem_cgroup_move_lists(pc, active); | 374 | __mem_cgroup_move_lists(pc, active); |
375 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 375 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
376 | } | 376 | } |
377 | unlock_page_cgroup(page); | 377 | unlock_page_cgroup(page); |
378 | } | 378 | } |
379 | 379 | ||
380 | /* | 380 | /* |
381 | * Calculate mapped_ratio under memory controller. This will be used in | 381 | * Calculate mapped_ratio under memory controller. This will be used in |
382 | * vmscan.c for deteremining we have to reclaim mapped pages. | 382 | * vmscan.c for deteremining we have to reclaim mapped pages. |
383 | */ | 383 | */ |
384 | int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) | 384 | int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) |
385 | { | 385 | { |
386 | long total, rss; | 386 | long total, rss; |
387 | 387 | ||
388 | /* | 388 | /* |
389 | * usage is recorded in bytes. But, here, we assume the number of | 389 | * usage is recorded in bytes. But, here, we assume the number of |
390 | * physical pages can be represented by "long" on any arch. | 390 | * physical pages can be represented by "long" on any arch. |
391 | */ | 391 | */ |
392 | total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L; | 392 | total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L; |
393 | rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); | 393 | rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); |
394 | return (int)((rss * 100L) / total); | 394 | return (int)((rss * 100L) / total); |
395 | } | 395 | } |
396 | 396 | ||
397 | /* | 397 | /* |
398 | * This function is called from vmscan.c. In page reclaiming loop. balance | 398 | * This function is called from vmscan.c. In page reclaiming loop. balance |
399 | * between active and inactive list is calculated. For memory controller | 399 | * between active and inactive list is calculated. For memory controller |
400 | * page reclaiming, we should use using mem_cgroup's imbalance rather than | 400 | * page reclaiming, we should use using mem_cgroup's imbalance rather than |
401 | * zone's global lru imbalance. | 401 | * zone's global lru imbalance. |
402 | */ | 402 | */ |
403 | long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem) | 403 | long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem) |
404 | { | 404 | { |
405 | unsigned long active, inactive; | 405 | unsigned long active, inactive; |
406 | /* active and inactive are the number of pages. 'long' is ok.*/ | 406 | /* active and inactive are the number of pages. 'long' is ok.*/ |
407 | active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE); | 407 | active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE); |
408 | inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE); | 408 | inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE); |
409 | return (long) (active / (inactive + 1)); | 409 | return (long) (active / (inactive + 1)); |
410 | } | 410 | } |
411 | 411 | ||
412 | /* | 412 | /* |
413 | * prev_priority control...this will be used in memory reclaim path. | 413 | * prev_priority control...this will be used in memory reclaim path. |
414 | */ | 414 | */ |
415 | int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) | 415 | int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) |
416 | { | 416 | { |
417 | return mem->prev_priority; | 417 | return mem->prev_priority; |
418 | } | 418 | } |
419 | 419 | ||
420 | void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) | 420 | void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) |
421 | { | 421 | { |
422 | if (priority < mem->prev_priority) | 422 | if (priority < mem->prev_priority) |
423 | mem->prev_priority = priority; | 423 | mem->prev_priority = priority; |
424 | } | 424 | } |
425 | 425 | ||
426 | void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) | 426 | void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) |
427 | { | 427 | { |
428 | mem->prev_priority = priority; | 428 | mem->prev_priority = priority; |
429 | } | 429 | } |
430 | 430 | ||
431 | /* | 431 | /* |
432 | * Calculate # of pages to be scanned in this priority/zone. | 432 | * Calculate # of pages to be scanned in this priority/zone. |
433 | * See also vmscan.c | 433 | * See also vmscan.c |
434 | * | 434 | * |
435 | * priority starts from "DEF_PRIORITY" and decremented in each loop. | 435 | * priority starts from "DEF_PRIORITY" and decremented in each loop. |
436 | * (see include/linux/mmzone.h) | 436 | * (see include/linux/mmzone.h) |
437 | */ | 437 | */ |
438 | 438 | ||
439 | long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem, | 439 | long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem, |
440 | struct zone *zone, int priority) | 440 | struct zone *zone, int priority) |
441 | { | 441 | { |
442 | long nr_active; | 442 | long nr_active; |
443 | int nid = zone->zone_pgdat->node_id; | 443 | int nid = zone->zone_pgdat->node_id; |
444 | int zid = zone_idx(zone); | 444 | int zid = zone_idx(zone); |
445 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); | 445 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); |
446 | 446 | ||
447 | nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE); | 447 | nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE); |
448 | return (nr_active >> priority); | 448 | return (nr_active >> priority); |
449 | } | 449 | } |
450 | 450 | ||
451 | long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, | 451 | long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, |
452 | struct zone *zone, int priority) | 452 | struct zone *zone, int priority) |
453 | { | 453 | { |
454 | long nr_inactive; | 454 | long nr_inactive; |
455 | int nid = zone->zone_pgdat->node_id; | 455 | int nid = zone->zone_pgdat->node_id; |
456 | int zid = zone_idx(zone); | 456 | int zid = zone_idx(zone); |
457 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); | 457 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); |
458 | 458 | ||
459 | nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE); | 459 | nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE); |
460 | return (nr_inactive >> priority); | 460 | return (nr_inactive >> priority); |
461 | } | 461 | } |
462 | 462 | ||
463 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | 463 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, |
464 | struct list_head *dst, | 464 | struct list_head *dst, |
465 | unsigned long *scanned, int order, | 465 | unsigned long *scanned, int order, |
466 | int mode, struct zone *z, | 466 | int mode, struct zone *z, |
467 | struct mem_cgroup *mem_cont, | 467 | struct mem_cgroup *mem_cont, |
468 | int active) | 468 | int active) |
469 | { | 469 | { |
470 | unsigned long nr_taken = 0; | 470 | unsigned long nr_taken = 0; |
471 | struct page *page; | 471 | struct page *page; |
472 | unsigned long scan; | 472 | unsigned long scan; |
473 | LIST_HEAD(pc_list); | 473 | LIST_HEAD(pc_list); |
474 | struct list_head *src; | 474 | struct list_head *src; |
475 | struct page_cgroup *pc, *tmp; | 475 | struct page_cgroup *pc, *tmp; |
476 | int nid = z->zone_pgdat->node_id; | 476 | int nid = z->zone_pgdat->node_id; |
477 | int zid = zone_idx(z); | 477 | int zid = zone_idx(z); |
478 | struct mem_cgroup_per_zone *mz; | 478 | struct mem_cgroup_per_zone *mz; |
479 | 479 | ||
480 | BUG_ON(!mem_cont); | 480 | BUG_ON(!mem_cont); |
481 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | 481 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); |
482 | if (active) | 482 | if (active) |
483 | src = &mz->active_list; | 483 | src = &mz->active_list; |
484 | else | 484 | else |
485 | src = &mz->inactive_list; | 485 | src = &mz->inactive_list; |
486 | 486 | ||
487 | 487 | ||
488 | spin_lock(&mz->lru_lock); | 488 | spin_lock(&mz->lru_lock); |
489 | scan = 0; | 489 | scan = 0; |
490 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { | 490 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { |
491 | if (scan >= nr_to_scan) | 491 | if (scan >= nr_to_scan) |
492 | break; | 492 | break; |
493 | page = pc->page; | 493 | page = pc->page; |
494 | 494 | ||
495 | if (unlikely(!PageLRU(page))) | 495 | if (unlikely(!PageLRU(page))) |
496 | continue; | 496 | continue; |
497 | 497 | ||
498 | if (PageActive(page) && !active) { | 498 | if (PageActive(page) && !active) { |
499 | __mem_cgroup_move_lists(pc, true); | 499 | __mem_cgroup_move_lists(pc, true); |
500 | continue; | 500 | continue; |
501 | } | 501 | } |
502 | if (!PageActive(page) && active) { | 502 | if (!PageActive(page) && active) { |
503 | __mem_cgroup_move_lists(pc, false); | 503 | __mem_cgroup_move_lists(pc, false); |
504 | continue; | 504 | continue; |
505 | } | 505 | } |
506 | 506 | ||
507 | scan++; | 507 | scan++; |
508 | list_move(&pc->lru, &pc_list); | 508 | list_move(&pc->lru, &pc_list); |
509 | 509 | ||
510 | if (__isolate_lru_page(page, mode) == 0) { | 510 | if (__isolate_lru_page(page, mode) == 0) { |
511 | list_move(&page->lru, dst); | 511 | list_move(&page->lru, dst); |
512 | nr_taken++; | 512 | nr_taken++; |
513 | } | 513 | } |
514 | } | 514 | } |
515 | 515 | ||
516 | list_splice(&pc_list, src); | 516 | list_splice(&pc_list, src); |
517 | spin_unlock(&mz->lru_lock); | 517 | spin_unlock(&mz->lru_lock); |
518 | 518 | ||
519 | *scanned = scan; | 519 | *scanned = scan; |
520 | return nr_taken; | 520 | return nr_taken; |
521 | } | 521 | } |
522 | 522 | ||
523 | /* | 523 | /* |
524 | * Charge the memory controller for page usage. | 524 | * Charge the memory controller for page usage. |
525 | * Return | 525 | * Return |
526 | * 0 if the charge was successful | 526 | * 0 if the charge was successful |
527 | * < 0 if the cgroup is over its limit | 527 | * < 0 if the cgroup is over its limit |
528 | */ | 528 | */ |
529 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | 529 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, |
530 | gfp_t gfp_mask, enum charge_type ctype, | 530 | gfp_t gfp_mask, enum charge_type ctype, |
531 | struct mem_cgroup *memcg) | 531 | struct mem_cgroup *memcg) |
532 | { | 532 | { |
533 | struct mem_cgroup *mem; | 533 | struct mem_cgroup *mem; |
534 | struct page_cgroup *pc; | 534 | struct page_cgroup *pc; |
535 | unsigned long flags; | 535 | unsigned long flags; |
536 | unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 536 | unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
537 | struct mem_cgroup_per_zone *mz; | 537 | struct mem_cgroup_per_zone *mz; |
538 | 538 | ||
539 | pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask); | 539 | pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask); |
540 | if (unlikely(pc == NULL)) | 540 | if (unlikely(pc == NULL)) |
541 | goto err; | 541 | goto err; |
542 | 542 | ||
543 | /* | 543 | /* |
544 | * We always charge the cgroup the mm_struct belongs to. | 544 | * We always charge the cgroup the mm_struct belongs to. |
545 | * The mm_struct's mem_cgroup changes on task migration if the | 545 | * The mm_struct's mem_cgroup changes on task migration if the |
546 | * thread group leader migrates. It's possible that mm is not | 546 | * thread group leader migrates. It's possible that mm is not |
547 | * set, if so charge the init_mm (happens for pagecache usage). | 547 | * set, if so charge the init_mm (happens for pagecache usage). |
548 | */ | 548 | */ |
549 | if (likely(!memcg)) { | 549 | if (likely(!memcg)) { |
550 | rcu_read_lock(); | 550 | rcu_read_lock(); |
551 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 551 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
552 | /* | 552 | /* |
553 | * For every charge from the cgroup, increment reference count | 553 | * For every charge from the cgroup, increment reference count |
554 | */ | 554 | */ |
555 | css_get(&mem->css); | 555 | css_get(&mem->css); |
556 | rcu_read_unlock(); | 556 | rcu_read_unlock(); |
557 | } else { | 557 | } else { |
558 | mem = memcg; | 558 | mem = memcg; |
559 | css_get(&memcg->css); | 559 | css_get(&memcg->css); |
560 | } | 560 | } |
561 | 561 | ||
562 | while (res_counter_charge(&mem->res, PAGE_SIZE)) { | 562 | while (res_counter_charge(&mem->res, PAGE_SIZE)) { |
563 | if (!(gfp_mask & __GFP_WAIT)) | 563 | if (!(gfp_mask & __GFP_WAIT)) |
564 | goto out; | 564 | goto out; |
565 | 565 | ||
566 | if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) | 566 | if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) |
567 | continue; | 567 | continue; |
568 | 568 | ||
569 | /* | 569 | /* |
570 | * try_to_free_mem_cgroup_pages() might not give us a full | 570 | * try_to_free_mem_cgroup_pages() might not give us a full |
571 | * picture of reclaim. Some pages are reclaimed and might be | 571 | * picture of reclaim. Some pages are reclaimed and might be |
572 | * moved to swap cache or just unmapped from the cgroup. | 572 | * moved to swap cache or just unmapped from the cgroup. |
573 | * Check the limit again to see if the reclaim reduced the | 573 | * Check the limit again to see if the reclaim reduced the |
574 | * current usage of the cgroup before giving up | 574 | * current usage of the cgroup before giving up |
575 | */ | 575 | */ |
576 | if (res_counter_check_under_limit(&mem->res)) | 576 | if (res_counter_check_under_limit(&mem->res)) |
577 | continue; | 577 | continue; |
578 | 578 | ||
579 | if (!nr_retries--) { | 579 | if (!nr_retries--) { |
580 | mem_cgroup_out_of_memory(mem, gfp_mask); | 580 | mem_cgroup_out_of_memory(mem, gfp_mask); |
581 | goto out; | 581 | goto out; |
582 | } | 582 | } |
583 | } | 583 | } |
584 | 584 | ||
585 | pc->mem_cgroup = mem; | 585 | pc->mem_cgroup = mem; |
586 | pc->page = page; | 586 | pc->page = page; |
587 | /* | 587 | /* |
588 | * If a page is accounted as a page cache, insert to inactive list. | 588 | * If a page is accounted as a page cache, insert to inactive list. |
589 | * If anon, insert to active list. | 589 | * If anon, insert to active list. |
590 | */ | 590 | */ |
591 | if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) | 591 | if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) |
592 | pc->flags = PAGE_CGROUP_FLAG_CACHE; | 592 | pc->flags = PAGE_CGROUP_FLAG_CACHE; |
593 | else | 593 | else |
594 | pc->flags = PAGE_CGROUP_FLAG_ACTIVE; | 594 | pc->flags = PAGE_CGROUP_FLAG_ACTIVE; |
595 | 595 | ||
596 | lock_page_cgroup(page); | 596 | lock_page_cgroup(page); |
597 | if (unlikely(page_get_page_cgroup(page))) { | 597 | if (unlikely(page_get_page_cgroup(page))) { |
598 | unlock_page_cgroup(page); | 598 | unlock_page_cgroup(page); |
599 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 599 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
600 | css_put(&mem->css); | 600 | css_put(&mem->css); |
601 | kmem_cache_free(page_cgroup_cache, pc); | 601 | kmem_cache_free(page_cgroup_cache, pc); |
602 | goto done; | 602 | goto done; |
603 | } | 603 | } |
604 | page_assign_page_cgroup(page, pc); | 604 | page_assign_page_cgroup(page, pc); |
605 | 605 | ||
606 | mz = page_cgroup_zoneinfo(pc); | 606 | mz = page_cgroup_zoneinfo(pc); |
607 | spin_lock_irqsave(&mz->lru_lock, flags); | 607 | spin_lock_irqsave(&mz->lru_lock, flags); |
608 | __mem_cgroup_add_list(mz, pc); | 608 | __mem_cgroup_add_list(mz, pc); |
609 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 609 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
610 | 610 | ||
611 | unlock_page_cgroup(page); | 611 | unlock_page_cgroup(page); |
612 | done: | 612 | done: |
613 | return 0; | 613 | return 0; |
614 | out: | 614 | out: |
615 | css_put(&mem->css); | 615 | css_put(&mem->css); |
616 | kmem_cache_free(page_cgroup_cache, pc); | 616 | kmem_cache_free(page_cgroup_cache, pc); |
617 | err: | 617 | err: |
618 | return -ENOMEM; | 618 | return -ENOMEM; |
619 | } | 619 | } |
620 | 620 | ||
621 | int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) | 621 | int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) |
622 | { | 622 | { |
623 | if (mem_cgroup_subsys.disabled) | 623 | if (mem_cgroup_subsys.disabled) |
624 | return 0; | 624 | return 0; |
625 | 625 | ||
626 | /* | 626 | /* |
627 | * If already mapped, we don't have to account. | 627 | * If already mapped, we don't have to account. |
628 | * If page cache, page->mapping has address_space. | 628 | * If page cache, page->mapping has address_space. |
629 | * But page->mapping may have out-of-use anon_vma pointer, | 629 | * But page->mapping may have out-of-use anon_vma pointer, |
630 | * detecit it by PageAnon() check. newly-mapped-anon's page->mapping | 630 | * detecit it by PageAnon() check. newly-mapped-anon's page->mapping |
631 | * is NULL. | 631 | * is NULL. |
632 | */ | 632 | */ |
633 | if (page_mapped(page) || (page->mapping && !PageAnon(page))) | 633 | if (page_mapped(page) || (page->mapping && !PageAnon(page))) |
634 | return 0; | 634 | return 0; |
635 | if (unlikely(!mm)) | 635 | if (unlikely(!mm)) |
636 | mm = &init_mm; | 636 | mm = &init_mm; |
637 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 637 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
638 | MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); | 638 | MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); |
639 | } | 639 | } |
640 | 640 | ||
641 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 641 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
642 | gfp_t gfp_mask) | 642 | gfp_t gfp_mask) |
643 | { | 643 | { |
644 | if (mem_cgroup_subsys.disabled) | 644 | if (mem_cgroup_subsys.disabled) |
645 | return 0; | 645 | return 0; |
646 | 646 | ||
647 | /* | 647 | /* |
648 | * Corner case handling. This is called from add_to_page_cache() | 648 | * Corner case handling. This is called from add_to_page_cache() |
649 | * in usual. But some FS (shmem) precharges this page before calling it | 649 | * in usual. But some FS (shmem) precharges this page before calling it |
650 | * and call add_to_page_cache() with GFP_NOWAIT. | 650 | * and call add_to_page_cache() with GFP_NOWAIT. |
651 | * | 651 | * |
652 | * For GFP_NOWAIT case, the page may be pre-charged before calling | 652 | * For GFP_NOWAIT case, the page may be pre-charged before calling |
653 | * add_to_page_cache(). (See shmem.c) check it here and avoid to call | 653 | * add_to_page_cache(). (See shmem.c) check it here and avoid to call |
654 | * charge twice. (It works but has to pay a bit larger cost.) | 654 | * charge twice. (It works but has to pay a bit larger cost.) |
655 | */ | 655 | */ |
656 | if (!(gfp_mask & __GFP_WAIT)) { | 656 | if (!(gfp_mask & __GFP_WAIT)) { |
657 | struct page_cgroup *pc; | 657 | struct page_cgroup *pc; |
658 | 658 | ||
659 | lock_page_cgroup(page); | 659 | lock_page_cgroup(page); |
660 | pc = page_get_page_cgroup(page); | 660 | pc = page_get_page_cgroup(page); |
661 | if (pc) { | 661 | if (pc) { |
662 | VM_BUG_ON(pc->page != page); | 662 | VM_BUG_ON(pc->page != page); |
663 | VM_BUG_ON(!pc->mem_cgroup); | 663 | VM_BUG_ON(!pc->mem_cgroup); |
664 | unlock_page_cgroup(page); | 664 | unlock_page_cgroup(page); |
665 | return 0; | 665 | return 0; |
666 | } | 666 | } |
667 | unlock_page_cgroup(page); | 667 | unlock_page_cgroup(page); |
668 | } | 668 | } |
669 | 669 | ||
670 | if (unlikely(!mm)) | 670 | if (unlikely(!mm)) |
671 | mm = &init_mm; | 671 | mm = &init_mm; |
672 | 672 | ||
673 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 673 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
674 | MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); | 674 | MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); |
675 | } | 675 | } |
676 | 676 | ||
677 | /* | 677 | /* |
678 | * uncharge if !page_mapped(page) | 678 | * uncharge if !page_mapped(page) |
679 | */ | 679 | */ |
680 | static void | 680 | static void |
681 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | 681 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) |
682 | { | 682 | { |
683 | struct page_cgroup *pc; | 683 | struct page_cgroup *pc; |
684 | struct mem_cgroup *mem; | 684 | struct mem_cgroup *mem; |
685 | struct mem_cgroup_per_zone *mz; | 685 | struct mem_cgroup_per_zone *mz; |
686 | unsigned long flags; | 686 | unsigned long flags; |
687 | 687 | ||
688 | if (mem_cgroup_subsys.disabled) | 688 | if (mem_cgroup_subsys.disabled) |
689 | return; | 689 | return; |
690 | 690 | ||
691 | /* | 691 | /* |
692 | * Check if our page_cgroup is valid | 692 | * Check if our page_cgroup is valid |
693 | */ | 693 | */ |
694 | lock_page_cgroup(page); | 694 | lock_page_cgroup(page); |
695 | pc = page_get_page_cgroup(page); | 695 | pc = page_get_page_cgroup(page); |
696 | if (unlikely(!pc)) | 696 | if (unlikely(!pc)) |
697 | goto unlock; | 697 | goto unlock; |
698 | 698 | ||
699 | VM_BUG_ON(pc->page != page); | 699 | VM_BUG_ON(pc->page != page); |
700 | 700 | ||
701 | if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) | 701 | if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) |
702 | && ((pc->flags & PAGE_CGROUP_FLAG_CACHE) | 702 | && ((pc->flags & PAGE_CGROUP_FLAG_CACHE) |
703 | || page_mapped(page))) | 703 | || page_mapped(page))) |
704 | goto unlock; | 704 | goto unlock; |
705 | 705 | ||
706 | mz = page_cgroup_zoneinfo(pc); | 706 | mz = page_cgroup_zoneinfo(pc); |
707 | spin_lock_irqsave(&mz->lru_lock, flags); | 707 | spin_lock_irqsave(&mz->lru_lock, flags); |
708 | __mem_cgroup_remove_list(mz, pc); | 708 | __mem_cgroup_remove_list(mz, pc); |
709 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 709 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
710 | 710 | ||
711 | page_assign_page_cgroup(page, NULL); | 711 | page_assign_page_cgroup(page, NULL); |
712 | unlock_page_cgroup(page); | 712 | unlock_page_cgroup(page); |
713 | 713 | ||
714 | mem = pc->mem_cgroup; | 714 | mem = pc->mem_cgroup; |
715 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 715 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
716 | css_put(&mem->css); | 716 | css_put(&mem->css); |
717 | 717 | ||
718 | kmem_cache_free(page_cgroup_cache, pc); | 718 | kmem_cache_free(page_cgroup_cache, pc); |
719 | return; | 719 | return; |
720 | unlock: | 720 | unlock: |
721 | unlock_page_cgroup(page); | 721 | unlock_page_cgroup(page); |
722 | } | 722 | } |
723 | 723 | ||
724 | void mem_cgroup_uncharge_page(struct page *page) | 724 | void mem_cgroup_uncharge_page(struct page *page) |
725 | { | 725 | { |
726 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); | 726 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); |
727 | } | 727 | } |
728 | 728 | ||
729 | void mem_cgroup_uncharge_cache_page(struct page *page) | 729 | void mem_cgroup_uncharge_cache_page(struct page *page) |
730 | { | 730 | { |
731 | VM_BUG_ON(page_mapped(page)); | 731 | VM_BUG_ON(page_mapped(page)); |
732 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | 732 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); |
733 | } | 733 | } |
734 | 734 | ||
735 | /* | 735 | /* |
736 | * Before starting migration, account against new page. | 736 | * Before starting migration, account against new page. |
737 | */ | 737 | */ |
738 | int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) | 738 | int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) |
739 | { | 739 | { |
740 | struct page_cgroup *pc; | 740 | struct page_cgroup *pc; |
741 | struct mem_cgroup *mem = NULL; | 741 | struct mem_cgroup *mem = NULL; |
742 | enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | 742 | enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; |
743 | int ret = 0; | 743 | int ret = 0; |
744 | 744 | ||
745 | if (mem_cgroup_subsys.disabled) | 745 | if (mem_cgroup_subsys.disabled) |
746 | return 0; | 746 | return 0; |
747 | 747 | ||
748 | lock_page_cgroup(page); | 748 | lock_page_cgroup(page); |
749 | pc = page_get_page_cgroup(page); | 749 | pc = page_get_page_cgroup(page); |
750 | if (pc) { | 750 | if (pc) { |
751 | mem = pc->mem_cgroup; | 751 | mem = pc->mem_cgroup; |
752 | css_get(&mem->css); | 752 | css_get(&mem->css); |
753 | if (pc->flags & PAGE_CGROUP_FLAG_CACHE) | 753 | if (pc->flags & PAGE_CGROUP_FLAG_CACHE) |
754 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | 754 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; |
755 | } | 755 | } |
756 | unlock_page_cgroup(page); | 756 | unlock_page_cgroup(page); |
757 | if (mem) { | 757 | if (mem) { |
758 | ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, | 758 | ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, |
759 | ctype, mem); | 759 | ctype, mem); |
760 | css_put(&mem->css); | 760 | css_put(&mem->css); |
761 | } | 761 | } |
762 | return ret; | 762 | return ret; |
763 | } | 763 | } |
764 | 764 | ||
765 | /* remove redundant charge if migration failed*/ | 765 | /* remove redundant charge if migration failed*/ |
766 | void mem_cgroup_end_migration(struct page *newpage) | 766 | void mem_cgroup_end_migration(struct page *newpage) |
767 | { | 767 | { |
768 | /* | 768 | /* |
769 | * At success, page->mapping is not NULL. | 769 | * At success, page->mapping is not NULL. |
770 | * special rollback care is necessary when | 770 | * special rollback care is necessary when |
771 | * 1. at migration failure. (newpage->mapping is cleared in this case) | 771 | * 1. at migration failure. (newpage->mapping is cleared in this case) |
772 | * 2. the newpage was moved but not remapped again because the task | 772 | * 2. the newpage was moved but not remapped again because the task |
773 | * exits and the newpage is obsolete. In this case, the new page | 773 | * exits and the newpage is obsolete. In this case, the new page |
774 | * may be a swapcache. So, we just call mem_cgroup_uncharge_page() | 774 | * may be a swapcache. So, we just call mem_cgroup_uncharge_page() |
775 | * always for avoiding mess. The page_cgroup will be removed if | 775 | * always for avoiding mess. The page_cgroup will be removed if |
776 | * unnecessary. File cache pages is still on radix-tree. Don't | 776 | * unnecessary. File cache pages is still on radix-tree. Don't |
777 | * care it. | 777 | * care it. |
778 | */ | 778 | */ |
779 | if (!newpage->mapping) | 779 | if (!newpage->mapping) |
780 | __mem_cgroup_uncharge_common(newpage, | 780 | __mem_cgroup_uncharge_common(newpage, |
781 | MEM_CGROUP_CHARGE_TYPE_FORCE); | 781 | MEM_CGROUP_CHARGE_TYPE_FORCE); |
782 | else if (PageAnon(newpage)) | 782 | else if (PageAnon(newpage)) |
783 | mem_cgroup_uncharge_page(newpage); | 783 | mem_cgroup_uncharge_page(newpage); |
784 | } | 784 | } |
785 | 785 | ||
786 | /* | 786 | /* |
787 | * A call to try to shrink memory usage under specified resource controller. | 787 | * A call to try to shrink memory usage under specified resource controller. |
788 | * This is typically used for page reclaiming for shmem for reducing side | 788 | * This is typically used for page reclaiming for shmem for reducing side |
789 | * effect of page allocation from shmem, which is used by some mem_cgroup. | 789 | * effect of page allocation from shmem, which is used by some mem_cgroup. |
790 | */ | 790 | */ |
791 | int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) | 791 | int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) |
792 | { | 792 | { |
793 | struct mem_cgroup *mem; | 793 | struct mem_cgroup *mem; |
794 | int progress = 0; | 794 | int progress = 0; |
795 | int retry = MEM_CGROUP_RECLAIM_RETRIES; | 795 | int retry = MEM_CGROUP_RECLAIM_RETRIES; |
796 | 796 | ||
797 | if (mem_cgroup_subsys.disabled) | 797 | if (mem_cgroup_subsys.disabled) |
798 | return 0; | 798 | return 0; |
| | 799 | if (!mm) |
| | 800 | return 0; |
799 | 801 | ||
800 | rcu_read_lock(); | 802 | rcu_read_lock(); |
801 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 803 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
802 | css_get(&mem->css); | 804 | css_get(&mem->css); |
803 | rcu_read_unlock(); | 805 | rcu_read_unlock(); |
804 | 806 | ||
805 | do { | 807 | do { |
806 | progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); | 808 | progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); |
807 | } while (!progress && --retry); | 809 | } while (!progress && --retry); |
808 | 810 | ||
809 | css_put(&mem->css); | 811 | css_put(&mem->css); |
810 | if (!retry) | 812 | if (!retry) |
811 | return -ENOMEM; | 813 | return -ENOMEM; |
812 | return 0; | 814 | return 0; |
813 | } | 815 | } |
814 | 816 | ||
815 | int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) | 817 | int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) |
816 | { | 818 | { |
817 | 819 | ||
818 | int retry_count = MEM_CGROUP_RECLAIM_RETRIES; | 820 | int retry_count = MEM_CGROUP_RECLAIM_RETRIES; |
819 | int progress; | 821 | int progress; |
820 | int ret = 0; | 822 | int ret = 0; |
821 | 823 | ||
822 | while (res_counter_set_limit(&memcg->res, val)) { | 824 | while (res_counter_set_limit(&memcg->res, val)) { |
823 | if (signal_pending(current)) { | 825 | if (signal_pending(current)) { |
824 | ret = -EINTR; | 826 | ret = -EINTR; |
825 | break; | 827 | break; |
826 | } | 828 | } |
827 | if (!retry_count) { | 829 | if (!retry_count) { |
828 | ret = -EBUSY; | 830 | ret = -EBUSY; |
829 | break; | 831 | break; |
830 | } | 832 | } |
831 | progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL); | 833 | progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL); |
832 | if (!progress) | 834 | if (!progress) |
833 | retry_count--; | 835 | retry_count--; |
834 | } | 836 | } |
835 | return ret; | 837 | return ret; |
836 | } | 838 | } |
837 | 839 | ||
838 | 840 | ||
839 | /* | 841 | /* |
840 | * This routine traverse page_cgroup in given list and drop them all. | 842 | * This routine traverse page_cgroup in given list and drop them all. |
841 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 843 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. |
842 | */ | 844 | */ |
843 | #define FORCE_UNCHARGE_BATCH (128) | 845 | #define FORCE_UNCHARGE_BATCH (128) |
844 | static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, | 846 | static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, |
845 | struct mem_cgroup_per_zone *mz, | 847 | struct mem_cgroup_per_zone *mz, |
846 | int active) | 848 | int active) |
847 | { | 849 | { |
848 | struct page_cgroup *pc; | 850 | struct page_cgroup *pc; |
849 | struct page *page; | 851 | struct page *page; |
850 | int count = FORCE_UNCHARGE_BATCH; | 852 | int count = FORCE_UNCHARGE_BATCH; |
851 | unsigned long flags; | 853 | unsigned long flags; |
852 | struct list_head *list; | 854 | struct list_head *list; |
853 | 855 | ||
854 | if (active) | 856 | if (active) |
855 | list = &mz->active_list; | 857 | list = &mz->active_list; |
856 | else | 858 | else |
857 | list = &mz->inactive_list; | 859 | list = &mz->inactive_list; |
858 | 860 | ||
859 | spin_lock_irqsave(&mz->lru_lock, flags); | 861 | spin_lock_irqsave(&mz->lru_lock, flags); |
860 | while (!list_empty(list)) { | 862 | while (!list_empty(list)) { |
861 | pc = list_entry(list->prev, struct page_cgroup, lru); | 863 | pc = list_entry(list->prev, struct page_cgroup, lru); |
862 | page = pc->page; | 864 | page = pc->page; |
863 | get_page(page); | 865 | get_page(page); |
864 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 866 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
865 | /* | 867 | /* |
866 | * Check if this page is on the LRU. A !LRU page can be | 868 | * Check if this page is on the LRU. A !LRU page can be |
867 | * found if it's under page migration. | 869 | * found if it's under page migration. |
868 | */ | 870 | */ |
869 | if (PageLRU(page)) { | 871 | if (PageLRU(page)) { |
870 | __mem_cgroup_uncharge_common(page, | 872 | __mem_cgroup_uncharge_common(page, |
871 | MEM_CGROUP_CHARGE_TYPE_FORCE); | 873 | MEM_CGROUP_CHARGE_TYPE_FORCE); |
872 | put_page(page); | 874 | put_page(page); |
873 | if (--count <= 0) { | 875 | if (--count <= 0) { |
874 | count = FORCE_UNCHARGE_BATCH; | 876 | count = FORCE_UNCHARGE_BATCH; |
875 | cond_resched(); | 877 | cond_resched(); |
876 | } | 878 | } |
877 | } else | 879 | } else |
878 | cond_resched(); | 880 | cond_resched(); |
879 | spin_lock_irqsave(&mz->lru_lock, flags); | 881 | spin_lock_irqsave(&mz->lru_lock, flags); |
880 | } | 882 | } |
881 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 883 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
882 | } | 884 | } |
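Note how the loop above drops mz->lru_lock for every page and calls cond_resched() after each FORCE_UNCHARGE_BATCH (128) uncharges, so emptying a long list never monopolizes the CPU with the spinlock held. The fragment below is a generic userspace illustration of the same batching idea; the array, the "drain" operation and sched_yield() are stand-ins, not kernel interfaces.

    #include <sched.h>
    #include <stdio.h>

    #define BATCH 128

    /* Drain a large array in batches, yielding the CPU between batches,
     * analogous to calling cond_resched() every FORCE_UNCHARGE_BATCH items. */
    static void drain(int *items, int n)
    {
        int count = BATCH;
        int i;

        for (i = 0; i < n; i++) {
            items[i] = 0;               /* "uncharge" one item */
            if (--count <= 0) {
                count = BATCH;
                sched_yield();          /* let other tasks run */
            }
        }
    }

    int main(void)
    {
        static int items[100000];

        drain(items, 100000);
        printf("drained\n");
        return 0;
    }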
883 | 885 | ||
884 | /* | 886 | /* |
885 | * Make the mem_cgroup's charge 0 if there is no task attached. | 887 | * Make the mem_cgroup's charge 0 if there is no task attached. |
886 | * This enables deleting this mem_cgroup. | 888 | * This enables deleting this mem_cgroup. |
887 | */ | 889 | */ |
888 | static int mem_cgroup_force_empty(struct mem_cgroup *mem) | 890 | static int mem_cgroup_force_empty(struct mem_cgroup *mem) |
889 | { | 891 | { |
890 | int ret = -EBUSY; | 892 | int ret = -EBUSY; |
891 | int node, zid; | 893 | int node, zid; |
892 | 894 | ||
893 | css_get(&mem->css); | 895 | css_get(&mem->css); |
894 | /* | 896 | /* |
895 | * page reclaim code (kswapd etc.) will move pages between the | 897 | * page reclaim code (kswapd etc.) will move pages between the |
896 | * active_list <-> inactive_list while we don't hold a lock. | 898 | * active_list <-> inactive_list while we don't hold a lock. |
897 | * So, we have to loop here until all lists are empty. | 899 | * So, we have to loop here until all lists are empty. |
898 | */ | 900 | */ |
899 | while (mem->res.usage > 0) { | 901 | while (mem->res.usage > 0) { |
900 | if (atomic_read(&mem->css.cgroup->count) > 0) | 902 | if (atomic_read(&mem->css.cgroup->count) > 0) |
901 | goto out; | 903 | goto out; |
902 | for_each_node_state(node, N_POSSIBLE) | 904 | for_each_node_state(node, N_POSSIBLE) |
903 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 905 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
904 | struct mem_cgroup_per_zone *mz; | 906 | struct mem_cgroup_per_zone *mz; |
905 | mz = mem_cgroup_zoneinfo(mem, node, zid); | 907 | mz = mem_cgroup_zoneinfo(mem, node, zid); |
906 | /* drop all page_cgroup in active_list */ | 908 | /* drop all page_cgroup in active_list */ |
907 | mem_cgroup_force_empty_list(mem, mz, 1); | 909 | mem_cgroup_force_empty_list(mem, mz, 1); |
908 | /* drop all page_cgroup in inactive_list */ | 910 | /* drop all page_cgroup in inactive_list */ |
909 | mem_cgroup_force_empty_list(mem, mz, 0); | 911 | mem_cgroup_force_empty_list(mem, mz, 0); |
910 | } | 912 | } |
911 | } | 913 | } |
912 | ret = 0; | 914 | ret = 0; |
913 | out: | 915 | out: |
914 | css_put(&mem->css); | 916 | css_put(&mem->css); |
915 | return ret; | 917 | return ret; |
916 | } | 918 | } |
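mem_cgroup_force_empty() is reachable from userspace through the memory.force_empty file wired up further down in this file: writing to it asks the kernel to drop every remaining page_cgroup so that the (task-less) group can be removed, and it returns -EBUSY while the group is still in use. A hedged usage sketch, again with a placeholder cgroup path:

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>

    /* Ask the kernel to drop all remaining charges of a memory cgroup
     * before removing it; expected to fail with EBUSY while the group
     * is still in use (e.g. tasks attached). */
    int main(void)
    {
        /* Placeholder path; adjust to the actual cgroup mount. */
        int fd = open("/cgroup/demo/memory.force_empty", O_WRONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        if (write(fd, "1", 1) < 0)
            perror("write");
        close(fd);
        return 0;
    }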
917 | 919 | ||
918 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 920 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
919 | { | 921 | { |
920 | return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, | 922 | return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, |
921 | cft->private); | 923 | cft->private); |
922 | } | 924 | } |
923 | /* | 925 | /* |
924 | * The user of this function is... | 926 | * The user of this function is... |
925 | * RES_LIMIT. | 927 | * RES_LIMIT. |
926 | */ | 928 | */ |
927 | static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | 929 | static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, |
928 | const char *buffer) | 930 | const char *buffer) |
929 | { | 931 | { |
930 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 932 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
931 | unsigned long long val; | 933 | unsigned long long val; |
932 | int ret; | 934 | int ret; |
933 | 935 | ||
934 | switch (cft->private) { | 936 | switch (cft->private) { |
935 | case RES_LIMIT: | 937 | case RES_LIMIT: |
936 | /* This function does all the necessary parsing; reuse it */ | 938 | /* This function does all the necessary parsing; reuse it */ |
937 | ret = res_counter_memparse_write_strategy(buffer, &val); | 939 | ret = res_counter_memparse_write_strategy(buffer, &val); |
938 | if (!ret) | 940 | if (!ret) |
939 | ret = mem_cgroup_resize_limit(memcg, val); | 941 | ret = mem_cgroup_resize_limit(memcg, val); |
940 | break; | 942 | break; |
941 | default: | 943 | default: |
942 | ret = -EINVAL; /* should be BUG() ? */ | 944 | ret = -EINVAL; /* should be BUG() ? */ |
943 | break; | 945 | break; |
944 | } | 946 | } |
945 | return ret; | 947 | return ret; |
946 | } | 948 | } |
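The RES_LIMIT branch leans on res_counter_memparse_write_strategy(), which parses the written string memparse()-style, so limits can be given with K/M/G suffixes. The helper below is only a rough userspace approximation of that suffix convention, written for illustration; it is not the kernel parser and ignores the larger suffixes and edge cases memparse() handles.

    #include <stdio.h>
    #include <stdlib.h>
    #include <ctype.h>

    /* Rough userspace approximation of memparse(): parse a number with an
     * optional K/M/G suffix into bytes.  Not the kernel code. */
    static unsigned long long parse_bytes(const char *s)
    {
        char *end;
        unsigned long long val = strtoull(s, &end, 0);

        switch (tolower((unsigned char)*end)) {
        case 'g': val <<= 10;   /* fall through */
        case 'm': val <<= 10;   /* fall through */
        case 'k': val <<= 10;
        default:  break;
        }
        return val;
    }

    int main(void)
    {
        printf("%llu\n", parse_bytes("64M"));   /* 67108864 */
        printf("%llu\n", parse_bytes("1G"));    /* 1073741824 */
        return 0;
    }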
947 | 949 | ||
948 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | 950 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) |
949 | { | 951 | { |
950 | struct mem_cgroup *mem; | 952 | struct mem_cgroup *mem; |
951 | 953 | ||
952 | mem = mem_cgroup_from_cont(cont); | 954 | mem = mem_cgroup_from_cont(cont); |
953 | switch (event) { | 955 | switch (event) { |
954 | case RES_MAX_USAGE: | 956 | case RES_MAX_USAGE: |
955 | res_counter_reset_max(&mem->res); | 957 | res_counter_reset_max(&mem->res); |
956 | break; | 958 | break; |
957 | case RES_FAILCNT: | 959 | case RES_FAILCNT: |
958 | res_counter_reset_failcnt(&mem->res); | 960 | res_counter_reset_failcnt(&mem->res); |
959 | break; | 961 | break; |
960 | } | 962 | } |
961 | return 0; | 963 | return 0; |
962 | } | 964 | } |
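mem_cgroup_reset() backs two trigger files declared in the cftype table below: a write to memory.max_usage_in_bytes clears the usage high-water mark and a write to memory.failcnt clears the limit-failure counter. A minimal sketch of resetting both (the cgroup path is a placeholder):

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>

    /* Reset a memory cgroup counter by writing to its trigger file. */
    static int reset_counter(const char *path)
    {
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
            perror("open");
            return -1;
        }
        if (write(fd, "0", 1) < 0)
            perror("write");
        close(fd);
        return 0;
    }

    int main(void)
    {
        /* Placeholder paths; adjust to the actual cgroup mount. */
        reset_counter("/cgroup/demo/memory.max_usage_in_bytes");
        reset_counter("/cgroup/demo/memory.failcnt");
        return 0;
    }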
963 | 965 | ||
964 | static int mem_force_empty_write(struct cgroup *cont, unsigned int event) | 966 | static int mem_force_empty_write(struct cgroup *cont, unsigned int event) |
965 | { | 967 | { |
966 | return mem_cgroup_force_empty(mem_cgroup_from_cont(cont)); | 968 | return mem_cgroup_force_empty(mem_cgroup_from_cont(cont)); |
967 | } | 969 | } |
968 | 970 | ||
969 | static const struct mem_cgroup_stat_desc { | 971 | static const struct mem_cgroup_stat_desc { |
970 | const char *msg; | 972 | const char *msg; |
971 | u64 unit; | 973 | u64 unit; |
972 | } mem_cgroup_stat_desc[] = { | 974 | } mem_cgroup_stat_desc[] = { |
973 | [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, | 975 | [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, |
974 | [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, | 976 | [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, |
975 | [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, }, | 977 | [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, }, |
976 | [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, }, | 978 | [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, }, |
977 | }; | 979 | }; |
978 | 980 | ||
979 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | 981 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, |
980 | struct cgroup_map_cb *cb) | 982 | struct cgroup_map_cb *cb) |
981 | { | 983 | { |
982 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); | 984 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); |
983 | struct mem_cgroup_stat *stat = &mem_cont->stat; | 985 | struct mem_cgroup_stat *stat = &mem_cont->stat; |
984 | int i; | 986 | int i; |
985 | 987 | ||
986 | for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) { | 988 | for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) { |
987 | s64 val; | 989 | s64 val; |
988 | 990 | ||
989 | val = mem_cgroup_read_stat(stat, i); | 991 | val = mem_cgroup_read_stat(stat, i); |
990 | val *= mem_cgroup_stat_desc[i].unit; | 992 | val *= mem_cgroup_stat_desc[i].unit; |
991 | cb->fill(cb, mem_cgroup_stat_desc[i].msg, val); | 993 | cb->fill(cb, mem_cgroup_stat_desc[i].msg, val); |
992 | } | 994 | } |
993 | /* showing # of active pages */ | 995 | /* showing # of active pages */ |
994 | { | 996 | { |
995 | unsigned long active, inactive; | 997 | unsigned long active, inactive; |
996 | 998 | ||
997 | inactive = mem_cgroup_get_all_zonestat(mem_cont, | 999 | inactive = mem_cgroup_get_all_zonestat(mem_cont, |
998 | MEM_CGROUP_ZSTAT_INACTIVE); | 1000 | MEM_CGROUP_ZSTAT_INACTIVE); |
999 | active = mem_cgroup_get_all_zonestat(mem_cont, | 1001 | active = mem_cgroup_get_all_zonestat(mem_cont, |
1000 | MEM_CGROUP_ZSTAT_ACTIVE); | 1002 | MEM_CGROUP_ZSTAT_ACTIVE); |
1001 | cb->fill(cb, "active", (active) * PAGE_SIZE); | 1003 | cb->fill(cb, "active", (active) * PAGE_SIZE); |
1002 | cb->fill(cb, "inactive", (inactive) * PAGE_SIZE); | 1004 | cb->fill(cb, "inactive", (inactive) * PAGE_SIZE); |
1003 | } | 1005 | } |
1004 | return 0; | 1006 | return 0; |
1005 | } | 1007 | } |
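mem_control_stat_show() emits "key value" pairs (cache, rss, pgpgin, pgpgout, plus the active/inactive byte counts) through the cgroup map callback; userspace reads them from the group's memory.stat file declared in the table below. A small reader, with the cgroup path again a placeholder:

    #include <stdio.h>

    /* Read memory.stat of a cgroup and print each "key value" line parsed
     * into its two fields.  The byte-counted keys (cache, rss, active,
     * inactive) are already multiplied by PAGE_SIZE by the kernel. */
    int main(void)
    {
        char key[64];
        unsigned long long val;
        FILE *f = fopen("/cgroup/demo/memory.stat", "r");   /* placeholder */

        if (!f) {
            perror("fopen");
            return 1;
        }
        while (fscanf(f, "%63s %llu", key, &val) == 2)
            printf("%-10s = %llu\n", key, val);
        fclose(f);
        return 0;
    }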
1006 | 1008 | ||
1007 | static struct cftype mem_cgroup_files[] = { | 1009 | static struct cftype mem_cgroup_files[] = { |
1008 | { | 1010 | { |
1009 | .name = "usage_in_bytes", | 1011 | .name = "usage_in_bytes", |
1010 | .private = RES_USAGE, | 1012 | .private = RES_USAGE, |
1011 | .read_u64 = mem_cgroup_read, | 1013 | .read_u64 = mem_cgroup_read, |
1012 | }, | 1014 | }, |
1013 | { | 1015 | { |
1014 | .name = "max_usage_in_bytes", | 1016 | .name = "max_usage_in_bytes", |
1015 | .private = RES_MAX_USAGE, | 1017 | .private = RES_MAX_USAGE, |
1016 | .trigger = mem_cgroup_reset, | 1018 | .trigger = mem_cgroup_reset, |
1017 | .read_u64 = mem_cgroup_read, | 1019 | .read_u64 = mem_cgroup_read, |
1018 | }, | 1020 | }, |
1019 | { | 1021 | { |
1020 | .name = "limit_in_bytes", | 1022 | .name = "limit_in_bytes", |
1021 | .private = RES_LIMIT, | 1023 | .private = RES_LIMIT, |
1022 | .write_string = mem_cgroup_write, | 1024 | .write_string = mem_cgroup_write, |
1023 | .read_u64 = mem_cgroup_read, | 1025 | .read_u64 = mem_cgroup_read, |
1024 | }, | 1026 | }, |
1025 | { | 1027 | { |
1026 | .name = "failcnt", | 1028 | .name = "failcnt", |
1027 | .private = RES_FAILCNT, | 1029 | .private = RES_FAILCNT, |
1028 | .trigger = mem_cgroup_reset, | 1030 | .trigger = mem_cgroup_reset, |
1029 | .read_u64 = mem_cgroup_read, | 1031 | .read_u64 = mem_cgroup_read, |
1030 | }, | 1032 | }, |
1031 | { | 1033 | { |
1032 | .name = "force_empty", | 1034 | .name = "force_empty", |
1033 | .trigger = mem_force_empty_write, | 1035 | .trigger = mem_force_empty_write, |
1034 | }, | 1036 | }, |
1035 | { | 1037 | { |
1036 | .name = "stat", | 1038 | .name = "stat", |
1037 | .read_map = mem_control_stat_show, | 1039 | .read_map = mem_control_stat_show, |
1038 | }, | 1040 | }, |
1039 | }; | 1041 | }; |
1040 | 1042 | ||
1041 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | 1043 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) |
1042 | { | 1044 | { |
1043 | struct mem_cgroup_per_node *pn; | 1045 | struct mem_cgroup_per_node *pn; |
1044 | struct mem_cgroup_per_zone *mz; | 1046 | struct mem_cgroup_per_zone *mz; |
1045 | int zone, tmp = node; | 1047 | int zone, tmp = node; |
1046 | /* | 1048 | /* |
1047 | * This routine is called for each possible node. | 1049 | * This routine is called for each possible node. |
1048 | * But it's a BUG to call kmalloc() against an offline node. | 1050 | * But it's a BUG to call kmalloc() against an offline node. |
1049 | * | 1051 | * |
1050 | * TODO: this routine can waste a lot of memory for nodes which will | 1052 | * TODO: this routine can waste a lot of memory for nodes which will |
1051 | * never be onlined. It would be better to use a memory hotplug | 1053 | * never be onlined. It would be better to use a memory hotplug |
1052 | * callback function. | 1054 | * callback function. |
1053 | */ | 1055 | */ |
1054 | if (!node_state(node, N_NORMAL_MEMORY)) | 1056 | if (!node_state(node, N_NORMAL_MEMORY)) |
1055 | tmp = -1; | 1057 | tmp = -1; |
1056 | pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); | 1058 | pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); |
1057 | if (!pn) | 1059 | if (!pn) |
1058 | return 1; | 1060 | return 1; |
1059 | 1061 | ||
1060 | mem->info.nodeinfo[node] = pn; | 1062 | mem->info.nodeinfo[node] = pn; |
1061 | memset(pn, 0, sizeof(*pn)); | 1063 | memset(pn, 0, sizeof(*pn)); |
1062 | 1064 | ||
1063 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 1065 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
1064 | mz = &pn->zoneinfo[zone]; | 1066 | mz = &pn->zoneinfo[zone]; |
1065 | INIT_LIST_HEAD(&mz->active_list); | 1067 | INIT_LIST_HEAD(&mz->active_list); |
1066 | INIT_LIST_HEAD(&mz->inactive_list); | 1068 | INIT_LIST_HEAD(&mz->inactive_list); |
1067 | spin_lock_init(&mz->lru_lock); | 1069 | spin_lock_init(&mz->lru_lock); |
1068 | } | 1070 | } |
1069 | return 0; | 1071 | return 0; |
1070 | } | 1072 | } |
1071 | 1073 | ||
1072 | static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | 1074 | static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) |
1073 | { | 1075 | { |
1074 | kfree(mem->info.nodeinfo[node]); | 1076 | kfree(mem->info.nodeinfo[node]); |
1075 | } | 1077 | } |
1076 | 1078 | ||
1077 | static struct mem_cgroup *mem_cgroup_alloc(void) | 1079 | static struct mem_cgroup *mem_cgroup_alloc(void) |
1078 | { | 1080 | { |
1079 | struct mem_cgroup *mem; | 1081 | struct mem_cgroup *mem; |
1080 | 1082 | ||
1081 | if (sizeof(*mem) < PAGE_SIZE) | 1083 | if (sizeof(*mem) < PAGE_SIZE) |
1082 | mem = kmalloc(sizeof(*mem), GFP_KERNEL); | 1084 | mem = kmalloc(sizeof(*mem), GFP_KERNEL); |
1083 | else | 1085 | else |
1084 | mem = vmalloc(sizeof(*mem)); | 1086 | mem = vmalloc(sizeof(*mem)); |
1085 | 1087 | ||
1086 | if (mem) | 1088 | if (mem) |
1087 | memset(mem, 0, sizeof(*mem)); | 1089 | memset(mem, 0, sizeof(*mem)); |
1088 | return mem; | 1090 | return mem; |
1089 | } | 1091 | } |
1090 | 1092 | ||
1091 | static void mem_cgroup_free(struct mem_cgroup *mem) | 1093 | static void mem_cgroup_free(struct mem_cgroup *mem) |
1092 | { | 1094 | { |
1093 | if (sizeof(*mem) < PAGE_SIZE) | 1095 | if (sizeof(*mem) < PAGE_SIZE) |
1094 | kfree(mem); | 1096 | kfree(mem); |
1095 | else | 1097 | else |
1096 | vfree(mem); | 1098 | vfree(mem); |
1097 | } | 1099 | } |
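mem_cgroup_alloc()/mem_cgroup_free() pick kmalloc()/kfree() while struct mem_cgroup still fits in a page and fall back to vmalloc()/vfree() once it does not, avoiding a large physically contiguous allocation. The snippet below mirrors that decision with a userspace analogue (malloc() for small objects, anonymous mmap() for large ones); it illustrates the pattern only and is not kernel code.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    /* Userspace analogue of the kmalloc-vs-vmalloc choice: small objects come
     * from malloc(), big ones from an anonymous mmap() (page-backed, never
     * needing a large contiguous heap chunk). */
    static void *alloc_obj(size_t size, long page_size)
    {
        void *p;

        if (size < (size_t)page_size)
            p = malloc(size);
        else
            p = mmap(NULL, size, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p && p != MAP_FAILED)
            memset(p, 0, size);         /* like the memset() after allocation */
        return (p == MAP_FAILED) ? NULL : p;
    }

    static void free_obj(void *p, size_t size, long page_size)
    {
        if (size < (size_t)page_size)
            free(p);
        else
            munmap(p, size);
    }

    int main(void)
    {
        long page_size = sysconf(_SC_PAGESIZE);
        void *small = alloc_obj(128, page_size);
        void *big = alloc_obj(1 << 20, page_size);

        printf("small=%p big=%p\n", small, big);
        free_obj(small, 128, page_size);
        free_obj(big, 1 << 20, page_size);
        return 0;
    }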
1098 | 1100 | ||
1099 | 1101 | ||
1100 | static struct cgroup_subsys_state * | 1102 | static struct cgroup_subsys_state * |
1101 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | 1103 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) |
1102 | { | 1104 | { |
1103 | struct mem_cgroup *mem; | 1105 | struct mem_cgroup *mem; |
1104 | int node; | 1106 | int node; |
1105 | 1107 | ||
1106 | if (unlikely((cont->parent) == NULL)) { | 1108 | if (unlikely((cont->parent) == NULL)) { |
1107 | mem = &init_mem_cgroup; | 1109 | mem = &init_mem_cgroup; |
1108 | page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC); | 1110 | page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC); |
1109 | } else { | 1111 | } else { |
1110 | mem = mem_cgroup_alloc(); | 1112 | mem = mem_cgroup_alloc(); |
1111 | if (!mem) | 1113 | if (!mem) |
1112 | return ERR_PTR(-ENOMEM); | 1114 | return ERR_PTR(-ENOMEM); |
1113 | } | 1115 | } |
1114 | 1116 | ||
1115 | res_counter_init(&mem->res); | 1117 | res_counter_init(&mem->res); |
1116 | 1118 | ||
1117 | for_each_node_state(node, N_POSSIBLE) | 1119 | for_each_node_state(node, N_POSSIBLE) |
1118 | if (alloc_mem_cgroup_per_zone_info(mem, node)) | 1120 | if (alloc_mem_cgroup_per_zone_info(mem, node)) |
1119 | goto free_out; | 1121 | goto free_out; |
1120 | 1122 | ||
1121 | return &mem->css; | 1123 | return &mem->css; |
1122 | free_out: | 1124 | free_out: |
1123 | for_each_node_state(node, N_POSSIBLE) | 1125 | for_each_node_state(node, N_POSSIBLE) |
1124 | free_mem_cgroup_per_zone_info(mem, node); | 1126 | free_mem_cgroup_per_zone_info(mem, node); |
1125 | if (cont->parent != NULL) | 1127 | if (cont->parent != NULL) |
1126 | mem_cgroup_free(mem); | 1128 | mem_cgroup_free(mem); |
1127 | return ERR_PTR(-ENOMEM); | 1129 | return ERR_PTR(-ENOMEM); |
1128 | } | 1130 | } |
1129 | 1131 | ||
1130 | static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, | 1132 | static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, |
1131 | struct cgroup *cont) | 1133 | struct cgroup *cont) |
1132 | { | 1134 | { |
1133 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 1135 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
1134 | mem_cgroup_force_empty(mem); | 1136 | mem_cgroup_force_empty(mem); |
1135 | } | 1137 | } |
1136 | 1138 | ||
1137 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, | 1139 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, |
1138 | struct cgroup *cont) | 1140 | struct cgroup *cont) |
1139 | { | 1141 | { |
1140 | int node; | 1142 | int node; |
1141 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 1143 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
1142 | 1144 | ||
1143 | for_each_node_state(node, N_POSSIBLE) | 1145 | for_each_node_state(node, N_POSSIBLE) |
1144 | free_mem_cgroup_per_zone_info(mem, node); | 1146 | free_mem_cgroup_per_zone_info(mem, node); |
1145 | 1147 | ||
1146 | mem_cgroup_free(mem_cgroup_from_cont(cont)); | 1148 | mem_cgroup_free(mem_cgroup_from_cont(cont)); |
1147 | } | 1149 | } |
1148 | 1150 | ||
1149 | static int mem_cgroup_populate(struct cgroup_subsys *ss, | 1151 | static int mem_cgroup_populate(struct cgroup_subsys *ss, |
1150 | struct cgroup *cont) | 1152 | struct cgroup *cont) |
1151 | { | 1153 | { |
1152 | return cgroup_add_files(cont, ss, mem_cgroup_files, | 1154 | return cgroup_add_files(cont, ss, mem_cgroup_files, |
1153 | ARRAY_SIZE(mem_cgroup_files)); | 1155 | ARRAY_SIZE(mem_cgroup_files)); |
1154 | } | 1156 | } |
1155 | 1157 | ||
1156 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 1158 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
1157 | struct cgroup *cont, | 1159 | struct cgroup *cont, |
1158 | struct cgroup *old_cont, | 1160 | struct cgroup *old_cont, |
1159 | struct task_struct *p) | 1161 | struct task_struct *p) |
1160 | { | 1162 | { |
1161 | struct mm_struct *mm; | 1163 | struct mm_struct *mm; |
1162 | struct mem_cgroup *mem, *old_mem; | 1164 | struct mem_cgroup *mem, *old_mem; |
1163 | 1165 | ||
1164 | mm = get_task_mm(p); | 1166 | mm = get_task_mm(p); |
1165 | if (mm == NULL) | 1167 | if (mm == NULL) |
1166 | return; | 1168 | return; |
1167 | 1169 | ||
1168 | mem = mem_cgroup_from_cont(cont); | 1170 | mem = mem_cgroup_from_cont(cont); |
1169 | old_mem = mem_cgroup_from_cont(old_cont); | 1171 | old_mem = mem_cgroup_from_cont(old_cont); |
1170 | 1172 | ||
1171 | /* | 1173 | /* |
1172 | * Only thread group leaders are allowed to migrate; the mm_struct is | 1174 | * Only thread group leaders are allowed to migrate; the mm_struct is |
1173 | * in effect owned by the leader. | 1175 | * in effect owned by the leader. |
1174 | */ | 1176 | */ |
1175 | if (!thread_group_leader(p)) | 1177 | if (!thread_group_leader(p)) |
1176 | goto out; | 1178 | goto out; |
1177 | 1179 | ||
1178 | out: | 1180 | out: |
1179 | mmput(mm); | 1181 | mmput(mm); |
1180 | } | 1182 | } |
1181 | 1183 | ||
1182 | struct cgroup_subsys mem_cgroup_subsys = { | 1184 | struct cgroup_subsys mem_cgroup_subsys = { |
1183 | .name = "memory", | 1185 | .name = "memory", |
1184 | .subsys_id = mem_cgroup_subsys_id, | 1186 | .subsys_id = mem_cgroup_subsys_id, |
1185 | .create = mem_cgroup_create, | 1187 | .create = mem_cgroup_create, |
1186 | .pre_destroy = mem_cgroup_pre_destroy, | 1188 | .pre_destroy = mem_cgroup_pre_destroy, |
1187 | .destroy = mem_cgroup_destroy, | 1189 | .destroy = mem_cgroup_destroy, |
1188 | .populate = mem_cgroup_populate, | 1190 | .populate = mem_cgroup_populate, |
1189 | .attach = mem_cgroup_move_task, | 1191 | .attach = mem_cgroup_move_task, |
1190 | .early_init = 0, | 1192 | .early_init = 0, |
1191 | }; | 1193 | }; |
1192 | 1194 |