Commit c8dad2bb6307f5b00f804a686917105206a4d5c9
Committed by Linus Torvalds
1 parent: f817ed4853
Exists in master and in 4 other branches
memcg: reduce size of mem_cgroup by using nr_cpu_ids
As Jan Blunck <jblunck@suse.de> pointed out, allocating the per-cpu stats for memcg with a size of NR_CPUS is wasteful. This patch changes mem_cgroup's cpustat allocation to be based on nr_cpu_ids rather than NR_CPUS.

Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
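The hunks shown below replace the fixed cpustat[NR_CPUS] array with a zero-length array at the end of struct mem_cgroup and drop the static init_mem_cgroup; the matching allocation-side change appears further down in the diff, past the portion shown here. As a rough sketch of the sizing idea only (the function name and the kmalloc/vmalloc split are illustrative, not a quote of the patch), the per-cpu stat area is allocated together with struct mem_cgroup and sized by nr_cpu_ids, the number of possible CPU ids on the running system, instead of the compile-time NR_CPUS maximum. The snippet relies on the slab.h and vmalloc.h includes already present in memcontrol.c.

/* Sketch: size the flexible cpustat[] tail by nr_cpu_ids, not NR_CPUS. */
static struct mem_cgroup *mem_cgroup_alloc(void)
{
	struct mem_cgroup *mem;
	size_t size = sizeof(struct mem_cgroup) +
		      nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);

	/* Small allocations go through the slab; large ones fall back to vmalloc. */
	if (size < PAGE_SIZE)
		mem = kmalloc(size, GFP_KERNEL);
	else
		mem = vmalloc(size);

	if (mem)
		memset(mem, 0, size);
	return mem;
}

On a kernel built with NR_CPUS=4096 but booted on a machine with a handful of CPUs, this shrinks each mem_cgroup from thousands of cacheline-aligned per-cpu slots to only as many as can actually exist.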
Showing 1 changed file with 18 additions and 17 deletions
mm/memcontrol.c
1 | /* memcontrol.c - Memory Controller | 1 | /* memcontrol.c - Memory Controller |
2 | * | 2 | * |
3 | * Copyright IBM Corporation, 2007 | 3 | * Copyright IBM Corporation, 2007 |
4 | * Author Balbir Singh <balbir@linux.vnet.ibm.com> | 4 | * Author Balbir Singh <balbir@linux.vnet.ibm.com> |
5 | * | 5 | * |
6 | * Copyright 2007 OpenVZ SWsoft Inc | 6 | * Copyright 2007 OpenVZ SWsoft Inc |
7 | * Author: Pavel Emelianov <xemul@openvz.org> | 7 | * Author: Pavel Emelianov <xemul@openvz.org> |
8 | * | 8 | * |
9 | * This program is free software; you can redistribute it and/or modify | 9 | * This program is free software; you can redistribute it and/or modify |
10 | * it under the terms of the GNU General Public License as published by | 10 | * it under the terms of the GNU General Public License as published by |
11 | * the Free Software Foundation; either version 2 of the License, or | 11 | * the Free Software Foundation; either version 2 of the License, or |
12 | * (at your option) any later version. | 12 | * (at your option) any later version. |
13 | * | 13 | * |
14 | * This program is distributed in the hope that it will be useful, | 14 | * This program is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
17 | * GNU General Public License for more details. | 17 | * GNU General Public License for more details. |
18 | */ | 18 | */ |
19 | 19 | ||
20 | #include <linux/res_counter.h> | 20 | #include <linux/res_counter.h> |
21 | #include <linux/memcontrol.h> | 21 | #include <linux/memcontrol.h> |
22 | #include <linux/cgroup.h> | 22 | #include <linux/cgroup.h> |
23 | #include <linux/mm.h> | 23 | #include <linux/mm.h> |
24 | #include <linux/smp.h> | 24 | #include <linux/smp.h> |
25 | #include <linux/page-flags.h> | 25 | #include <linux/page-flags.h> |
26 | #include <linux/backing-dev.h> | 26 | #include <linux/backing-dev.h> |
27 | #include <linux/bit_spinlock.h> | 27 | #include <linux/bit_spinlock.h> |
28 | #include <linux/rcupdate.h> | 28 | #include <linux/rcupdate.h> |
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/swap.h> | 30 | #include <linux/swap.h> |
31 | #include <linux/spinlock.h> | 31 | #include <linux/spinlock.h> |
32 | #include <linux/fs.h> | 32 | #include <linux/fs.h> |
33 | #include <linux/seq_file.h> | 33 | #include <linux/seq_file.h> |
34 | #include <linux/vmalloc.h> | 34 | #include <linux/vmalloc.h> |
35 | #include <linux/mm_inline.h> | 35 | #include <linux/mm_inline.h> |
36 | #include <linux/page_cgroup.h> | 36 | #include <linux/page_cgroup.h> |
37 | 37 | ||
38 | #include <asm/uaccess.h> | 38 | #include <asm/uaccess.h> |
39 | 39 | ||
40 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 40 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
41 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 41 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
42 | 42 | ||
43 | /* | 43 | /* |
44 | * Statistics for memory cgroup. | 44 | * Statistics for memory cgroup. |
45 | */ | 45 | */ |
46 | enum mem_cgroup_stat_index { | 46 | enum mem_cgroup_stat_index { |
47 | /* | 47 | /* |
48 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. | 48 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. |
49 | */ | 49 | */ |
50 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 50 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
51 | MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ | 51 | MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ |
52 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 52 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
53 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 53 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
54 | 54 | ||
55 | MEM_CGROUP_STAT_NSTATS, | 55 | MEM_CGROUP_STAT_NSTATS, |
56 | }; | 56 | }; |
57 | 57 | ||
58 | struct mem_cgroup_stat_cpu { | 58 | struct mem_cgroup_stat_cpu { |
59 | s64 count[MEM_CGROUP_STAT_NSTATS]; | 59 | s64 count[MEM_CGROUP_STAT_NSTATS]; |
60 | } ____cacheline_aligned_in_smp; | 60 | } ____cacheline_aligned_in_smp; |
61 | 61 | ||
62 | struct mem_cgroup_stat { | 62 | struct mem_cgroup_stat { |
63 | struct mem_cgroup_stat_cpu cpustat[NR_CPUS]; | 63 | struct mem_cgroup_stat_cpu cpustat[0]; |
64 | }; | 64 | }; |
65 | 65 | ||
66 | /* | 66 | /* |
67 | * For accounting under irq disable, no need for increment preempt count. | 67 | * For accounting under irq disable, no need for increment preempt count. |
68 | */ | 68 | */ |
69 | static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, | 69 | static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, |
70 | enum mem_cgroup_stat_index idx, int val) | 70 | enum mem_cgroup_stat_index idx, int val) |
71 | { | 71 | { |
72 | stat->count[idx] += val; | 72 | stat->count[idx] += val; |
73 | } | 73 | } |
74 | 74 | ||
75 | static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, | 75 | static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, |
76 | enum mem_cgroup_stat_index idx) | 76 | enum mem_cgroup_stat_index idx) |
77 | { | 77 | { |
78 | int cpu; | 78 | int cpu; |
79 | s64 ret = 0; | 79 | s64 ret = 0; |
80 | for_each_possible_cpu(cpu) | 80 | for_each_possible_cpu(cpu) |
81 | ret += stat->cpustat[cpu].count[idx]; | 81 | ret += stat->cpustat[cpu].count[idx]; |
82 | return ret; | 82 | return ret; |
83 | } | 83 | } |
84 | 84 | ||
85 | /* | 85 | /* |
86 | * per-zone information in memory controller. | 86 | * per-zone information in memory controller. |
87 | */ | 87 | */ |
88 | struct mem_cgroup_per_zone { | 88 | struct mem_cgroup_per_zone { |
89 | /* | 89 | /* |
90 | * spin_lock to protect the per cgroup LRU | 90 | * spin_lock to protect the per cgroup LRU |
91 | */ | 91 | */ |
92 | spinlock_t lru_lock; | 92 | spinlock_t lru_lock; |
93 | struct list_head lists[NR_LRU_LISTS]; | 93 | struct list_head lists[NR_LRU_LISTS]; |
94 | unsigned long count[NR_LRU_LISTS]; | 94 | unsigned long count[NR_LRU_LISTS]; |
95 | }; | 95 | }; |
96 | /* Macro for accessing counter */ | 96 | /* Macro for accessing counter */ |
97 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) | 97 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) |
98 | 98 | ||
99 | struct mem_cgroup_per_node { | 99 | struct mem_cgroup_per_node { |
100 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; | 100 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; |
101 | }; | 101 | }; |
102 | 102 | ||
103 | struct mem_cgroup_lru_info { | 103 | struct mem_cgroup_lru_info { |
104 | struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; | 104 | struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; |
105 | }; | 105 | }; |
106 | 106 | ||
107 | /* | 107 | /* |
108 | * The memory controller data structure. The memory controller controls both | 108 | * The memory controller data structure. The memory controller controls both |
109 | * page cache and RSS per cgroup. We would eventually like to provide | 109 | * page cache and RSS per cgroup. We would eventually like to provide |
110 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, | 110 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, |
111 | * to help the administrator determine what knobs to tune. | 111 | * to help the administrator determine what knobs to tune. |
112 | * | 112 | * |
113 | * TODO: Add a water mark for the memory controller. Reclaim will begin when | 113 | * TODO: Add a water mark for the memory controller. Reclaim will begin when |
114 | * we hit the water mark. May be even add a low water mark, such that | 114 | * we hit the water mark. May be even add a low water mark, such that |
115 | * no reclaim occurs from a cgroup at it's low water mark, this is | 115 | * no reclaim occurs from a cgroup at it's low water mark, this is |
116 | * a feature that will be implemented much later in the future. | 116 | * a feature that will be implemented much later in the future. |
117 | */ | 117 | */ |
118 | struct mem_cgroup { | 118 | struct mem_cgroup { |
119 | struct cgroup_subsys_state css; | 119 | struct cgroup_subsys_state css; |
120 | /* | 120 | /* |
121 | * the counter to account for memory usage | 121 | * the counter to account for memory usage |
122 | */ | 122 | */ |
123 | struct res_counter res; | 123 | struct res_counter res; |
124 | /* | 124 | /* |
125 | * Per cgroup active and inactive list, similar to the | 125 | * Per cgroup active and inactive list, similar to the |
126 | * per zone LRU lists. | 126 | * per zone LRU lists. |
127 | */ | 127 | */ |
128 | struct mem_cgroup_lru_info info; | 128 | struct mem_cgroup_lru_info info; |
129 | 129 | ||
130 | int prev_priority; /* for recording reclaim priority */ | 130 | int prev_priority; /* for recording reclaim priority */ |
131 | /* | 131 | /* |
132 | * statistics. | 132 | * statistics. This must be placed at the end of memcg. |
133 | */ | 133 | */ |
134 | struct mem_cgroup_stat stat; | 134 | struct mem_cgroup_stat stat; |
135 | }; | 135 | }; |
136 | static struct mem_cgroup init_mem_cgroup; | ||
137 | 136 | ||
138 | enum charge_type { | 137 | enum charge_type { |
139 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 138 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
140 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 139 | MEM_CGROUP_CHARGE_TYPE_MAPPED, |
141 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ | 140 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ |
142 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ | 141 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ |
143 | NR_CHARGE_TYPE, | 142 | NR_CHARGE_TYPE, |
144 | }; | 143 | }; |
145 | 144 | ||
146 | /* only for here (for easy reading.) */ | 145 | /* only for here (for easy reading.) */ |
147 | #define PCGF_CACHE (1UL << PCG_CACHE) | 146 | #define PCGF_CACHE (1UL << PCG_CACHE) |
148 | #define PCGF_USED (1UL << PCG_USED) | 147 | #define PCGF_USED (1UL << PCG_USED) |
149 | #define PCGF_ACTIVE (1UL << PCG_ACTIVE) | 148 | #define PCGF_ACTIVE (1UL << PCG_ACTIVE) |
150 | #define PCGF_LOCK (1UL << PCG_LOCK) | 149 | #define PCGF_LOCK (1UL << PCG_LOCK) |
151 | #define PCGF_FILE (1UL << PCG_FILE) | 150 | #define PCGF_FILE (1UL << PCG_FILE) |
152 | static const unsigned long | 151 | static const unsigned long |
153 | pcg_default_flags[NR_CHARGE_TYPE] = { | 152 | pcg_default_flags[NR_CHARGE_TYPE] = { |
154 | PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ | 153 | PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ |
155 | PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ | 154 | PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ |
156 | PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ | 155 | PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ |
157 | 0, /* FORCE */ | 156 | 0, /* FORCE */ |
158 | }; | 157 | }; |
159 | 158 | ||
160 | /* | 159 | /* |
161 | * Always modified under lru lock. Then, not necessary to preempt_disable() | 160 | * Always modified under lru lock. Then, not necessary to preempt_disable() |
162 | */ | 161 | */ |
163 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 162 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
164 | struct page_cgroup *pc, | 163 | struct page_cgroup *pc, |
165 | bool charge) | 164 | bool charge) |
166 | { | 165 | { |
167 | int val = (charge)? 1 : -1; | 166 | int val = (charge)? 1 : -1; |
168 | struct mem_cgroup_stat *stat = &mem->stat; | 167 | struct mem_cgroup_stat *stat = &mem->stat; |
169 | struct mem_cgroup_stat_cpu *cpustat; | 168 | struct mem_cgroup_stat_cpu *cpustat; |
170 | 169 | ||
171 | VM_BUG_ON(!irqs_disabled()); | 170 | VM_BUG_ON(!irqs_disabled()); |
172 | 171 | ||
173 | cpustat = &stat->cpustat[smp_processor_id()]; | 172 | cpustat = &stat->cpustat[smp_processor_id()]; |
174 | if (PageCgroupCache(pc)) | 173 | if (PageCgroupCache(pc)) |
175 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); | 174 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); |
176 | else | 175 | else |
177 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); | 176 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); |
178 | 177 | ||
179 | if (charge) | 178 | if (charge) |
180 | __mem_cgroup_stat_add_safe(cpustat, | 179 | __mem_cgroup_stat_add_safe(cpustat, |
181 | MEM_CGROUP_STAT_PGPGIN_COUNT, 1); | 180 | MEM_CGROUP_STAT_PGPGIN_COUNT, 1); |
182 | else | 181 | else |
183 | __mem_cgroup_stat_add_safe(cpustat, | 182 | __mem_cgroup_stat_add_safe(cpustat, |
184 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); | 183 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); |
185 | } | 184 | } |
186 | 185 | ||
187 | static struct mem_cgroup_per_zone * | 186 | static struct mem_cgroup_per_zone * |
188 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | 187 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) |
189 | { | 188 | { |
190 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | 189 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; |
191 | } | 190 | } |
192 | 191 | ||
193 | static struct mem_cgroup_per_zone * | 192 | static struct mem_cgroup_per_zone * |
194 | page_cgroup_zoneinfo(struct page_cgroup *pc) | 193 | page_cgroup_zoneinfo(struct page_cgroup *pc) |
195 | { | 194 | { |
196 | struct mem_cgroup *mem = pc->mem_cgroup; | 195 | struct mem_cgroup *mem = pc->mem_cgroup; |
197 | int nid = page_cgroup_nid(pc); | 196 | int nid = page_cgroup_nid(pc); |
198 | int zid = page_cgroup_zid(pc); | 197 | int zid = page_cgroup_zid(pc); |
199 | 198 | ||
200 | return mem_cgroup_zoneinfo(mem, nid, zid); | 199 | return mem_cgroup_zoneinfo(mem, nid, zid); |
201 | } | 200 | } |
202 | 201 | ||
203 | static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, | 202 | static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, |
204 | enum lru_list idx) | 203 | enum lru_list idx) |
205 | { | 204 | { |
206 | int nid, zid; | 205 | int nid, zid; |
207 | struct mem_cgroup_per_zone *mz; | 206 | struct mem_cgroup_per_zone *mz; |
208 | u64 total = 0; | 207 | u64 total = 0; |
209 | 208 | ||
210 | for_each_online_node(nid) | 209 | for_each_online_node(nid) |
211 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 210 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
212 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | 211 | mz = mem_cgroup_zoneinfo(mem, nid, zid); |
213 | total += MEM_CGROUP_ZSTAT(mz, idx); | 212 | total += MEM_CGROUP_ZSTAT(mz, idx); |
214 | } | 213 | } |
215 | return total; | 214 | return total; |
216 | } | 215 | } |
217 | 216 | ||
218 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | 217 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
219 | { | 218 | { |
220 | return container_of(cgroup_subsys_state(cont, | 219 | return container_of(cgroup_subsys_state(cont, |
221 | mem_cgroup_subsys_id), struct mem_cgroup, | 220 | mem_cgroup_subsys_id), struct mem_cgroup, |
222 | css); | 221 | css); |
223 | } | 222 | } |
224 | 223 | ||
225 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | 224 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) |
226 | { | 225 | { |
227 | /* | 226 | /* |
228 | * mm_update_next_owner() may clear mm->owner to NULL | 227 | * mm_update_next_owner() may clear mm->owner to NULL |
229 | * if it races with swapoff, page migration, etc. | 228 | * if it races with swapoff, page migration, etc. |
230 | * So this can be called with p == NULL. | 229 | * So this can be called with p == NULL. |
231 | */ | 230 | */ |
232 | if (unlikely(!p)) | 231 | if (unlikely(!p)) |
233 | return NULL; | 232 | return NULL; |
234 | 233 | ||
235 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), | 234 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), |
236 | struct mem_cgroup, css); | 235 | struct mem_cgroup, css); |
237 | } | 236 | } |
238 | 237 | ||
239 | static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, | 238 | static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, |
240 | struct page_cgroup *pc) | 239 | struct page_cgroup *pc) |
241 | { | 240 | { |
242 | int lru = LRU_BASE; | 241 | int lru = LRU_BASE; |
243 | 242 | ||
244 | if (PageCgroupUnevictable(pc)) | 243 | if (PageCgroupUnevictable(pc)) |
245 | lru = LRU_UNEVICTABLE; | 244 | lru = LRU_UNEVICTABLE; |
246 | else { | 245 | else { |
247 | if (PageCgroupActive(pc)) | 246 | if (PageCgroupActive(pc)) |
248 | lru += LRU_ACTIVE; | 247 | lru += LRU_ACTIVE; |
249 | if (PageCgroupFile(pc)) | 248 | if (PageCgroupFile(pc)) |
250 | lru += LRU_FILE; | 249 | lru += LRU_FILE; |
251 | } | 250 | } |
252 | 251 | ||
253 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; | 252 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; |
254 | 253 | ||
255 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false); | 254 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false); |
256 | list_del(&pc->lru); | 255 | list_del(&pc->lru); |
257 | } | 256 | } |
258 | 257 | ||
259 | static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, | 258 | static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, |
260 | struct page_cgroup *pc, bool hot) | 259 | struct page_cgroup *pc, bool hot) |
261 | { | 260 | { |
262 | int lru = LRU_BASE; | 261 | int lru = LRU_BASE; |
263 | 262 | ||
264 | if (PageCgroupUnevictable(pc)) | 263 | if (PageCgroupUnevictable(pc)) |
265 | lru = LRU_UNEVICTABLE; | 264 | lru = LRU_UNEVICTABLE; |
266 | else { | 265 | else { |
267 | if (PageCgroupActive(pc)) | 266 | if (PageCgroupActive(pc)) |
268 | lru += LRU_ACTIVE; | 267 | lru += LRU_ACTIVE; |
269 | if (PageCgroupFile(pc)) | 268 | if (PageCgroupFile(pc)) |
270 | lru += LRU_FILE; | 269 | lru += LRU_FILE; |
271 | } | 270 | } |
272 | 271 | ||
273 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | 272 | MEM_CGROUP_ZSTAT(mz, lru) += 1; |
274 | if (hot) | 273 | if (hot) |
275 | list_add(&pc->lru, &mz->lists[lru]); | 274 | list_add(&pc->lru, &mz->lists[lru]); |
276 | else | 275 | else |
277 | list_add_tail(&pc->lru, &mz->lists[lru]); | 276 | list_add_tail(&pc->lru, &mz->lists[lru]); |
278 | 277 | ||
279 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); | 278 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); |
280 | } | 279 | } |
281 | 280 | ||
282 | static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru) | 281 | static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru) |
283 | { | 282 | { |
284 | struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); | 283 | struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); |
285 | int active = PageCgroupActive(pc); | 284 | int active = PageCgroupActive(pc); |
286 | int file = PageCgroupFile(pc); | 285 | int file = PageCgroupFile(pc); |
287 | int unevictable = PageCgroupUnevictable(pc); | 286 | int unevictable = PageCgroupUnevictable(pc); |
288 | enum lru_list from = unevictable ? LRU_UNEVICTABLE : | 287 | enum lru_list from = unevictable ? LRU_UNEVICTABLE : |
289 | (LRU_FILE * !!file + !!active); | 288 | (LRU_FILE * !!file + !!active); |
290 | 289 | ||
291 | if (lru == from) | 290 | if (lru == from) |
292 | return; | 291 | return; |
293 | 292 | ||
294 | MEM_CGROUP_ZSTAT(mz, from) -= 1; | 293 | MEM_CGROUP_ZSTAT(mz, from) -= 1; |
295 | /* | 294 | /* |
296 | * However this is done under mz->lru_lock, another flags, which | 295 | * However this is done under mz->lru_lock, another flags, which |
297 | * are not related to LRU, will be modified from out-of-lock. | 296 | * are not related to LRU, will be modified from out-of-lock. |
298 | * We have to use atomic set/clear flags. | 297 | * We have to use atomic set/clear flags. |
299 | */ | 298 | */ |
300 | if (is_unevictable_lru(lru)) { | 299 | if (is_unevictable_lru(lru)) { |
301 | ClearPageCgroupActive(pc); | 300 | ClearPageCgroupActive(pc); |
302 | SetPageCgroupUnevictable(pc); | 301 | SetPageCgroupUnevictable(pc); |
303 | } else { | 302 | } else { |
304 | if (is_active_lru(lru)) | 303 | if (is_active_lru(lru)) |
305 | SetPageCgroupActive(pc); | 304 | SetPageCgroupActive(pc); |
306 | else | 305 | else |
307 | ClearPageCgroupActive(pc); | 306 | ClearPageCgroupActive(pc); |
308 | ClearPageCgroupUnevictable(pc); | 307 | ClearPageCgroupUnevictable(pc); |
309 | } | 308 | } |
310 | 309 | ||
311 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | 310 | MEM_CGROUP_ZSTAT(mz, lru) += 1; |
312 | list_move(&pc->lru, &mz->lists[lru]); | 311 | list_move(&pc->lru, &mz->lists[lru]); |
313 | } | 312 | } |
314 | 313 | ||
315 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | 314 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) |
316 | { | 315 | { |
317 | int ret; | 316 | int ret; |
318 | 317 | ||
319 | task_lock(task); | 318 | task_lock(task); |
320 | ret = task->mm && mm_match_cgroup(task->mm, mem); | 319 | ret = task->mm && mm_match_cgroup(task->mm, mem); |
321 | task_unlock(task); | 320 | task_unlock(task); |
322 | return ret; | 321 | return ret; |
323 | } | 322 | } |
324 | 323 | ||
325 | /* | 324 | /* |
326 | * This routine assumes that the appropriate zone's lru lock is already held | 325 | * This routine assumes that the appropriate zone's lru lock is already held |
327 | */ | 326 | */ |
328 | void mem_cgroup_move_lists(struct page *page, enum lru_list lru) | 327 | void mem_cgroup_move_lists(struct page *page, enum lru_list lru) |
329 | { | 328 | { |
330 | struct page_cgroup *pc; | 329 | struct page_cgroup *pc; |
331 | struct mem_cgroup_per_zone *mz; | 330 | struct mem_cgroup_per_zone *mz; |
332 | unsigned long flags; | 331 | unsigned long flags; |
333 | 332 | ||
334 | if (mem_cgroup_subsys.disabled) | 333 | if (mem_cgroup_subsys.disabled) |
335 | return; | 334 | return; |
336 | 335 | ||
337 | /* | 336 | /* |
338 | * We cannot lock_page_cgroup while holding zone's lru_lock, | 337 | * We cannot lock_page_cgroup while holding zone's lru_lock, |
339 | * because other holders of lock_page_cgroup can be interrupted | 338 | * because other holders of lock_page_cgroup can be interrupted |
340 | * with an attempt to rotate_reclaimable_page. But we cannot | 339 | * with an attempt to rotate_reclaimable_page. But we cannot |
341 | * safely get to page_cgroup without it, so just try_lock it: | 340 | * safely get to page_cgroup without it, so just try_lock it: |
342 | * mem_cgroup_isolate_pages allows for page left on wrong list. | 341 | * mem_cgroup_isolate_pages allows for page left on wrong list. |
343 | */ | 342 | */ |
344 | pc = lookup_page_cgroup(page); | 343 | pc = lookup_page_cgroup(page); |
345 | if (!trylock_page_cgroup(pc)) | 344 | if (!trylock_page_cgroup(pc)) |
346 | return; | 345 | return; |
347 | if (pc && PageCgroupUsed(pc)) { | 346 | if (pc && PageCgroupUsed(pc)) { |
348 | mz = page_cgroup_zoneinfo(pc); | 347 | mz = page_cgroup_zoneinfo(pc); |
349 | spin_lock_irqsave(&mz->lru_lock, flags); | 348 | spin_lock_irqsave(&mz->lru_lock, flags); |
350 | __mem_cgroup_move_lists(pc, lru); | 349 | __mem_cgroup_move_lists(pc, lru); |
351 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 350 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
352 | } | 351 | } |
353 | unlock_page_cgroup(pc); | 352 | unlock_page_cgroup(pc); |
354 | } | 353 | } |
355 | 354 | ||
356 | /* | 355 | /* |
357 | * Calculate mapped_ratio under memory controller. This will be used in | 356 | * Calculate mapped_ratio under memory controller. This will be used in |
358 | * vmscan.c for deteremining we have to reclaim mapped pages. | 357 | * vmscan.c for deteremining we have to reclaim mapped pages. |
359 | */ | 358 | */ |
360 | int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) | 359 | int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) |
361 | { | 360 | { |
362 | long total, rss; | 361 | long total, rss; |
363 | 362 | ||
364 | /* | 363 | /* |
365 | * usage is recorded in bytes. But, here, we assume the number of | 364 | * usage is recorded in bytes. But, here, we assume the number of |
366 | * physical pages can be represented by "long" on any arch. | 365 | * physical pages can be represented by "long" on any arch. |
367 | */ | 366 | */ |
368 | total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L; | 367 | total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L; |
369 | rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); | 368 | rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); |
370 | return (int)((rss * 100L) / total); | 369 | return (int)((rss * 100L) / total); |
371 | } | 370 | } |
372 | 371 | ||
373 | /* | 372 | /* |
374 | * prev_priority control...this will be used in memory reclaim path. | 373 | * prev_priority control...this will be used in memory reclaim path. |
375 | */ | 374 | */ |
376 | int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) | 375 | int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) |
377 | { | 376 | { |
378 | return mem->prev_priority; | 377 | return mem->prev_priority; |
379 | } | 378 | } |
380 | 379 | ||
381 | void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) | 380 | void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) |
382 | { | 381 | { |
383 | if (priority < mem->prev_priority) | 382 | if (priority < mem->prev_priority) |
384 | mem->prev_priority = priority; | 383 | mem->prev_priority = priority; |
385 | } | 384 | } |
386 | 385 | ||
387 | void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) | 386 | void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) |
388 | { | 387 | { |
389 | mem->prev_priority = priority; | 388 | mem->prev_priority = priority; |
390 | } | 389 | } |
391 | 390 | ||
392 | /* | 391 | /* |
393 | * Calculate # of pages to be scanned in this priority/zone. | 392 | * Calculate # of pages to be scanned in this priority/zone. |
394 | * See also vmscan.c | 393 | * See also vmscan.c |
395 | * | 394 | * |
396 | * priority starts from "DEF_PRIORITY" and decremented in each loop. | 395 | * priority starts from "DEF_PRIORITY" and decremented in each loop. |
397 | * (see include/linux/mmzone.h) | 396 | * (see include/linux/mmzone.h) |
398 | */ | 397 | */ |
399 | 398 | ||
400 | long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, | 399 | long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, |
401 | int priority, enum lru_list lru) | 400 | int priority, enum lru_list lru) |
402 | { | 401 | { |
403 | long nr_pages; | 402 | long nr_pages; |
404 | int nid = zone->zone_pgdat->node_id; | 403 | int nid = zone->zone_pgdat->node_id; |
405 | int zid = zone_idx(zone); | 404 | int zid = zone_idx(zone); |
406 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); | 405 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); |
407 | 406 | ||
408 | nr_pages = MEM_CGROUP_ZSTAT(mz, lru); | 407 | nr_pages = MEM_CGROUP_ZSTAT(mz, lru); |
409 | 408 | ||
410 | return (nr_pages >> priority); | 409 | return (nr_pages >> priority); |
411 | } | 410 | } |
412 | 411 | ||
413 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | 412 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, |
414 | struct list_head *dst, | 413 | struct list_head *dst, |
415 | unsigned long *scanned, int order, | 414 | unsigned long *scanned, int order, |
416 | int mode, struct zone *z, | 415 | int mode, struct zone *z, |
417 | struct mem_cgroup *mem_cont, | 416 | struct mem_cgroup *mem_cont, |
418 | int active, int file) | 417 | int active, int file) |
419 | { | 418 | { |
420 | unsigned long nr_taken = 0; | 419 | unsigned long nr_taken = 0; |
421 | struct page *page; | 420 | struct page *page; |
422 | unsigned long scan; | 421 | unsigned long scan; |
423 | LIST_HEAD(pc_list); | 422 | LIST_HEAD(pc_list); |
424 | struct list_head *src; | 423 | struct list_head *src; |
425 | struct page_cgroup *pc, *tmp; | 424 | struct page_cgroup *pc, *tmp; |
426 | int nid = z->zone_pgdat->node_id; | 425 | int nid = z->zone_pgdat->node_id; |
427 | int zid = zone_idx(z); | 426 | int zid = zone_idx(z); |
428 | struct mem_cgroup_per_zone *mz; | 427 | struct mem_cgroup_per_zone *mz; |
429 | int lru = LRU_FILE * !!file + !!active; | 428 | int lru = LRU_FILE * !!file + !!active; |
430 | 429 | ||
431 | BUG_ON(!mem_cont); | 430 | BUG_ON(!mem_cont); |
432 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | 431 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); |
433 | src = &mz->lists[lru]; | 432 | src = &mz->lists[lru]; |
434 | 433 | ||
435 | spin_lock(&mz->lru_lock); | 434 | spin_lock(&mz->lru_lock); |
436 | scan = 0; | 435 | scan = 0; |
437 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { | 436 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { |
438 | if (scan >= nr_to_scan) | 437 | if (scan >= nr_to_scan) |
439 | break; | 438 | break; |
440 | if (unlikely(!PageCgroupUsed(pc))) | 439 | if (unlikely(!PageCgroupUsed(pc))) |
441 | continue; | 440 | continue; |
442 | page = pc->page; | 441 | page = pc->page; |
443 | 442 | ||
444 | if (unlikely(!PageLRU(page))) | 443 | if (unlikely(!PageLRU(page))) |
445 | continue; | 444 | continue; |
446 | 445 | ||
447 | /* | 446 | /* |
448 | * TODO: play better with lumpy reclaim, grabbing anything. | 447 | * TODO: play better with lumpy reclaim, grabbing anything. |
449 | */ | 448 | */ |
450 | if (PageUnevictable(page) || | 449 | if (PageUnevictable(page) || |
451 | (PageActive(page) && !active) || | 450 | (PageActive(page) && !active) || |
452 | (!PageActive(page) && active)) { | 451 | (!PageActive(page) && active)) { |
453 | __mem_cgroup_move_lists(pc, page_lru(page)); | 452 | __mem_cgroup_move_lists(pc, page_lru(page)); |
454 | continue; | 453 | continue; |
455 | } | 454 | } |
456 | 455 | ||
457 | scan++; | 456 | scan++; |
458 | list_move(&pc->lru, &pc_list); | 457 | list_move(&pc->lru, &pc_list); |
459 | 458 | ||
460 | if (__isolate_lru_page(page, mode, file) == 0) { | 459 | if (__isolate_lru_page(page, mode, file) == 0) { |
461 | list_move(&page->lru, dst); | 460 | list_move(&page->lru, dst); |
462 | nr_taken++; | 461 | nr_taken++; |
463 | } | 462 | } |
464 | } | 463 | } |
465 | 464 | ||
466 | list_splice(&pc_list, src); | 465 | list_splice(&pc_list, src); |
467 | spin_unlock(&mz->lru_lock); | 466 | spin_unlock(&mz->lru_lock); |
468 | 467 | ||
469 | *scanned = scan; | 468 | *scanned = scan; |
470 | return nr_taken; | 469 | return nr_taken; |
471 | } | 470 | } |
472 | 471 | ||
473 | /* | 472 | /* |
474 | * Unlike exported interface, "oom" parameter is added. if oom==true, | 473 | * Unlike exported interface, "oom" parameter is added. if oom==true, |
475 | * oom-killer can be invoked. | 474 | * oom-killer can be invoked. |
476 | */ | 475 | */ |
477 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 476 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
478 | gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) | 477 | gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) |
479 | { | 478 | { |
480 | struct mem_cgroup *mem; | 479 | struct mem_cgroup *mem; |
481 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 480 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
482 | /* | 481 | /* |
483 | * We always charge the cgroup the mm_struct belongs to. | 482 | * We always charge the cgroup the mm_struct belongs to. |
484 | * The mm_struct's mem_cgroup changes on task migration if the | 483 | * The mm_struct's mem_cgroup changes on task migration if the |
485 | * thread group leader migrates. It's possible that mm is not | 484 | * thread group leader migrates. It's possible that mm is not |
486 | * set, if so charge the init_mm (happens for pagecache usage). | 485 | * set, if so charge the init_mm (happens for pagecache usage). |
487 | */ | 486 | */ |
488 | if (likely(!*memcg)) { | 487 | if (likely(!*memcg)) { |
489 | rcu_read_lock(); | 488 | rcu_read_lock(); |
490 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 489 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
491 | if (unlikely(!mem)) { | 490 | if (unlikely(!mem)) { |
492 | rcu_read_unlock(); | 491 | rcu_read_unlock(); |
493 | return 0; | 492 | return 0; |
494 | } | 493 | } |
495 | /* | 494 | /* |
496 | * For every charge from the cgroup, increment reference count | 495 | * For every charge from the cgroup, increment reference count |
497 | */ | 496 | */ |
498 | css_get(&mem->css); | 497 | css_get(&mem->css); |
499 | *memcg = mem; | 498 | *memcg = mem; |
500 | rcu_read_unlock(); | 499 | rcu_read_unlock(); |
501 | } else { | 500 | } else { |
502 | mem = *memcg; | 501 | mem = *memcg; |
503 | css_get(&mem->css); | 502 | css_get(&mem->css); |
504 | } | 503 | } |
505 | 504 | ||
506 | 505 | ||
507 | while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) { | 506 | while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) { |
508 | if (!(gfp_mask & __GFP_WAIT)) | 507 | if (!(gfp_mask & __GFP_WAIT)) |
509 | goto nomem; | 508 | goto nomem; |
510 | 509 | ||
511 | if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) | 510 | if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) |
512 | continue; | 511 | continue; |
513 | 512 | ||
514 | /* | 513 | /* |
515 | * try_to_free_mem_cgroup_pages() might not give us a full | 514 | * try_to_free_mem_cgroup_pages() might not give us a full |
516 | * picture of reclaim. Some pages are reclaimed and might be | 515 | * picture of reclaim. Some pages are reclaimed and might be |
517 | * moved to swap cache or just unmapped from the cgroup. | 516 | * moved to swap cache or just unmapped from the cgroup. |
518 | * Check the limit again to see if the reclaim reduced the | 517 | * Check the limit again to see if the reclaim reduced the |
519 | * current usage of the cgroup before giving up | 518 | * current usage of the cgroup before giving up |
520 | */ | 519 | */ |
521 | if (res_counter_check_under_limit(&mem->res)) | 520 | if (res_counter_check_under_limit(&mem->res)) |
522 | continue; | 521 | continue; |
523 | 522 | ||
524 | if (!nr_retries--) { | 523 | if (!nr_retries--) { |
525 | if (oom) | 524 | if (oom) |
526 | mem_cgroup_out_of_memory(mem, gfp_mask); | 525 | mem_cgroup_out_of_memory(mem, gfp_mask); |
527 | goto nomem; | 526 | goto nomem; |
528 | } | 527 | } |
529 | } | 528 | } |
530 | return 0; | 529 | return 0; |
531 | nomem: | 530 | nomem: |
532 | css_put(&mem->css); | 531 | css_put(&mem->css); |
533 | return -ENOMEM; | 532 | return -ENOMEM; |
534 | } | 533 | } |
535 | 534 | ||
536 | /** | 535 | /** |
537 | * mem_cgroup_try_charge - get charge of PAGE_SIZE. | 536 | * mem_cgroup_try_charge - get charge of PAGE_SIZE. |
538 | * @mm: an mm_struct which is charged against. (when *memcg is NULL) | 537 | * @mm: an mm_struct which is charged against. (when *memcg is NULL) |
539 | * @gfp_mask: gfp_mask for reclaim. | 538 | * @gfp_mask: gfp_mask for reclaim. |
540 | * @memcg: a pointer to memory cgroup which is charged against. | 539 | * @memcg: a pointer to memory cgroup which is charged against. |
541 | * | 540 | * |
542 | * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated | 541 | * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated |
543 | * memory cgroup from @mm is got and stored in *memcg. | 542 | * memory cgroup from @mm is got and stored in *memcg. |
544 | * | 543 | * |
545 | * Returns 0 if success. -ENOMEM at failure. | 544 | * Returns 0 if success. -ENOMEM at failure. |
546 | * This call can invoke OOM-Killer. | 545 | * This call can invoke OOM-Killer. |
547 | */ | 546 | */ |
548 | 547 | ||
549 | int mem_cgroup_try_charge(struct mm_struct *mm, | 548 | int mem_cgroup_try_charge(struct mm_struct *mm, |
550 | gfp_t mask, struct mem_cgroup **memcg) | 549 | gfp_t mask, struct mem_cgroup **memcg) |
551 | { | 550 | { |
552 | return __mem_cgroup_try_charge(mm, mask, memcg, true); | 551 | return __mem_cgroup_try_charge(mm, mask, memcg, true); |
553 | } | 552 | } |
554 | 553 | ||
555 | /* | 554 | /* |
556 | * commit a charge got by mem_cgroup_try_charge() and makes page_cgroup to be | 555 | * commit a charge got by mem_cgroup_try_charge() and makes page_cgroup to be |
557 | * USED state. If already USED, uncharge and return. | 556 | * USED state. If already USED, uncharge and return. |
558 | */ | 557 | */ |
559 | 558 | ||
560 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | 559 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, |
561 | struct page_cgroup *pc, | 560 | struct page_cgroup *pc, |
562 | enum charge_type ctype) | 561 | enum charge_type ctype) |
563 | { | 562 | { |
564 | struct mem_cgroup_per_zone *mz; | 563 | struct mem_cgroup_per_zone *mz; |
565 | unsigned long flags; | 564 | unsigned long flags; |
566 | 565 | ||
567 | /* try_charge() can return NULL to *memcg, taking care of it. */ | 566 | /* try_charge() can return NULL to *memcg, taking care of it. */ |
568 | if (!mem) | 567 | if (!mem) |
569 | return; | 568 | return; |
570 | 569 | ||
571 | lock_page_cgroup(pc); | 570 | lock_page_cgroup(pc); |
572 | if (unlikely(PageCgroupUsed(pc))) { | 571 | if (unlikely(PageCgroupUsed(pc))) { |
573 | unlock_page_cgroup(pc); | 572 | unlock_page_cgroup(pc); |
574 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 573 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
575 | css_put(&mem->css); | 574 | css_put(&mem->css); |
576 | return; | 575 | return; |
577 | } | 576 | } |
578 | pc->mem_cgroup = mem; | 577 | pc->mem_cgroup = mem; |
579 | /* | 578 | /* |
580 | * If a page is accounted as a page cache, insert to inactive list. | 579 | * If a page is accounted as a page cache, insert to inactive list. |
581 | * If anon, insert to active list. | 580 | * If anon, insert to active list. |
582 | */ | 581 | */ |
583 | pc->flags = pcg_default_flags[ctype]; | 582 | pc->flags = pcg_default_flags[ctype]; |
584 | 583 | ||
585 | mz = page_cgroup_zoneinfo(pc); | 584 | mz = page_cgroup_zoneinfo(pc); |
586 | 585 | ||
587 | spin_lock_irqsave(&mz->lru_lock, flags); | 586 | spin_lock_irqsave(&mz->lru_lock, flags); |
588 | __mem_cgroup_add_list(mz, pc, true); | 587 | __mem_cgroup_add_list(mz, pc, true); |
589 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 588 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
590 | unlock_page_cgroup(pc); | 589 | unlock_page_cgroup(pc); |
591 | } | 590 | } |
592 | 591 | ||
593 | /** | 592 | /** |
594 | * mem_cgroup_move_account - move account of the page | 593 | * mem_cgroup_move_account - move account of the page |
595 | * @pc: page_cgroup of the page. | 594 | * @pc: page_cgroup of the page. |
596 | * @from: mem_cgroup which the page is moved from. | 595 | * @from: mem_cgroup which the page is moved from. |
597 | * @to: mem_cgroup which the page is moved to. @from != @to. | 596 | * @to: mem_cgroup which the page is moved to. @from != @to. |
598 | * | 597 | * |
599 | * The caller must confirm following. | 598 | * The caller must confirm following. |
600 | * 1. disable irq. | 599 | * 1. disable irq. |
601 | * 2. lru_lock of old mem_cgroup(@from) should be held. | 600 | * 2. lru_lock of old mem_cgroup(@from) should be held. |
602 | * | 601 | * |
603 | * returns 0 at success, | 602 | * returns 0 at success, |
604 | * returns -EBUSY when lock is busy or "pc" is unstable. | 603 | * returns -EBUSY when lock is busy or "pc" is unstable. |
605 | * | 604 | * |
606 | * This function does "uncharge" from old cgroup but doesn't do "charge" to | 605 | * This function does "uncharge" from old cgroup but doesn't do "charge" to |
607 | * new cgroup. It should be done by a caller. | 606 | * new cgroup. It should be done by a caller. |
608 | */ | 607 | */ |
609 | 608 | ||
610 | static int mem_cgroup_move_account(struct page_cgroup *pc, | 609 | static int mem_cgroup_move_account(struct page_cgroup *pc, |
611 | struct mem_cgroup *from, struct mem_cgroup *to) | 610 | struct mem_cgroup *from, struct mem_cgroup *to) |
612 | { | 611 | { |
613 | struct mem_cgroup_per_zone *from_mz, *to_mz; | 612 | struct mem_cgroup_per_zone *from_mz, *to_mz; |
614 | int nid, zid; | 613 | int nid, zid; |
615 | int ret = -EBUSY; | 614 | int ret = -EBUSY; |
616 | 615 | ||
617 | VM_BUG_ON(!irqs_disabled()); | 616 | VM_BUG_ON(!irqs_disabled()); |
618 | VM_BUG_ON(from == to); | 617 | VM_BUG_ON(from == to); |
619 | 618 | ||
620 | nid = page_cgroup_nid(pc); | 619 | nid = page_cgroup_nid(pc); |
621 | zid = page_cgroup_zid(pc); | 620 | zid = page_cgroup_zid(pc); |
622 | from_mz = mem_cgroup_zoneinfo(from, nid, zid); | 621 | from_mz = mem_cgroup_zoneinfo(from, nid, zid); |
623 | to_mz = mem_cgroup_zoneinfo(to, nid, zid); | 622 | to_mz = mem_cgroup_zoneinfo(to, nid, zid); |
624 | 623 | ||
625 | 624 | ||
626 | if (!trylock_page_cgroup(pc)) | 625 | if (!trylock_page_cgroup(pc)) |
627 | return ret; | 626 | return ret; |
628 | 627 | ||
629 | if (!PageCgroupUsed(pc)) | 628 | if (!PageCgroupUsed(pc)) |
630 | goto out; | 629 | goto out; |
631 | 630 | ||
632 | if (pc->mem_cgroup != from) | 631 | if (pc->mem_cgroup != from) |
633 | goto out; | 632 | goto out; |
634 | 633 | ||
635 | if (spin_trylock(&to_mz->lru_lock)) { | 634 | if (spin_trylock(&to_mz->lru_lock)) { |
636 | __mem_cgroup_remove_list(from_mz, pc); | 635 | __mem_cgroup_remove_list(from_mz, pc); |
637 | css_put(&from->css); | 636 | css_put(&from->css); |
638 | res_counter_uncharge(&from->res, PAGE_SIZE); | 637 | res_counter_uncharge(&from->res, PAGE_SIZE); |
639 | pc->mem_cgroup = to; | 638 | pc->mem_cgroup = to; |
640 | css_get(&to->css); | 639 | css_get(&to->css); |
641 | __mem_cgroup_add_list(to_mz, pc, false); | 640 | __mem_cgroup_add_list(to_mz, pc, false); |
642 | ret = 0; | 641 | ret = 0; |
643 | spin_unlock(&to_mz->lru_lock); | 642 | spin_unlock(&to_mz->lru_lock); |
644 | } | 643 | } |
645 | out: | 644 | out: |
646 | unlock_page_cgroup(pc); | 645 | unlock_page_cgroup(pc); |
647 | return ret; | 646 | return ret; |
648 | } | 647 | } |
649 | 648 | ||
650 | /* | 649 | /* |
651 | * move charges to its parent. | 650 | * move charges to its parent. |
652 | */ | 651 | */ |
653 | 652 | ||
654 | static int mem_cgroup_move_parent(struct page_cgroup *pc, | 653 | static int mem_cgroup_move_parent(struct page_cgroup *pc, |
655 | struct mem_cgroup *child, | 654 | struct mem_cgroup *child, |
656 | gfp_t gfp_mask) | 655 | gfp_t gfp_mask) |
657 | { | 656 | { |
658 | struct cgroup *cg = child->css.cgroup; | 657 | struct cgroup *cg = child->css.cgroup; |
659 | struct cgroup *pcg = cg->parent; | 658 | struct cgroup *pcg = cg->parent; |
660 | struct mem_cgroup *parent; | 659 | struct mem_cgroup *parent; |
661 | struct mem_cgroup_per_zone *mz; | 660 | struct mem_cgroup_per_zone *mz; |
662 | unsigned long flags; | 661 | unsigned long flags; |
663 | int ret; | 662 | int ret; |
664 | 663 | ||
665 | /* Is ROOT ? */ | 664 | /* Is ROOT ? */ |
666 | if (!pcg) | 665 | if (!pcg) |
667 | return -EINVAL; | 666 | return -EINVAL; |
668 | 667 | ||
669 | parent = mem_cgroup_from_cont(pcg); | 668 | parent = mem_cgroup_from_cont(pcg); |
670 | 669 | ||
671 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); | 670 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); |
672 | if (ret) | 671 | if (ret) |
673 | return ret; | 672 | return ret; |
674 | 673 | ||
675 | mz = mem_cgroup_zoneinfo(child, | 674 | mz = mem_cgroup_zoneinfo(child, |
676 | page_cgroup_nid(pc), page_cgroup_zid(pc)); | 675 | page_cgroup_nid(pc), page_cgroup_zid(pc)); |
677 | 676 | ||
678 | spin_lock_irqsave(&mz->lru_lock, flags); | 677 | spin_lock_irqsave(&mz->lru_lock, flags); |
679 | ret = mem_cgroup_move_account(pc, child, parent); | 678 | ret = mem_cgroup_move_account(pc, child, parent); |
680 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 679 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
681 | 680 | ||
682 | /* drop extra refcnt */ | 681 | /* drop extra refcnt */ |
683 | css_put(&parent->css); | 682 | css_put(&parent->css); |
684 | /* uncharge if move fails */ | 683 | /* uncharge if move fails */ |
685 | if (ret) | 684 | if (ret) |
686 | res_counter_uncharge(&parent->res, PAGE_SIZE); | 685 | res_counter_uncharge(&parent->res, PAGE_SIZE); |
687 | 686 | ||
688 | return ret; | 687 | return ret; |
689 | } | 688 | } |
690 | 689 | ||
691 | /* | 690 | /* |
692 | * Charge the memory controller for page usage. | 691 | * Charge the memory controller for page usage. |
693 | * Return | 692 | * Return |
694 | * 0 if the charge was successful | 693 | * 0 if the charge was successful |
695 | * < 0 if the cgroup is over its limit | 694 | * < 0 if the cgroup is over its limit |
696 | */ | 695 | */ |
697 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | 696 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, |
698 | gfp_t gfp_mask, enum charge_type ctype, | 697 | gfp_t gfp_mask, enum charge_type ctype, |
699 | struct mem_cgroup *memcg) | 698 | struct mem_cgroup *memcg) |
700 | { | 699 | { |
701 | struct mem_cgroup *mem; | 700 | struct mem_cgroup *mem; |
702 | struct page_cgroup *pc; | 701 | struct page_cgroup *pc; |
703 | int ret; | 702 | int ret; |
704 | 703 | ||
705 | pc = lookup_page_cgroup(page); | 704 | pc = lookup_page_cgroup(page); |
706 | /* can happen at boot */ | 705 | /* can happen at boot */ |
707 | if (unlikely(!pc)) | 706 | if (unlikely(!pc)) |
708 | return 0; | 707 | return 0; |
709 | prefetchw(pc); | 708 | prefetchw(pc); |
710 | 709 | ||
711 | mem = memcg; | 710 | mem = memcg; |
712 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); | 711 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); |
713 | if (ret) | 712 | if (ret) |
714 | return ret; | 713 | return ret; |
715 | 714 | ||
716 | __mem_cgroup_commit_charge(mem, pc, ctype); | 715 | __mem_cgroup_commit_charge(mem, pc, ctype); |
717 | return 0; | 716 | return 0; |
718 | } | 717 | } |
719 | 718 | ||
720 | int mem_cgroup_newpage_charge(struct page *page, | 719 | int mem_cgroup_newpage_charge(struct page *page, |
721 | struct mm_struct *mm, gfp_t gfp_mask) | 720 | struct mm_struct *mm, gfp_t gfp_mask) |
722 | { | 721 | { |
723 | if (mem_cgroup_subsys.disabled) | 722 | if (mem_cgroup_subsys.disabled) |
724 | return 0; | 723 | return 0; |
725 | if (PageCompound(page)) | 724 | if (PageCompound(page)) |
726 | return 0; | 725 | return 0; |
727 | /* | 726 | /* |
728 | * If already mapped, we don't have to account. | 727 | * If already mapped, we don't have to account. |
729 | * If page cache, page->mapping has address_space. | 728 | * If page cache, page->mapping has address_space. |
730 | * But page->mapping may have out-of-use anon_vma pointer, | 729 | * But page->mapping may have out-of-use anon_vma pointer, |
731 | * detecit it by PageAnon() check. newly-mapped-anon's page->mapping | 730 | * detecit it by PageAnon() check. newly-mapped-anon's page->mapping |
732 | * is NULL. | 731 | * is NULL. |
733 | */ | 732 | */ |
734 | if (page_mapped(page) || (page->mapping && !PageAnon(page))) | 733 | if (page_mapped(page) || (page->mapping && !PageAnon(page))) |
735 | return 0; | 734 | return 0; |
736 | if (unlikely(!mm)) | 735 | if (unlikely(!mm)) |
737 | mm = &init_mm; | 736 | mm = &init_mm; |
738 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 737 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
739 | MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); | 738 | MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); |
740 | } | 739 | } |
741 | 740 | ||
742 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 741 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
743 | gfp_t gfp_mask) | 742 | gfp_t gfp_mask) |
744 | { | 743 | { |
745 | if (mem_cgroup_subsys.disabled) | 744 | if (mem_cgroup_subsys.disabled) |
746 | return 0; | 745 | return 0; |
747 | if (PageCompound(page)) | 746 | if (PageCompound(page)) |
748 | return 0; | 747 | return 0; |
749 | /* | 748 | /* |
750 | * Corner case handling. This is called from add_to_page_cache() | 749 | * Corner case handling. This is called from add_to_page_cache() |
751 | * in usual. But some FS (shmem) precharges this page before calling it | 750 | * in usual. But some FS (shmem) precharges this page before calling it |
752 | * and call add_to_page_cache() with GFP_NOWAIT. | 751 | * and call add_to_page_cache() with GFP_NOWAIT. |
753 | * | 752 | * |
754 | * For GFP_NOWAIT case, the page may be pre-charged before calling | 753 | * For GFP_NOWAIT case, the page may be pre-charged before calling |
755 | * add_to_page_cache(). (See shmem.c) check it here and avoid to call | 754 | * add_to_page_cache(). (See shmem.c) check it here and avoid to call |
756 | * charge twice. (It works but has to pay a bit larger cost.) | 755 | * charge twice. (It works but has to pay a bit larger cost.) |
757 | */ | 756 | */ |
758 | if (!(gfp_mask & __GFP_WAIT)) { | 757 | if (!(gfp_mask & __GFP_WAIT)) { |
759 | struct page_cgroup *pc; | 758 | struct page_cgroup *pc; |
760 | 759 | ||
761 | 760 | ||
762 | pc = lookup_page_cgroup(page); | 761 | pc = lookup_page_cgroup(page); |
763 | if (!pc) | 762 | if (!pc) |
764 | return 0; | 763 | return 0; |
765 | lock_page_cgroup(pc); | 764 | lock_page_cgroup(pc); |
766 | if (PageCgroupUsed(pc)) { | 765 | if (PageCgroupUsed(pc)) { |
767 | unlock_page_cgroup(pc); | 766 | unlock_page_cgroup(pc); |
768 | return 0; | 767 | return 0; |
769 | } | 768 | } |
770 | unlock_page_cgroup(pc); | 769 | unlock_page_cgroup(pc); |
771 | } | 770 | } |
772 | 771 | ||
773 | if (unlikely(!mm)) | 772 | if (unlikely(!mm)) |
774 | mm = &init_mm; | 773 | mm = &init_mm; |
775 | 774 | ||
776 | if (page_is_file_cache(page)) | 775 | if (page_is_file_cache(page)) |
777 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 776 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
778 | MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); | 777 | MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); |
779 | else | 778 | else |
780 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 779 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
781 | MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); | 780 | MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); |
782 | } | 781 | } |
783 | 782 | ||
784 | void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) | 783 | void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) |
785 | { | 784 | { |
786 | struct page_cgroup *pc; | 785 | struct page_cgroup *pc; |
787 | 786 | ||
788 | if (mem_cgroup_subsys.disabled) | 787 | if (mem_cgroup_subsys.disabled) |
789 | return; | 788 | return; |
790 | if (!ptr) | 789 | if (!ptr) |
791 | return; | 790 | return; |
792 | pc = lookup_page_cgroup(page); | 791 | pc = lookup_page_cgroup(page); |
793 | __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED); | 792 | __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED); |
794 | } | 793 | } |
795 | 794 | ||
796 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | 795 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) |
797 | { | 796 | { |
798 | if (mem_cgroup_subsys.disabled) | 797 | if (mem_cgroup_subsys.disabled) |
799 | return; | 798 | return; |
800 | if (!mem) | 799 | if (!mem) |
801 | return; | 800 | return; |
802 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 801 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
803 | css_put(&mem->css); | 802 | css_put(&mem->css); |
804 | } | 803 | } |
805 | 804 | ||
806 | 805 | ||
807 | /* | 806 | /* |
808 | * uncharge if !page_mapped(page) | 807 | * uncharge if !page_mapped(page) |
809 | */ | 808 | */ |
810 | static void | 809 | static void |
811 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | 810 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) |
812 | { | 811 | { |
813 | struct page_cgroup *pc; | 812 | struct page_cgroup *pc; |
814 | struct mem_cgroup *mem; | 813 | struct mem_cgroup *mem; |
815 | struct mem_cgroup_per_zone *mz; | 814 | struct mem_cgroup_per_zone *mz; |
816 | unsigned long flags; | 815 | unsigned long flags; |
817 | 816 | ||
818 | if (mem_cgroup_subsys.disabled) | 817 | if (mem_cgroup_subsys.disabled) |
819 | return; | 818 | return; |
820 | 819 | ||
821 | /* | 820 | /* |
822 | * Check if our page_cgroup is valid | 821 | * Check if our page_cgroup is valid |
823 | */ | 822 | */ |
824 | pc = lookup_page_cgroup(page); | 823 | pc = lookup_page_cgroup(page); |
825 | if (unlikely(!pc || !PageCgroupUsed(pc))) | 824 | if (unlikely(!pc || !PageCgroupUsed(pc))) |
826 | return; | 825 | return; |
827 | 826 | ||
828 | lock_page_cgroup(pc); | 827 | lock_page_cgroup(pc); |
829 | if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page)) | 828 | if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page)) |
830 | || !PageCgroupUsed(pc)) { | 829 | || !PageCgroupUsed(pc)) { |
831 | /* This happens at race in zap_pte_range() and do_swap_page()*/ | 830 | /* This happens at race in zap_pte_range() and do_swap_page()*/ |
832 | unlock_page_cgroup(pc); | 831 | unlock_page_cgroup(pc); |
833 | return; | 832 | return; |
834 | } | 833 | } |
835 | ClearPageCgroupUsed(pc); | 834 | ClearPageCgroupUsed(pc); |
836 | mem = pc->mem_cgroup; | 835 | mem = pc->mem_cgroup; |
837 | 836 | ||
838 | mz = page_cgroup_zoneinfo(pc); | 837 | mz = page_cgroup_zoneinfo(pc); |
839 | spin_lock_irqsave(&mz->lru_lock, flags); | 838 | spin_lock_irqsave(&mz->lru_lock, flags); |
840 | __mem_cgroup_remove_list(mz, pc); | 839 | __mem_cgroup_remove_list(mz, pc); |
841 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 840 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
842 | unlock_page_cgroup(pc); | 841 | unlock_page_cgroup(pc); |
843 | 842 | ||
844 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 843 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
845 | css_put(&mem->css); | 844 | css_put(&mem->css); |
846 | 845 | ||
847 | return; | 846 | return; |
848 | } | 847 | } |
849 | 848 | ||
850 | void mem_cgroup_uncharge_page(struct page *page) | 849 | void mem_cgroup_uncharge_page(struct page *page) |
851 | { | 850 | { |
852 | /* early check. */ | 851 | /* early check. */ |
853 | if (page_mapped(page)) | 852 | if (page_mapped(page)) |
854 | return; | 853 | return; |
855 | if (page->mapping && !PageAnon(page)) | 854 | if (page->mapping && !PageAnon(page)) |
856 | return; | 855 | return; |
857 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); | 856 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); |
858 | } | 857 | } |
859 | 858 | ||
860 | void mem_cgroup_uncharge_cache_page(struct page *page) | 859 | void mem_cgroup_uncharge_cache_page(struct page *page) |
861 | { | 860 | { |
862 | VM_BUG_ON(page_mapped(page)); | 861 | VM_BUG_ON(page_mapped(page)); |
863 | VM_BUG_ON(page->mapping); | 862 | VM_BUG_ON(page->mapping); |
864 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | 863 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); |
865 | } | 864 | } |
866 | 865 | ||
867 | /* | 866 | /* |
868 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old | 867 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old |
869 | * page belongs to. | 868 | * page belongs to. |
870 | */ | 869 | */ |
871 | int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | 870 | int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) |
872 | { | 871 | { |
873 | struct page_cgroup *pc; | 872 | struct page_cgroup *pc; |
874 | struct mem_cgroup *mem = NULL; | 873 | struct mem_cgroup *mem = NULL; |
875 | int ret = 0; | 874 | int ret = 0; |
876 | 875 | ||
877 | if (mem_cgroup_subsys.disabled) | 876 | if (mem_cgroup_subsys.disabled) |
878 | return 0; | 877 | return 0; |
879 | 878 | ||
880 | pc = lookup_page_cgroup(page); | 879 | pc = lookup_page_cgroup(page); |
881 | lock_page_cgroup(pc); | 880 | lock_page_cgroup(pc); |
882 | if (PageCgroupUsed(pc)) { | 881 | if (PageCgroupUsed(pc)) { |
883 | mem = pc->mem_cgroup; | 882 | mem = pc->mem_cgroup; |
884 | css_get(&mem->css); | 883 | css_get(&mem->css); |
885 | } | 884 | } |
886 | unlock_page_cgroup(pc); | 885 | unlock_page_cgroup(pc); |
887 | 886 | ||
888 | if (mem) { | 887 | if (mem) { |
889 | ret = mem_cgroup_try_charge(NULL, GFP_HIGHUSER_MOVABLE, &mem); | 888 | ret = mem_cgroup_try_charge(NULL, GFP_HIGHUSER_MOVABLE, &mem); |
890 | css_put(&mem->css); | 889 | css_put(&mem->css); |
891 | } | 890 | } |
892 | *ptr = mem; | 891 | *ptr = mem; |
893 | return ret; | 892 | return ret; |
894 | } | 893 | } |
895 | 894 | ||
896 | /* remove redundant charge if migration failed*/ | 895 | /* remove redundant charge if migration failed*/ |
897 | void mem_cgroup_end_migration(struct mem_cgroup *mem, | 896 | void mem_cgroup_end_migration(struct mem_cgroup *mem, |
898 | struct page *oldpage, struct page *newpage) | 897 | struct page *oldpage, struct page *newpage) |
899 | { | 898 | { |
900 | struct page *target, *unused; | 899 | struct page *target, *unused; |
901 | struct page_cgroup *pc; | 900 | struct page_cgroup *pc; |
902 | enum charge_type ctype; | 901 | enum charge_type ctype; |
903 | 902 | ||
904 | if (!mem) | 903 | if (!mem) |
905 | return; | 904 | return; |
906 | 905 | ||
907 | /* at migration success, oldpage->mapping is NULL. */ | 906 | /* at migration success, oldpage->mapping is NULL. */ |
908 | if (oldpage->mapping) { | 907 | if (oldpage->mapping) { |
909 | target = oldpage; | 908 | target = oldpage; |
910 | unused = NULL; | 909 | unused = NULL; |
911 | } else { | 910 | } else { |
912 | target = newpage; | 911 | target = newpage; |
913 | unused = oldpage; | 912 | unused = oldpage; |
914 | } | 913 | } |
915 | 914 | ||
916 | if (PageAnon(target)) | 915 | if (PageAnon(target)) |
917 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | 916 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; |
918 | else if (page_is_file_cache(target)) | 917 | else if (page_is_file_cache(target)) |
919 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | 918 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; |
920 | else | 919 | else |
921 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 920 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; |
922 | 921 | ||
923 | /* unused page is not on radix-tree now. */ | 922 | /* unused page is not on radix-tree now. */ |
924 | if (unused && ctype != MEM_CGROUP_CHARGE_TYPE_MAPPED) | 923 | if (unused && ctype != MEM_CGROUP_CHARGE_TYPE_MAPPED) |
925 | __mem_cgroup_uncharge_common(unused, ctype); | 924 | __mem_cgroup_uncharge_common(unused, ctype); |
926 | 925 | ||
927 | pc = lookup_page_cgroup(target); | 926 | pc = lookup_page_cgroup(target); |
928 | /* | 927 | /* |
929 | * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. | 928 | * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. |
930 | * So, double-counting is effectively avoided. | 929 | * So, double-counting is effectively avoided. |
931 | */ | 930 | */ |
932 | __mem_cgroup_commit_charge(mem, pc, ctype); | 931 | __mem_cgroup_commit_charge(mem, pc, ctype); |
933 | 932 | ||
934 | /* | 933 | /* |
935 | * Both of oldpage and newpage are still under lock_page(). | 934 | * Both of oldpage and newpage are still under lock_page(). |
936 | * Then, we don't have to care about race in radix-tree. | 935 | * Then, we don't have to care about race in radix-tree. |
937 | * But we have to be careful that this page is unmapped or not. | 936 | * But we have to be careful that this page is unmapped or not. |
938 | * | 937 | * |
939 | * There is a case for !page_mapped(). At the start of | 938 | * There is a case for !page_mapped(). At the start of |
940 | * migration, oldpage was mapped. But now, it's zapped. | 939 | * migration, oldpage was mapped. But now, it's zapped. |
941 | * But we know *target* page is not freed/reused under us. | 940 | * But we know *target* page is not freed/reused under us. |
942 | * mem_cgroup_uncharge_page() does all necessary checks. | 941 | * mem_cgroup_uncharge_page() does all necessary checks. |
943 | */ | 942 | */ |
944 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) | 943 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) |
945 | mem_cgroup_uncharge_page(target); | 944 | mem_cgroup_uncharge_page(target); |
946 | } | 945 | } |
947 | 946 | ||
948 | /* | 947 | /* |
949 | * A call to try to shrink memory usage under specified resource controller. | 948 | * A call to try to shrink memory usage under specified resource controller. |
950 | * This is typically used for page reclaiming for shmem for reducing side | 949 | * This is typically used for page reclaiming for shmem for reducing side |
951 | * effect of page allocation from shmem, which is used by some mem_cgroup. | 950 | * effect of page allocation from shmem, which is used by some mem_cgroup. |
952 | */ | 951 | */ |
953 | int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) | 952 | int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) |
954 | { | 953 | { |
955 | struct mem_cgroup *mem; | 954 | struct mem_cgroup *mem; |
956 | int progress = 0; | 955 | int progress = 0; |
957 | int retry = MEM_CGROUP_RECLAIM_RETRIES; | 956 | int retry = MEM_CGROUP_RECLAIM_RETRIES; |
958 | 957 | ||
959 | if (mem_cgroup_subsys.disabled) | 958 | if (mem_cgroup_subsys.disabled) |
960 | return 0; | 959 | return 0; |
961 | if (!mm) | 960 | if (!mm) |
962 | return 0; | 961 | return 0; |
963 | 962 | ||
964 | rcu_read_lock(); | 963 | rcu_read_lock(); |
965 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 964 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
966 | if (unlikely(!mem)) { | 965 | if (unlikely(!mem)) { |
967 | rcu_read_unlock(); | 966 | rcu_read_unlock(); |
968 | return 0; | 967 | return 0; |
969 | } | 968 | } |
970 | css_get(&mem->css); | 969 | css_get(&mem->css); |
971 | rcu_read_unlock(); | 970 | rcu_read_unlock(); |
972 | 971 | ||
973 | do { | 972 | do { |
974 | progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); | 973 | progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); |
975 | progress += res_counter_check_under_limit(&mem->res); | 974 | progress += res_counter_check_under_limit(&mem->res); |
976 | } while (!progress && --retry); | 975 | } while (!progress && --retry); |
977 | 976 | ||
978 | css_put(&mem->css); | 977 | css_put(&mem->css); |
979 | if (!retry) | 978 | if (!retry) |
980 | return -ENOMEM; | 979 | return -ENOMEM; |
981 | return 0; | 980 | return 0; |
982 | } | 981 | } |
983 | 982 | ||
984 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | 983 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, |
985 | unsigned long long val) | 984 | unsigned long long val) |
986 | { | 985 | { |
987 | 986 | ||
988 | int retry_count = MEM_CGROUP_RECLAIM_RETRIES; | 987 | int retry_count = MEM_CGROUP_RECLAIM_RETRIES; |
989 | int progress; | 988 | int progress; |
990 | int ret = 0; | 989 | int ret = 0; |
991 | 990 | ||
992 | while (res_counter_set_limit(&memcg->res, val)) { | 991 | while (res_counter_set_limit(&memcg->res, val)) { |
993 | if (signal_pending(current)) { | 992 | if (signal_pending(current)) { |
994 | ret = -EINTR; | 993 | ret = -EINTR; |
995 | break; | 994 | break; |
996 | } | 995 | } |
997 | if (!retry_count) { | 996 | if (!retry_count) { |
998 | ret = -EBUSY; | 997 | ret = -EBUSY; |
999 | break; | 998 | break; |
1000 | } | 999 | } |
1001 | progress = try_to_free_mem_cgroup_pages(memcg, | 1000 | progress = try_to_free_mem_cgroup_pages(memcg, |
1002 | GFP_HIGHUSER_MOVABLE); | 1001 | GFP_HIGHUSER_MOVABLE); |
1003 | if (!progress) | 1002 | if (!progress) |
1004 | retry_count--; | 1003 | retry_count--; |
1005 | } | 1004 | } |
1006 | return ret; | 1005 | return ret; |
1007 | } | 1006 | } |
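/*
 * Editorial sketch (not part of the commit): mem_cgroup_resize_limit()
 * above keeps retrying res_counter_set_limit(), reclaiming pages between
 * attempts, and only spends one of its retries when a reclaim pass makes
 * no progress (the signal_pending()/-EINTR early exit is left out here).
 * A self-contained userspace model of that bounded-retry shape; every
 * name and number below is illustrative only.
 */
#include <stdio.h>

static unsigned long usage = 8;		/* pages currently charged (pretend) */

static int try_set_limit(unsigned long val)
{
	return usage <= val;		/* kernel: fails while usage > new limit */
}

static int reclaim_some(void)
{
	if (!usage)
		return 0;
	usage--;			/* stands in for try_to_free_mem_cgroup_pages() */
	return 1;
}

int main(void)
{
	int retry = 5;			/* stands in for MEM_CGROUP_RECLAIM_RETRIES */
	unsigned long new_limit = 4;

	while (!try_set_limit(new_limit)) {
		if (!retry) {
			puts("-EBUSY");	/* gave up, like the loop above */
			return 1;
		}
		if (!reclaim_some())
			retry--;	/* no progress: spend a retry */
	}
	printf("limit lowered, usage is now %lu pages\n", usage);
	return 0;
}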
1008 | 1007 | ||
1009 | 1008 | ||
1010 | /* | 1009 | /* |
1011 | * This routine traverse page_cgroup in given list and drop them all. | 1010 | * This routine traverse page_cgroup in given list and drop them all. |
1012 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 1011 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. |
1013 | */ | 1012 | */ |
1014 | static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | 1013 | static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, |
1015 | struct mem_cgroup_per_zone *mz, | 1014 | struct mem_cgroup_per_zone *mz, |
1016 | enum lru_list lru) | 1015 | enum lru_list lru) |
1017 | { | 1016 | { |
1018 | struct page_cgroup *pc, *busy; | 1017 | struct page_cgroup *pc, *busy; |
1019 | unsigned long flags; | 1018 | unsigned long flags; |
1020 | unsigned long loop; | 1019 | unsigned long loop; |
1021 | struct list_head *list; | 1020 | struct list_head *list; |
1022 | int ret = 0; | 1021 | int ret = 0; |
1023 | 1022 | ||
1024 | list = &mz->lists[lru]; | 1023 | list = &mz->lists[lru]; |
1025 | 1024 | ||
1026 | loop = MEM_CGROUP_ZSTAT(mz, lru); | 1025 | loop = MEM_CGROUP_ZSTAT(mz, lru); |
1027 | /* give some margin against EBUSY etc...*/ | 1026 | /* give some margin against EBUSY etc...*/ |
1028 | loop += 256; | 1027 | loop += 256; |
1029 | busy = NULL; | 1028 | busy = NULL; |
1030 | while (loop--) { | 1029 | while (loop--) { |
1031 | ret = 0; | 1030 | ret = 0; |
1032 | spin_lock_irqsave(&mz->lru_lock, flags); | 1031 | spin_lock_irqsave(&mz->lru_lock, flags); |
1033 | if (list_empty(list)) { | 1032 | if (list_empty(list)) { |
1034 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 1033 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
1035 | break; | 1034 | break; |
1036 | } | 1035 | } |
1037 | pc = list_entry(list->prev, struct page_cgroup, lru); | 1036 | pc = list_entry(list->prev, struct page_cgroup, lru); |
1038 | if (busy == pc) { | 1037 | if (busy == pc) { |
1039 | list_move(&pc->lru, list); | 1038 | list_move(&pc->lru, list); |
1040 | busy = 0; | 1039 | busy = 0; |
1041 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 1040 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
1042 | continue; | 1041 | continue; |
1043 | } | 1042 | } |
1044 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 1043 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
1045 | 1044 | ||
1046 | ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE); | 1045 | ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE); |
1047 | if (ret == -ENOMEM) | 1046 | if (ret == -ENOMEM) |
1048 | break; | 1047 | break; |
1049 | 1048 | ||
1050 | if (ret == -EBUSY || ret == -EINVAL) { | 1049 | if (ret == -EBUSY || ret == -EINVAL) { |
1051 | /* found lock contention or "pc" is obsolete. */ | 1050 | /* found lock contention or "pc" is obsolete. */ |
1052 | busy = pc; | 1051 | busy = pc; |
1053 | cond_resched(); | 1052 | cond_resched(); |
1054 | } else | 1053 | } else |
1055 | busy = NULL; | 1054 | busy = NULL; |
1056 | } | 1055 | } |
1057 | if (!ret && !list_empty(list)) | 1056 | if (!ret && !list_empty(list)) |
1058 | return -EBUSY; | 1057 | return -EBUSY; |
1059 | return ret; | 1058 | return ret; |
1060 | } | 1059 | } |
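/*
 * Editorial sketch (not part of the commit): mem_cgroup_force_empty_list()
 * above always takes the entry at the tail of the LRU list; if that entry
 * is the same one that just failed with -EBUSY/-EINVAL, it is rotated to
 * the head of the list so the scan can make progress on the rest, and the
 * whole walk is bounded by a generous margin (the "loop += 256" above).
 * A tiny userspace model of that "remember the busy one and rotate it
 * away" pattern, using an illustrative array instead of the kernel's
 * list_head:
 */
#include <stdio.h>

#define N 4

int main(void)
{
	int pages[N] = { 1, 2, 3, 4 };	/* page 3 pretends to be busy once */
	int len = N, busy = 0, failed_once = 0;
	int loop = N + 2;		/* margin, like "loop += 256" above */

	while (loop-- && len) {
		int pc = pages[len - 1];		/* tail entry */

		if (busy == pc) {
			/* rotate the busy entry to the head and retry */
			for (int i = len - 1; i > 0; i--)
				pages[i] = pages[i - 1];
			pages[0] = pc;
			busy = 0;
			continue;
		}
		if (pc == 3 && !failed_once) {		/* simulate one -EBUSY */
			failed_once = 1;
			busy = pc;
			continue;
		}
		printf("moved page %d to parent\n", pc);
		len--;					/* entry left the list */
	}
	return 0;
}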
1061 | 1060 | ||
1062 | /* | 1061 | /* |
1063 | * make mem_cgroup's charge to be 0 if there is no task. | 1062 | * make mem_cgroup's charge to be 0 if there is no task. |
1064 | * This enables deleting this mem_cgroup. | 1063 | * This enables deleting this mem_cgroup. |
1065 | */ | 1064 | */ |
1066 | static int mem_cgroup_force_empty(struct mem_cgroup *mem) | 1065 | static int mem_cgroup_force_empty(struct mem_cgroup *mem) |
1067 | { | 1066 | { |
1068 | int ret; | 1067 | int ret; |
1069 | int node, zid, shrink; | 1068 | int node, zid, shrink; |
1070 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1069 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1071 | 1070 | ||
1072 | css_get(&mem->css); | 1071 | css_get(&mem->css); |
1073 | 1072 | ||
1074 | shrink = 0; | 1073 | shrink = 0; |
1075 | move_account: | 1074 | move_account: |
1076 | while (mem->res.usage > 0) { | 1075 | while (mem->res.usage > 0) { |
1077 | ret = -EBUSY; | 1076 | ret = -EBUSY; |
1078 | if (atomic_read(&mem->css.cgroup->count) > 0) | 1077 | if (atomic_read(&mem->css.cgroup->count) > 0) |
1079 | goto out; | 1078 | goto out; |
1080 | 1079 | ||
1081 | /* This is for making all *used* pages to be on LRU. */ | 1080 | /* This is for making all *used* pages to be on LRU. */ |
1082 | lru_add_drain_all(); | 1081 | lru_add_drain_all(); |
1083 | ret = 0; | 1082 | ret = 0; |
1084 | for_each_node_state(node, N_POSSIBLE) { | 1083 | for_each_node_state(node, N_POSSIBLE) { |
1085 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 1084 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
1086 | struct mem_cgroup_per_zone *mz; | 1085 | struct mem_cgroup_per_zone *mz; |
1087 | enum lru_list l; | 1086 | enum lru_list l; |
1088 | mz = mem_cgroup_zoneinfo(mem, node, zid); | 1087 | mz = mem_cgroup_zoneinfo(mem, node, zid); |
1089 | for_each_lru(l) { | 1088 | for_each_lru(l) { |
1090 | ret = mem_cgroup_force_empty_list(mem, | 1089 | ret = mem_cgroup_force_empty_list(mem, |
1091 | mz, l); | 1090 | mz, l); |
1092 | if (ret) | 1091 | if (ret) |
1093 | break; | 1092 | break; |
1094 | } | 1093 | } |
1095 | } | 1094 | } |
1096 | if (ret) | 1095 | if (ret) |
1097 | break; | 1096 | break; |
1098 | } | 1097 | } |
1099 | /* it seems parent cgroup doesn't have enough mem */ | 1098 | /* it seems parent cgroup doesn't have enough mem */ |
1100 | if (ret == -ENOMEM) | 1099 | if (ret == -ENOMEM) |
1101 | goto try_to_free; | 1100 | goto try_to_free; |
1102 | cond_resched(); | 1101 | cond_resched(); |
1103 | } | 1102 | } |
1104 | ret = 0; | 1103 | ret = 0; |
1105 | out: | 1104 | out: |
1106 | css_put(&mem->css); | 1105 | css_put(&mem->css); |
1107 | return ret; | 1106 | return ret; |
1108 | 1107 | ||
1109 | try_to_free: | 1108 | try_to_free: |
1110 | /* returns EBUSY if we come here twice. */ | 1109 | /* returns EBUSY if we come here twice. */ |
1111 | if (shrink) { | 1110 | if (shrink) { |
1112 | ret = -EBUSY; | 1111 | ret = -EBUSY; |
1113 | goto out; | 1112 | goto out; |
1114 | } | 1113 | } |
1115 | /* try to free all pages in this cgroup */ | 1114 | /* try to free all pages in this cgroup */ |
1116 | shrink = 1; | 1115 | shrink = 1; |
1117 | while (nr_retries && mem->res.usage > 0) { | 1116 | while (nr_retries && mem->res.usage > 0) { |
1118 | int progress; | 1117 | int progress; |
1119 | progress = try_to_free_mem_cgroup_pages(mem, | 1118 | progress = try_to_free_mem_cgroup_pages(mem, |
1120 | GFP_HIGHUSER_MOVABLE); | 1119 | GFP_HIGHUSER_MOVABLE); |
1121 | if (!progress) | 1120 | if (!progress) |
1122 | nr_retries--; | 1121 | nr_retries--; |
1123 | 1122 | ||
1124 | } | 1123 | } |
1125 | /* try move_account...there may be some *locked* pages. */ | 1124 | /* try move_account...there may be some *locked* pages. */ |
1126 | if (mem->res.usage) | 1125 | if (mem->res.usage) |
1127 | goto move_account; | 1126 | goto move_account; |
1128 | ret = 0; | 1127 | ret = 0; |
1129 | goto out; | 1128 | goto out; |
1130 | } | 1129 | } |
1131 | 1130 | ||
1132 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 1131 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
1133 | { | 1132 | { |
1134 | return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, | 1133 | return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, |
1135 | cft->private); | 1134 | cft->private); |
1136 | } | 1135 | } |
1137 | /* | 1136 | /* |
1138 | * The user of this function is... | 1137 | * The user of this function is... |
1139 | * RES_LIMIT. | 1138 | * RES_LIMIT. |
1140 | */ | 1139 | */ |
1141 | static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | 1140 | static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, |
1142 | const char *buffer) | 1141 | const char *buffer) |
1143 | { | 1142 | { |
1144 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 1143 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
1145 | unsigned long long val; | 1144 | unsigned long long val; |
1146 | int ret; | 1145 | int ret; |
1147 | 1146 | ||
1148 | switch (cft->private) { | 1147 | switch (cft->private) { |
1149 | case RES_LIMIT: | 1148 | case RES_LIMIT: |
1150 | /* This function does all necessary parse...reuse it */ | 1149 | /* This function does all necessary parse...reuse it */ |
1151 | ret = res_counter_memparse_write_strategy(buffer, &val); | 1150 | ret = res_counter_memparse_write_strategy(buffer, &val); |
1152 | if (!ret) | 1151 | if (!ret) |
1153 | ret = mem_cgroup_resize_limit(memcg, val); | 1152 | ret = mem_cgroup_resize_limit(memcg, val); |
1154 | break; | 1153 | break; |
1155 | default: | 1154 | default: |
1156 | ret = -EINVAL; /* should be BUG() ? */ | 1155 | ret = -EINVAL; /* should be BUG() ? */ |
1157 | break; | 1156 | break; |
1158 | } | 1157 | } |
1159 | return ret; | 1158 | return ret; |
1160 | } | 1159 | } |
1161 | 1160 | ||
1162 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | 1161 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) |
1163 | { | 1162 | { |
1164 | struct mem_cgroup *mem; | 1163 | struct mem_cgroup *mem; |
1165 | 1164 | ||
1166 | mem = mem_cgroup_from_cont(cont); | 1165 | mem = mem_cgroup_from_cont(cont); |
1167 | switch (event) { | 1166 | switch (event) { |
1168 | case RES_MAX_USAGE: | 1167 | case RES_MAX_USAGE: |
1169 | res_counter_reset_max(&mem->res); | 1168 | res_counter_reset_max(&mem->res); |
1170 | break; | 1169 | break; |
1171 | case RES_FAILCNT: | 1170 | case RES_FAILCNT: |
1172 | res_counter_reset_failcnt(&mem->res); | 1171 | res_counter_reset_failcnt(&mem->res); |
1173 | break; | 1172 | break; |
1174 | } | 1173 | } |
1175 | return 0; | 1174 | return 0; |
1176 | } | 1175 | } |
1177 | 1176 | ||
1178 | static const struct mem_cgroup_stat_desc { | 1177 | static const struct mem_cgroup_stat_desc { |
1179 | const char *msg; | 1178 | const char *msg; |
1180 | u64 unit; | 1179 | u64 unit; |
1181 | } mem_cgroup_stat_desc[] = { | 1180 | } mem_cgroup_stat_desc[] = { |
1182 | [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, | 1181 | [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, |
1183 | [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, | 1182 | [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, |
1184 | [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, }, | 1183 | [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, }, |
1185 | [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, }, | 1184 | [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, }, |
1186 | }; | 1185 | }; |
1187 | 1186 | ||
1188 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | 1187 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, |
1189 | struct cgroup_map_cb *cb) | 1188 | struct cgroup_map_cb *cb) |
1190 | { | 1189 | { |
1191 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); | 1190 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); |
1192 | struct mem_cgroup_stat *stat = &mem_cont->stat; | 1191 | struct mem_cgroup_stat *stat = &mem_cont->stat; |
1193 | int i; | 1192 | int i; |
1194 | 1193 | ||
1195 | for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) { | 1194 | for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) { |
1196 | s64 val; | 1195 | s64 val; |
1197 | 1196 | ||
1198 | val = mem_cgroup_read_stat(stat, i); | 1197 | val = mem_cgroup_read_stat(stat, i); |
1199 | val *= mem_cgroup_stat_desc[i].unit; | 1198 | val *= mem_cgroup_stat_desc[i].unit; |
1200 | cb->fill(cb, mem_cgroup_stat_desc[i].msg, val); | 1199 | cb->fill(cb, mem_cgroup_stat_desc[i].msg, val); |
1201 | } | 1200 | } |
1202 | /* showing # of active pages */ | 1201 | /* showing # of active pages */ |
1203 | { | 1202 | { |
1204 | unsigned long active_anon, inactive_anon; | 1203 | unsigned long active_anon, inactive_anon; |
1205 | unsigned long active_file, inactive_file; | 1204 | unsigned long active_file, inactive_file; |
1206 | unsigned long unevictable; | 1205 | unsigned long unevictable; |
1207 | 1206 | ||
1208 | inactive_anon = mem_cgroup_get_all_zonestat(mem_cont, | 1207 | inactive_anon = mem_cgroup_get_all_zonestat(mem_cont, |
1209 | LRU_INACTIVE_ANON); | 1208 | LRU_INACTIVE_ANON); |
1210 | active_anon = mem_cgroup_get_all_zonestat(mem_cont, | 1209 | active_anon = mem_cgroup_get_all_zonestat(mem_cont, |
1211 | LRU_ACTIVE_ANON); | 1210 | LRU_ACTIVE_ANON); |
1212 | inactive_file = mem_cgroup_get_all_zonestat(mem_cont, | 1211 | inactive_file = mem_cgroup_get_all_zonestat(mem_cont, |
1213 | LRU_INACTIVE_FILE); | 1212 | LRU_INACTIVE_FILE); |
1214 | active_file = mem_cgroup_get_all_zonestat(mem_cont, | 1213 | active_file = mem_cgroup_get_all_zonestat(mem_cont, |
1215 | LRU_ACTIVE_FILE); | 1214 | LRU_ACTIVE_FILE); |
1216 | unevictable = mem_cgroup_get_all_zonestat(mem_cont, | 1215 | unevictable = mem_cgroup_get_all_zonestat(mem_cont, |
1217 | LRU_UNEVICTABLE); | 1216 | LRU_UNEVICTABLE); |
1218 | 1217 | ||
1219 | cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE); | 1218 | cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE); |
1220 | cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE); | 1219 | cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE); |
1221 | cb->fill(cb, "active_file", (active_file) * PAGE_SIZE); | 1220 | cb->fill(cb, "active_file", (active_file) * PAGE_SIZE); |
1222 | cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE); | 1221 | cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE); |
1223 | cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); | 1222 | cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); |
1224 | 1223 | ||
1225 | } | 1224 | } |
1226 | return 0; | 1225 | return 0; |
1227 | } | 1226 | } |
1228 | 1227 | ||
1229 | static struct cftype mem_cgroup_files[] = { | 1228 | static struct cftype mem_cgroup_files[] = { |
1230 | { | 1229 | { |
1231 | .name = "usage_in_bytes", | 1230 | .name = "usage_in_bytes", |
1232 | .private = RES_USAGE, | 1231 | .private = RES_USAGE, |
1233 | .read_u64 = mem_cgroup_read, | 1232 | .read_u64 = mem_cgroup_read, |
1234 | }, | 1233 | }, |
1235 | { | 1234 | { |
1236 | .name = "max_usage_in_bytes", | 1235 | .name = "max_usage_in_bytes", |
1237 | .private = RES_MAX_USAGE, | 1236 | .private = RES_MAX_USAGE, |
1238 | .trigger = mem_cgroup_reset, | 1237 | .trigger = mem_cgroup_reset, |
1239 | .read_u64 = mem_cgroup_read, | 1238 | .read_u64 = mem_cgroup_read, |
1240 | }, | 1239 | }, |
1241 | { | 1240 | { |
1242 | .name = "limit_in_bytes", | 1241 | .name = "limit_in_bytes", |
1243 | .private = RES_LIMIT, | 1242 | .private = RES_LIMIT, |
1244 | .write_string = mem_cgroup_write, | 1243 | .write_string = mem_cgroup_write, |
1245 | .read_u64 = mem_cgroup_read, | 1244 | .read_u64 = mem_cgroup_read, |
1246 | }, | 1245 | }, |
1247 | { | 1246 | { |
1248 | .name = "failcnt", | 1247 | .name = "failcnt", |
1249 | .private = RES_FAILCNT, | 1248 | .private = RES_FAILCNT, |
1250 | .trigger = mem_cgroup_reset, | 1249 | .trigger = mem_cgroup_reset, |
1251 | .read_u64 = mem_cgroup_read, | 1250 | .read_u64 = mem_cgroup_read, |
1252 | }, | 1251 | }, |
1253 | { | 1252 | { |
1254 | .name = "stat", | 1253 | .name = "stat", |
1255 | .read_map = mem_control_stat_show, | 1254 | .read_map = mem_control_stat_show, |
1256 | }, | 1255 | }, |
1257 | }; | 1256 | }; |
1258 | 1257 | ||
1259 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | 1258 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) |
1260 | { | 1259 | { |
1261 | struct mem_cgroup_per_node *pn; | 1260 | struct mem_cgroup_per_node *pn; |
1262 | struct mem_cgroup_per_zone *mz; | 1261 | struct mem_cgroup_per_zone *mz; |
1263 | enum lru_list l; | 1262 | enum lru_list l; |
1264 | int zone, tmp = node; | 1263 | int zone, tmp = node; |
1265 | /* | 1264 | /* |
1266 | * This routine is called against possible nodes. | 1265 | * This routine is called against possible nodes. |
1267 | * But it's BUG to call kmalloc() against offline node. | 1266 | * But it's BUG to call kmalloc() against offline node. |
1268 | * | 1267 | * |
1269 | * TODO: this routine can waste much memory for nodes which will | 1268 | * TODO: this routine can waste much memory for nodes which will |
1270 | * never be onlined. It's better to use memory hotplug callback | 1269 | * never be onlined. It's better to use memory hotplug callback |
1271 | * function. | 1270 | * function. |
1272 | */ | 1271 | */ |
1273 | if (!node_state(node, N_NORMAL_MEMORY)) | 1272 | if (!node_state(node, N_NORMAL_MEMORY)) |
1274 | tmp = -1; | 1273 | tmp = -1; |
1275 | pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); | 1274 | pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); |
1276 | if (!pn) | 1275 | if (!pn) |
1277 | return 1; | 1276 | return 1; |
1278 | 1277 | ||
1279 | mem->info.nodeinfo[node] = pn; | 1278 | mem->info.nodeinfo[node] = pn; |
1280 | memset(pn, 0, sizeof(*pn)); | 1279 | memset(pn, 0, sizeof(*pn)); |
1281 | 1280 | ||
1282 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 1281 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
1283 | mz = &pn->zoneinfo[zone]; | 1282 | mz = &pn->zoneinfo[zone]; |
1284 | spin_lock_init(&mz->lru_lock); | 1283 | spin_lock_init(&mz->lru_lock); |
1285 | for_each_lru(l) | 1284 | for_each_lru(l) |
1286 | INIT_LIST_HEAD(&mz->lists[l]); | 1285 | INIT_LIST_HEAD(&mz->lists[l]); |
1287 | } | 1286 | } |
1288 | return 0; | 1287 | return 0; |
1289 | } | 1288 | } |
1290 | 1289 | ||
1291 | static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | 1290 | static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) |
1292 | { | 1291 | { |
1293 | kfree(mem->info.nodeinfo[node]); | 1292 | kfree(mem->info.nodeinfo[node]); |
1294 | } | 1293 | } |
1295 | 1294 | ||
1295 | static int mem_cgroup_size(void) | ||
1296 | { | ||
1297 | int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu); | ||
1298 | return sizeof(struct mem_cgroup) + cpustat_size; | ||
1299 | } | ||
1300 | |||
1296 | static struct mem_cgroup *mem_cgroup_alloc(void) | 1301 | static struct mem_cgroup *mem_cgroup_alloc(void) |
1297 | { | 1302 | { |
1298 | struct mem_cgroup *mem; | 1303 | struct mem_cgroup *mem; |
1304 | int size = mem_cgroup_size(); | ||
1299 | 1305 | ||
1300 | if (sizeof(*mem) < PAGE_SIZE) | 1306 | if (size < PAGE_SIZE) |
1301 | mem = kmalloc(sizeof(*mem), GFP_KERNEL); | 1307 | mem = kmalloc(size, GFP_KERNEL); |
1302 | else | 1308 | else |
1303 | mem = vmalloc(sizeof(*mem)); | 1309 | mem = vmalloc(size); |
1304 | 1310 | ||
1305 | if (mem) | 1311 | if (mem) |
1306 | memset(mem, 0, sizeof(*mem)); | 1312 | memset(mem, 0, size); |
1307 | return mem; | 1313 | return mem; |
1308 | } | 1314 | } |
1309 | 1315 | ||
1310 | static void mem_cgroup_free(struct mem_cgroup *mem) | 1316 | static void mem_cgroup_free(struct mem_cgroup *mem) |
1311 | { | 1317 | { |
1312 | if (sizeof(*mem) < PAGE_SIZE) | 1318 | if (mem_cgroup_size() < PAGE_SIZE) |
1313 | kfree(mem); | 1319 | kfree(mem); |
1314 | else | 1320 | else |
1315 | vfree(mem); | 1321 | vfree(mem); |
1316 | } | 1322 | } |
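/*
 * Editorial sketch (not part of the commit): mem_cgroup_size() above sizes
 * the allocation as sizeof(struct mem_cgroup) plus one mem_cgroup_stat_cpu
 * slot per possible CPU id (nr_cpu_ids), rather than the compile-time
 * NR_CPUS maximum; nr_cpu_ids is never larger than NR_CPUS and is usually
 * much smaller.  A self-contained userspace model of the saving, with a
 * made-up structure layout and made-up CPU counts purely for illustration:
 */
#include <stdio.h>
#include <stdlib.h>

#define FAKE_NR_CPUS	4096	/* illustrative compile-time maximum */
#define STAT_ITEMS	4

struct fake_stat_cpu {
	long count[STAT_ITEMS];
};

struct fake_memcg {
	long res_usage;				/* stands in for the res_counter etc. */
	struct fake_stat_cpu cpustat[];		/* per-cpu stats as a flexible tail */
};

static size_t memcg_size(int cpus)
{
	return sizeof(struct fake_memcg) +
	       cpus * sizeof(struct fake_stat_cpu);
}

int main(void)
{
	int nr_cpu_ids = 8;		/* CPUs actually possible on this box */
	size_t old_size = memcg_size(FAKE_NR_CPUS);
	size_t new_size = memcg_size(nr_cpu_ids);
	struct fake_memcg *mem = calloc(1, new_size);	/* like kmalloc + memset */

	if (!mem)
		return 1;
	printf("NR_CPUS-based: %zu bytes, nr_cpu_ids-based: %zu bytes\n",
	       old_size, new_size);
	free(mem);
	return 0;
}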
1317 | 1323 | ||
1318 | 1324 | ||
1319 | static struct cgroup_subsys_state * | 1325 | static struct cgroup_subsys_state * |
1320 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | 1326 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) |
1321 | { | 1327 | { |
1322 | struct mem_cgroup *mem; | 1328 | struct mem_cgroup *mem; |
1323 | int node; | 1329 | int node; |
1324 | 1330 | ||
1325 | if (unlikely((cont->parent) == NULL)) { | 1331 | mem = mem_cgroup_alloc(); |
1326 | mem = &init_mem_cgroup; | 1332 | if (!mem) |
1327 | } else { | 1333 | return ERR_PTR(-ENOMEM); |
1328 | mem = mem_cgroup_alloc(); | ||
1329 | if (!mem) | ||
1330 | return ERR_PTR(-ENOMEM); | ||
1331 | } | ||
1332 | 1334 | ||
1333 | res_counter_init(&mem->res); | 1335 | res_counter_init(&mem->res); |
1334 | 1336 | ||
1335 | for_each_node_state(node, N_POSSIBLE) | 1337 | for_each_node_state(node, N_POSSIBLE) |
1336 | if (alloc_mem_cgroup_per_zone_info(mem, node)) | 1338 | if (alloc_mem_cgroup_per_zone_info(mem, node)) |
1337 | goto free_out; | 1339 | goto free_out; |
1338 | 1340 | ||
1339 | return &mem->css; | 1341 | return &mem->css; |
1340 | free_out: | 1342 | free_out: |
1341 | for_each_node_state(node, N_POSSIBLE) | 1343 | for_each_node_state(node, N_POSSIBLE) |
1342 | free_mem_cgroup_per_zone_info(mem, node); | 1344 | free_mem_cgroup_per_zone_info(mem, node); |
1343 | if (cont->parent != NULL) | 1345 | mem_cgroup_free(mem); |
1344 | mem_cgroup_free(mem); | ||
1345 | return ERR_PTR(-ENOMEM); | 1346 | return ERR_PTR(-ENOMEM); |
1346 | } | 1347 | } |
1347 | 1348 | ||
1348 | static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, | 1349 | static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, |
1349 | struct cgroup *cont) | 1350 | struct cgroup *cont) |
1350 | { | 1351 | { |
1351 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 1352 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
1352 | mem_cgroup_force_empty(mem); | 1353 | mem_cgroup_force_empty(mem); |
1353 | } | 1354 | } |
1354 | 1355 | ||
1355 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, | 1356 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, |
1356 | struct cgroup *cont) | 1357 | struct cgroup *cont) |
1357 | { | 1358 | { |
1358 | int node; | 1359 | int node; |
1359 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 1360 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
1360 | 1361 | ||
1361 | for_each_node_state(node, N_POSSIBLE) | 1362 | for_each_node_state(node, N_POSSIBLE) |
1362 | free_mem_cgroup_per_zone_info(mem, node); | 1363 | free_mem_cgroup_per_zone_info(mem, node); |
1363 | 1364 | ||
1364 | mem_cgroup_free(mem_cgroup_from_cont(cont)); | 1365 | mem_cgroup_free(mem_cgroup_from_cont(cont)); |
1365 | } | 1366 | } |
1366 | 1367 | ||
1367 | static int mem_cgroup_populate(struct cgroup_subsys *ss, | 1368 | static int mem_cgroup_populate(struct cgroup_subsys *ss, |
1368 | struct cgroup *cont) | 1369 | struct cgroup *cont) |
1369 | { | 1370 | { |
1370 | return cgroup_add_files(cont, ss, mem_cgroup_files, | 1371 | return cgroup_add_files(cont, ss, mem_cgroup_files, |
1371 | ARRAY_SIZE(mem_cgroup_files)); | 1372 | ARRAY_SIZE(mem_cgroup_files)); |
1372 | } | 1373 | } |
1373 | 1374 | ||
1374 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 1375 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
1375 | struct cgroup *cont, | 1376 | struct cgroup *cont, |
1376 | struct cgroup *old_cont, | 1377 | struct cgroup *old_cont, |
1377 | struct task_struct *p) | 1378 | struct task_struct *p) |
1378 | { | 1379 | { |
1379 | struct mm_struct *mm; | 1380 | struct mm_struct *mm; |
1380 | struct mem_cgroup *mem, *old_mem; | 1381 | struct mem_cgroup *mem, *old_mem; |
1381 | 1382 | ||
1382 | mm = get_task_mm(p); | 1383 | mm = get_task_mm(p); |
1383 | if (mm == NULL) | 1384 | if (mm == NULL) |
1384 | return; | 1385 | return; |
1385 | 1386 | ||
1386 | mem = mem_cgroup_from_cont(cont); | 1387 | mem = mem_cgroup_from_cont(cont); |
1387 | old_mem = mem_cgroup_from_cont(old_cont); | 1388 | old_mem = mem_cgroup_from_cont(old_cont); |
1388 | 1389 | ||
1389 | /* | 1390 | /* |
1390 | * Only thread group leaders are allowed to migrate, the mm_struct is | 1391 | * Only thread group leaders are allowed to migrate, the mm_struct is |
1391 | * in effect owned by the leader | 1392 | * in effect owned by the leader |
1392 | */ | 1393 | */ |
1393 | if (!thread_group_leader(p)) | 1394 | if (!thread_group_leader(p)) |
1394 | goto out; | 1395 | goto out; |
1395 | 1396 | ||
1396 | out: | 1397 | out: |
1397 | mmput(mm); | 1398 | mmput(mm); |
1398 | } | 1399 | } |
1399 | 1400 | ||
1400 | struct cgroup_subsys mem_cgroup_subsys = { | 1401 | struct cgroup_subsys mem_cgroup_subsys = { |
1401 | .name = "memory", | 1402 | .name = "memory", |
1402 | .subsys_id = mem_cgroup_subsys_id, | 1403 | .subsys_id = mem_cgroup_subsys_id, |
1403 | .create = mem_cgroup_create, | 1404 | .create = mem_cgroup_create, |
1404 | .pre_destroy = mem_cgroup_pre_destroy, | 1405 | .pre_destroy = mem_cgroup_pre_destroy, |