Commit 8521fc50d433507a7cdc96bec280f9e5888a54cc

Authored by Michal Hocko
Committed by Linus Torvalds
1 parent 3e92041d68

memcg: get rid of percpu_charge_mutex lock

percpu_charge_mutex protects against multiple simultaneous drainings of
the per-cpu charge caches, because without it we might end up queuing too
many work items.  At least this was the case until commit 26fe61684449
("memcg: fix percpu cached charge draining frequency"), which introduced
more targeted draining for the async mode.

Now that the sync draining is targeted as well, we can safely remove the
mutex, because we will never queue more work items than the current
number of CPUs.  FLUSHING_CACHED_CHARGE protects against queuing the same
work item multiple times, and the stock->nr_pages == 0 check protects
against pointlessly queuing work when there is obviously nothing to be
done.  This is of course racy, but we can live with it because the race
window is really small (we would have to see FLUSHING_CACHED_CHARGE
cleared while nr_pages is still non-zero).

The only remaining place where we can race is the synchronous mode, where
we rely on the FLUSHING_CACHED_CHARGE test; the bit might have been set by
another drainer on the same group, but we should wait in that case as
well.
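
As an illustration of the resulting scheme, here is a minimal sketch of a
sync-capable drain loop without the mutex.  It is not the exact code from
this patch: the memcg hierarchy walk is elided, and the names used here
(memcg_stock, struct memcg_stock_pcp, drain_local_stock) simply follow the
conventions of the surrounding memcontrol.c code.

	static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
	{
		int cpu, curcpu;

		get_online_cpus();
		curcpu = get_cpu();
		for_each_online_cpu(cpu) {
			struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);

			/*
			 * Nothing cached for root_mem on this cpu, so there is
			 * nothing to drain.  (The real code also walks the
			 * hierarchy instead of this exact-match check.)
			 */
			if (stock->cached != root_mem || !stock->nr_pages)
				continue;
			/*
			 * FLUSHING_CACHED_CHARGE guarantees that a given work
			 * item is queued at most once, so at most
			 * num_online_cpus() items can ever be in flight.
			 */
			if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
				if (cpu == curcpu)
					drain_local_stock(&stock->work);
				else
					schedule_work_on(cpu, &stock->work);
			}
		}
		put_cpu();

		if (sync) {
			/*
			 * Wait for every pending drainer, including one racing
			 * on the same group that set the flag before we did.
			 */
			for_each_online_cpu(cpu) {
				struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);

				if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
					flush_work(&stock->work);
			}
		}
		put_online_cpus();
	}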

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 2 additions and 10 deletions

1	/* memcontrol.c - Memory Controller
2	 *
3	 * Copyright IBM Corporation, 2007
4	 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5	 *
6	 * Copyright 2007 OpenVZ SWsoft Inc
7	 * Author: Pavel Emelianov <xemul@openvz.org>
8	 *
9	 * Memory thresholds
10	 * Copyright (C) 2009 Nokia Corporation
11	 * Author: Kirill A. Shutemov
12	 *
13	 * This program is free software; you can redistribute it and/or modify
14	 * it under the terms of the GNU General Public License as published by
15	 * the Free Software Foundation; either version 2 of the License, or
16	 * (at your option) any later version.
17	 *
18	 * This program is distributed in the hope that it will be useful,
19	 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20	 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	 * GNU General Public License for more details.
22	 */
23
24	#include <linux/res_counter.h>
25	#include <linux/memcontrol.h>
26	#include <linux/cgroup.h>
27	#include <linux/mm.h>
28	#include <linux/hugetlb.h>
29	#include <linux/pagemap.h>
30	#include <linux/smp.h>
31	#include <linux/page-flags.h>
32	#include <linux/backing-dev.h>
33	#include <linux/bit_spinlock.h>
34	#include <linux/rcupdate.h>
35	#include <linux/limits.h>
36	#include <linux/mutex.h>
37	#include <linux/rbtree.h>
38	#include <linux/shmem_fs.h>
39	#include <linux/slab.h>
40	#include <linux/swap.h>
41	#include <linux/swapops.h>
42	#include <linux/spinlock.h>
43	#include <linux/eventfd.h>
44	#include <linux/sort.h>
45	#include <linux/fs.h>
46	#include <linux/seq_file.h>
47	#include <linux/vmalloc.h>
48	#include <linux/mm_inline.h>
49	#include <linux/page_cgroup.h>
50	#include <linux/cpu.h>
51	#include <linux/oom.h>
52	#include "internal.h"
53
54	#include <asm/uaccess.h>
55
56	#include <trace/events/vmscan.h>
57
58	struct cgroup_subsys mem_cgroup_subsys __read_mostly;
59	#define MEM_CGROUP_RECLAIM_RETRIES 5
60	struct mem_cgroup *root_mem_cgroup __read_mostly;
61
62	#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
63	/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
64	int do_swap_account __read_mostly;
65
66	/* for remember boot option*/
67	#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
68	static int really_do_swap_account __initdata = 1;
69	#else
70	static int really_do_swap_account __initdata = 0;
71	#endif
72
73	#else
74	#define do_swap_account (0)
75	#endif
76
77
78	/*
79	 * Statistics for memory cgroup.
80	 */
81	enum mem_cgroup_stat_index {
82		/*
83		 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
84		 */
85		MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
86		MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
87		MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
88		MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
89		MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
90		MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
91		MEM_CGROUP_STAT_NSTATS,
92	};
93
94	enum mem_cgroup_events_index {
95		MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
96		MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
97		MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
98		MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
99		MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
100		MEM_CGROUP_EVENTS_NSTATS,
101	};
102	/*
103	 * Per memcg event counter is incremented at every pagein/pageout. With THP,
104	 * it will be incremated by the number of pages. This counter is used for
105	 * for trigger some periodic events. This is straightforward and better
106	 * than using jiffies etc. to handle periodic memcg event.
107	 */
108	enum mem_cgroup_events_target {
109		MEM_CGROUP_TARGET_THRESH,
110		MEM_CGROUP_TARGET_SOFTLIMIT,
111		MEM_CGROUP_TARGET_NUMAINFO,
112		MEM_CGROUP_NTARGETS,
113	};
114	#define THRESHOLDS_EVENTS_TARGET (128)
115	#define SOFTLIMIT_EVENTS_TARGET (1024)
116	#define NUMAINFO_EVENTS_TARGET (1024)
117
118	struct mem_cgroup_stat_cpu {
119		long count[MEM_CGROUP_STAT_NSTATS];
120		unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
121		unsigned long targets[MEM_CGROUP_NTARGETS];
122	};
123
124	/*
125	 * per-zone information in memory controller.
126	 */
127	struct mem_cgroup_per_zone {
128		/*
129		 * spin_lock to protect the per cgroup LRU
130		 */
131		struct list_head lists[NR_LRU_LISTS];
132		unsigned long count[NR_LRU_LISTS];
133
134		struct zone_reclaim_stat reclaim_stat;
135		struct rb_node tree_node;	/* RB tree node */
136		unsigned long long usage_in_excess;/* Set to the value by which */
137						/* the soft limit is exceeded*/
138		bool on_tree;
139		struct mem_cgroup *mem;		/* Back pointer, we cannot */
140						/* use container_of */
141	};
142	/* Macro for accessing counter */
143	#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
144
145	struct mem_cgroup_per_node {
146		struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
147	};
148
149	struct mem_cgroup_lru_info {
150		struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
151	};
152
153	/*
154	 * Cgroups above their limits are maintained in a RB-Tree, independent of
155	 * their hierarchy representation
156	 */
157
158	struct mem_cgroup_tree_per_zone {
159		struct rb_root rb_root;
160		spinlock_t lock;
161	};
162
163	struct mem_cgroup_tree_per_node {
164		struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
165	};
166
167	struct mem_cgroup_tree {
168		struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
169	};
170
171	static struct mem_cgroup_tree soft_limit_tree __read_mostly;
172
173	struct mem_cgroup_threshold {
174		struct eventfd_ctx *eventfd;
175		u64 threshold;
176	};
177
178	/* For threshold */
179	struct mem_cgroup_threshold_ary {
180		/* An array index points to threshold just below usage. */
181		int current_threshold;
182		/* Size of entries[] */
183		unsigned int size;
184		/* Array of thresholds */
185		struct mem_cgroup_threshold entries[0];
186	};
187
188	struct mem_cgroup_thresholds {
189		/* Primary thresholds array */
190		struct mem_cgroup_threshold_ary *primary;
191		/*
192		 * Spare threshold array.
193		 * This is needed to make mem_cgroup_unregister_event() "never fail".
194		 * It must be able to store at least primary->size - 1 entries.
195		 */
196		struct mem_cgroup_threshold_ary *spare;
197	};
198
199	/* for OOM */
200	struct mem_cgroup_eventfd_list {
201		struct list_head list;
202		struct eventfd_ctx *eventfd;
203	};
204
205	static void mem_cgroup_threshold(struct mem_cgroup *mem);
206	static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
207
208	enum {
209		SCAN_BY_LIMIT,
210		SCAN_BY_SYSTEM,
211		NR_SCAN_CONTEXT,
212		SCAN_BY_SHRINK, /* not recorded now */
213	};
214
215	enum {
216		SCAN,
217		SCAN_ANON,
218		SCAN_FILE,
219		ROTATE,
220		ROTATE_ANON,
221		ROTATE_FILE,
222		FREED,
223		FREED_ANON,
224		FREED_FILE,
225		ELAPSED,
226		NR_SCANSTATS,
227	};
228
229	struct scanstat {
230		spinlock_t lock;
231		unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS];
232		unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS];
233	};
234
235	const char *scanstat_string[NR_SCANSTATS] = {
236		"scanned_pages",
237		"scanned_anon_pages",
238		"scanned_file_pages",
239		"rotated_pages",
240		"rotated_anon_pages",
241		"rotated_file_pages",
242		"freed_pages",
243		"freed_anon_pages",
244		"freed_file_pages",
245		"elapsed_ns",
246	};
247	#define SCANSTAT_WORD_LIMIT "_by_limit"
248	#define SCANSTAT_WORD_SYSTEM "_by_system"
249	#define SCANSTAT_WORD_HIERARCHY "_under_hierarchy"
250
251
252	/*
253	 * The memory controller data structure. The memory controller controls both
254	 * page cache and RSS per cgroup. We would eventually like to provide
255	 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
256	 * to help the administrator determine what knobs to tune.
257	 *
258	 * TODO: Add a water mark for the memory controller. Reclaim will begin when
259	 * we hit the water mark. May be even add a low water mark, such that
260	 * no reclaim occurs from a cgroup at it's low water mark, this is
261	 * a feature that will be implemented much later in the future.
262	 */
263	struct mem_cgroup {
264		struct cgroup_subsys_state css;
265		/*
266		 * the counter to account for memory usage
267		 */
268		struct res_counter res;
269		/*
270		 * the counter to account for mem+swap usage.
271		 */
272		struct res_counter memsw;
273		/*
274		 * Per cgroup active and inactive list, similar to the
275		 * per zone LRU lists.
276		 */
277		struct mem_cgroup_lru_info info;
278		/*
279		 * While reclaiming in a hierarchy, we cache the last child we
280		 * reclaimed from.
281		 */
282		int last_scanned_child;
283		int last_scanned_node;
284	#if MAX_NUMNODES > 1
285		nodemask_t scan_nodes;
286		atomic_t numainfo_events;
287		atomic_t numainfo_updating;
288	#endif
289		/*
290		 * Should the accounting and control be hierarchical, per subtree?
291		 */
292		bool use_hierarchy;
293
294		bool oom_lock;
295		atomic_t under_oom;
296
297		atomic_t refcnt;
298
299		int swappiness;
300		/* OOM-Killer disable */
301		int oom_kill_disable;
302
303		/* set when res.limit == memsw.limit */
304		bool memsw_is_minimum;
305
306		/* protect arrays of thresholds */
307		struct mutex thresholds_lock;
308
309		/* thresholds for memory usage. RCU-protected */
310		struct mem_cgroup_thresholds thresholds;
311
312		/* thresholds for mem+swap usage. RCU-protected */
313		struct mem_cgroup_thresholds memsw_thresholds;
314
315		/* For oom notifier event fd */
316		struct list_head oom_notify;
317		/* For recording LRU-scan statistics */
318		struct scanstat scanstat;
319		/*
320		 * Should we move charges of a task when a task is moved into this
321		 * mem_cgroup ? And what type of charges should we move ?
322		 */
323		unsigned long move_charge_at_immigrate;
324		/*
325		 * percpu counter.
326		 */
327		struct mem_cgroup_stat_cpu *stat;
328		/*
329		 * used when a cpu is offlined or other synchronizations
330		 * See mem_cgroup_read_stat().
331		 */
332		struct mem_cgroup_stat_cpu nocpu_base;
333		spinlock_t pcp_counter_lock;
334	};
335
336	/* Stuffs for move charges at task migration. */
337	/*
338	 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
339	 * left-shifted bitmap of these types.
340	 */
341	enum move_type {
342		MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
343		MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */
344		NR_MOVE_TYPE,
345	};
346
347	/* "mc" and its members are protected by cgroup_mutex */
348	static struct move_charge_struct {
349		spinlock_t lock; /* for from, to */
350		struct mem_cgroup *from;
351		struct mem_cgroup *to;
352		unsigned long precharge;
353		unsigned long moved_charge;
354		unsigned long moved_swap;
355		struct task_struct *moving_task; /* a task moving charges */
356		wait_queue_head_t waitq; /* a waitq for other context */
357	} mc = {
358		.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
359		.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
360	};
361
362	static bool move_anon(void)
363	{
364		return test_bit(MOVE_CHARGE_TYPE_ANON,
365				&mc.to->move_charge_at_immigrate);
366	}
367
368	static bool move_file(void)
369	{
370		return test_bit(MOVE_CHARGE_TYPE_FILE,
371				&mc.to->move_charge_at_immigrate);
372	}
373
374	/*
375	 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
376	 * limit reclaim to prevent infinite loops, if they ever occur.
377	 */
378	#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)
379	#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
380
381	enum charge_type {
382		MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
383		MEM_CGROUP_CHARGE_TYPE_MAPPED,
384		MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
385		MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
386		MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
387		MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
388		NR_CHARGE_TYPE,
389	};
390
391	/* for encoding cft->private value on file */
392	#define _MEM (0)
393	#define _MEMSWAP (1)
394	#define _OOM_TYPE (2)
395	#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
396	#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
397	#define MEMFILE_ATTR(val) ((val) & 0xffff)
398	/* Used for OOM nofiier */
399	#define OOM_CONTROL (0)
400
401	/*
402	 * Reclaim flags for mem_cgroup_hierarchical_reclaim
403	 */
404	#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
405	#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
406	#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
407	#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
408	#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
409	#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
410
411	static void mem_cgroup_get(struct mem_cgroup *mem);
412	static void mem_cgroup_put(struct mem_cgroup *mem);
413	static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
414	static void drain_all_stock_async(struct mem_cgroup *mem);
415
416	static struct mem_cgroup_per_zone *
417	mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
418	{
419		return &mem->info.nodeinfo[nid]->zoneinfo[zid];
420	}
421
422	struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
423	{
424		return &mem->css;
425	}
426
427	static struct mem_cgroup_per_zone *
428	page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
429	{
430		int nid = page_to_nid(page);
431		int zid = page_zonenum(page);
432
433		return mem_cgroup_zoneinfo(mem, nid, zid);
434	}
435
436	static struct mem_cgroup_tree_per_zone *
437	soft_limit_tree_node_zone(int nid, int zid)
438	{
439		return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
440	}
441
442	static struct mem_cgroup_tree_per_zone *
443	soft_limit_tree_from_page(struct page *page)
444	{
445		int nid = page_to_nid(page);
446		int zid = page_zonenum(page);
447
448		return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
449	}
450
451	static void
452	__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
453					struct mem_cgroup_per_zone *mz,
454					struct mem_cgroup_tree_per_zone *mctz,
455					unsigned long long new_usage_in_excess)
456	{
457		struct rb_node **p = &mctz->rb_root.rb_node;
458		struct rb_node *parent = NULL;
459		struct mem_cgroup_per_zone *mz_node;
460
461		if (mz->on_tree)
462			return;
463
464		mz->usage_in_excess = new_usage_in_excess;
465		if (!mz->usage_in_excess)
466			return;
467		while (*p) {
468			parent = *p;
469			mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
470						tree_node);
471			if (mz->usage_in_excess < mz_node->usage_in_excess)
472				p = &(*p)->rb_left;
473			/*
474			 * We can't avoid mem cgroups that are over their soft
75			 * limit by the same amount
476			 */
477			else if (mz->usage_in_excess >= mz_node->usage_in_excess)
478				p = &(*p)->rb_right;
479		}
480		rb_link_node(&mz->tree_node, parent, p);
481		rb_insert_color(&mz->tree_node, &mctz->rb_root);
482		mz->on_tree = true;
483	}
484
485	static void
486	__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
487					struct mem_cgroup_per_zone *mz,
488					struct mem_cgroup_tree_per_zone *mctz)
489	{
490		if (!mz->on_tree)
491			return;
492		rb_erase(&mz->tree_node, &mctz->rb_root);
493		mz->on_tree = false;
494	}
495
496	static void
497	mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
498					struct mem_cgroup_per_zone *mz,
499					struct mem_cgroup_tree_per_zone *mctz)
500	{
501		spin_lock(&mctz->lock);
502		__mem_cgroup_remove_exceeded(mem, mz, mctz);
503		spin_unlock(&mctz->lock);
504	}
505
506
507	static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
508	{
509		unsigned long long excess;
510		struct mem_cgroup_per_zone *mz;
511		struct mem_cgroup_tree_per_zone *mctz;
512		int nid = page_to_nid(page);
513		int zid = page_zonenum(page);
514		mctz = soft_limit_tree_from_page(page);
515
516		/*
517		 * Necessary to update all ancestors when hierarchy is used.
518		 * because their event counter is not touched.
519		 */
520		for (; mem; mem = parent_mem_cgroup(mem)) {
521			mz = mem_cgroup_zoneinfo(mem, nid, zid);
522			excess = res_counter_soft_limit_excess(&mem->res);
523			/*
524			 * We have to update the tree if mz is on RB-tree or
525			 * mem is over its softlimit.
526			 */
527			if (excess || mz->on_tree) {
528				spin_lock(&mctz->lock);
529				/* if on-tree, remove it */
530				if (mz->on_tree)
531					__mem_cgroup_remove_exceeded(mem, mz, mctz);
532				/*
533				 * Insert again. mz->usage_in_excess will be updated.
534				 * If excess is 0, no tree ops.
535				 */
536				__mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
537				spin_unlock(&mctz->lock);
538			}
539		}
540	}
541
542	static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
543	{
544		int node, zone;
545		struct mem_cgroup_per_zone *mz;
546		struct mem_cgroup_tree_per_zone *mctz;
547
548		for_each_node_state(node, N_POSSIBLE) {
549			for (zone = 0; zone < MAX_NR_ZONES; zone++) {
550				mz = mem_cgroup_zoneinfo(mem, node, zone);
551				mctz = soft_limit_tree_node_zone(node, zone);
552				mem_cgroup_remove_exceeded(mem, mz, mctz);
553			}
554		}
555	}
556
557	static struct mem_cgroup_per_zone *
558	__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
559	{
560		struct rb_node *rightmost = NULL;
561		struct mem_cgroup_per_zone *mz;
562
563	retry:
564		mz = NULL;
565		rightmost = rb_last(&mctz->rb_root);
566		if (!rightmost)
567			goto done; /* Nothing to reclaim from */
568
569		mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
570		/*
571		 * Remove the node now but someone else can add it back,
572		 * we will to add it back at the end of reclaim to its correct
573		 * position in the tree.
574		 */
575		__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
576		if (!res_counter_soft_limit_excess(&mz->mem->res) ||
577			!css_tryget(&mz->mem->css))
578			goto retry;
579	done:
580		return mz;
581	}
582
583	static struct mem_cgroup_per_zone *
584	mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
585	{
586		struct mem_cgroup_per_zone *mz;
587
588		spin_lock(&mctz->lock);
589		mz = __mem_cgroup_largest_soft_limit_node(mctz);
590		spin_unlock(&mctz->lock);
591		return mz;
592	}
593
594	/*
595	 * Implementation Note: reading percpu statistics for memcg.
596	 *
597	 * Both of vmstat[] and percpu_counter has threshold and do periodic
598	 * synchronization to implement "quick" read. There are trade-off between
599	 * reading cost and precision of value. Then, we may have a chance to implement
600	 * a periodic synchronizion of counter in memcg's counter.
601	 *
602	 * But this _read() function is used for user interface now. The user accounts
603	 * memory usage by memory cgroup and he _always_ requires exact value because
604	 * he accounts memory. Even if we provide quick-and-fuzzy read, we always
605	 * have to visit all online cpus and make sum. So, for now, unnecessary
606	 * synchronization is not implemented. (just implemented for cpu hotplug)
607	 *
608	 * If there are kernel internal actions which can make use of some not-exact
609	 * value, and reading all cpu value can be performance bottleneck in some
610	 * common workload, threashold and synchonization as vmstat[] should be
611	 * implemented.
612	 */
613	static long mem_cgroup_read_stat(struct mem_cgroup *mem,
614					 enum mem_cgroup_stat_index idx)
615	{
616		long val = 0;
617		int cpu;
618
619		get_online_cpus();
620		for_each_online_cpu(cpu)
621			val += per_cpu(mem->stat->count[idx], cpu);
622	#ifdef CONFIG_HOTPLUG_CPU
623		spin_lock(&mem->pcp_counter_lock);
624		val += mem->nocpu_base.count[idx];
625		spin_unlock(&mem->pcp_counter_lock);
626	#endif
627		put_online_cpus();
628		return val;
629	}
630
631	static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
632						 bool charge)
633	{
634		int val = (charge) ? 1 : -1;
635		this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
636	}
637
638	void mem_cgroup_pgfault(struct mem_cgroup *mem, int val)
639	{
640		this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
641	}
642
643	void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val)
644	{
645		this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
646	}
647
648	static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
649						    enum mem_cgroup_events_index idx)
650	{
651		unsigned long val = 0;
652		int cpu;
653
654		for_each_online_cpu(cpu)
655			val += per_cpu(mem->stat->events[idx], cpu);
656	#ifdef CONFIG_HOTPLUG_CPU
657		spin_lock(&mem->pcp_counter_lock);
658		val += mem->nocpu_base.events[idx];
659		spin_unlock(&mem->pcp_counter_lock);
660	#endif
661		return val;
662	}
663
664	static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
665						 bool file, int nr_pages)
666	{
667		preempt_disable();
668
669		if (file)
670			__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
671		else
672			__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
673
674		/* pagein of a big page is an event. So, ignore page size */
675		if (nr_pages > 0)
676			__this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
677		else {
678			__this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
679			nr_pages = -nr_pages; /* for event */
680		}
681
682		__this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
683
684		preempt_enable();
685	}
686
687	unsigned long
688	mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
689				unsigned int lru_mask)
690	{
691		struct mem_cgroup_per_zone *mz;
692		enum lru_list l;
693		unsigned long ret = 0;
694
695		mz = mem_cgroup_zoneinfo(mem, nid, zid);
696
697		for_each_lru(l) {
698			if (BIT(l) & lru_mask)
699				ret += MEM_CGROUP_ZSTAT(mz, l);
700		}
701		return ret;
702	}
703
704	static unsigned long
705	mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem,
706				int nid, unsigned int lru_mask)
707	{
708		u64 total = 0;
709		int zid;
710
711		for (zid = 0; zid < MAX_NR_ZONES; zid++)
712			total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask);
713
714		return total;
715	}
716
717	static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem,
718				unsigned int lru_mask)
719	{
720		int nid;
721		u64 total = 0;
722
723		for_each_node_state(nid, N_HIGH_MEMORY)
724			total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask);
725		return total;
726	}
727
728	static bool __memcg_event_check(struct mem_cgroup *mem, int target)
729	{
730		unsigned long val, next;
731
732		val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
733		next = this_cpu_read(mem->stat->targets[target]);
734		/* from time_after() in jiffies.h */
735		return ((long)next - (long)val < 0);
736	}
737
738	static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
739	{
740		unsigned long val, next;
741
742		val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
743
744		switch (target) {
745		case MEM_CGROUP_TARGET_THRESH:
746			next = val + THRESHOLDS_EVENTS_TARGET;
747			break;
748		case MEM_CGROUP_TARGET_SOFTLIMIT:
749			next = val + SOFTLIMIT_EVENTS_TARGET;
750			break;
751		case MEM_CGROUP_TARGET_NUMAINFO:
752			next = val + NUMAINFO_EVENTS_TARGET;
753			break;
754		default:
755			return;
756		}
757
758		this_cpu_write(mem->stat->targets[target], next);
759	}
760
761	/*
762	 * Check events in order.
763	 *
764	 */
765	static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
766	{
767		/* threshold event is triggered in finer grain than soft limit */
768		if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) {
769			mem_cgroup_threshold(mem);
770			__mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
771			if (unlikely(__memcg_event_check(mem,
772				     MEM_CGROUP_TARGET_SOFTLIMIT))) {
773				mem_cgroup_update_tree(mem, page);
774				__mem_cgroup_target_update(mem,
775							   MEM_CGROUP_TARGET_SOFTLIMIT);
776			}
777	#if MAX_NUMNODES > 1
778			if (unlikely(__memcg_event_check(mem,
779				     MEM_CGROUP_TARGET_NUMAINFO))) {
780				atomic_inc(&mem->numainfo_events);
781				__mem_cgroup_target_update(mem,
782							   MEM_CGROUP_TARGET_NUMAINFO);
783			}
784	#endif
785		}
786	}
787
788	static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
789	{
790		return container_of(cgroup_subsys_state(cont,
791					mem_cgroup_subsys_id), struct mem_cgroup,
792					css);
793	}
794
795	struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
796	{
797		/*
798		 * mm_update_next_owner() may clear mm->owner to NULL
799		 * if it races with swapoff, page migration, etc.
800		 * So this can be called with p == NULL.
801		 */
802		if (unlikely(!p))
803			return NULL;
804
805		return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
806					struct mem_cgroup, css);
807	}
808
809	struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
810	{
811		struct mem_cgroup *mem = NULL;
812
813		if (!mm)
814			return NULL;
815		/*
816		 * Because we have no locks, mm->owner's may be being moved to other
817		 * cgroup. We use css_tryget() here even if this looks
818		 * pessimistic (rather than adding locks here).
819		 */
820		rcu_read_lock();
821		do {
822			mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
823			if (unlikely(!mem))
824				break;
825		} while (!css_tryget(&mem->css));
826		rcu_read_unlock();
827		return mem;
828	}
829
830	/* The caller has to guarantee "mem" exists before calling this */
831	static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
832	{
833		struct cgroup_subsys_state *css;
834		int found;
835
836		if (!mem) /* ROOT cgroup has the smallest ID */
837			return root_mem_cgroup; /*css_put/get against root is ignored*/
838		if (!mem->use_hierarchy) {
839			if (css_tryget(&mem->css))
840				return mem;
841			return NULL;
842		}
843		rcu_read_lock();
844		/*
845		 * searching a memory cgroup which has the smallest ID under given
846		 * ROOT cgroup. (ID >= 1)
847		 */
848		css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
849		if (css && css_tryget(css))
850			mem = container_of(css, struct mem_cgroup, css);
851		else
852			mem = NULL;
853		rcu_read_unlock();
854		return mem;
855	}
856
857	static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
858						struct mem_cgroup *root,
859						bool cond)
860	{
861		int nextid = css_id(&iter->css) + 1;
862		int found;
863		int hierarchy_used;
864		struct cgroup_subsys_state *css;
865
866		hierarchy_used = iter->use_hierarchy;
867
868		css_put(&iter->css);
869		/* If no ROOT, walk all, ignore hierarchy */
870		if (!cond || (root && !hierarchy_used))
871			return NULL;
872
873		if (!root)
874			root = root_mem_cgroup;
875
876		do {
877			iter = NULL;
878			rcu_read_lock();
879
880			css = css_get_next(&mem_cgroup_subsys, nextid,
881					&root->css, &found);
882			if (css && css_tryget(css))
883				iter = container_of(css, struct mem_cgroup, css);
884			rcu_read_unlock();
885			/* If css is NULL, no more cgroups will be found */
886			nextid = found + 1;
887		} while (css && !iter);
888
889		return iter;
890	}
891	/*
892	 * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please
893	 * be careful that "break" loop is not allowed. We have reference count.
894	 * Instead of that modify "cond" to be false and "continue" to exit the loop.
895	 */
896	#define for_each_mem_cgroup_tree_cond(iter, root, cond) \
897		for (iter = mem_cgroup_start_loop(root);\
898		     iter != NULL;\
899		     iter = mem_cgroup_get_next(iter, root, cond))
900
901	#define for_each_mem_cgroup_tree(iter, root) \
902		for_each_mem_cgroup_tree_cond(iter, root, true)
903
904	#define for_each_mem_cgroup_all(iter) \
905		for_each_mem_cgroup_tree_cond(iter, NULL, true)
906
907
908	static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
909	{
910		return (mem == root_mem_cgroup);
911	}
912
913	void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
914	{
915		struct mem_cgroup *mem;
916
917		if (!mm)
918			return;
919
920		rcu_read_lock();
921		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
922		if (unlikely(!mem))
923			goto out;
924
925		switch (idx) {
926		case PGMAJFAULT:
927			mem_cgroup_pgmajfault(mem, 1);
928			break;
929		case PGFAULT:
930			mem_cgroup_pgfault(mem, 1);
931			break;
932		default:
933			BUG();
934		}
935	out:
936		rcu_read_unlock();
937	}
938	EXPORT_SYMBOL(mem_cgroup_count_vm_event);
939
940	/*
941	 * Following LRU functions are allowed to be used without PCG_LOCK.
942	 * Operations are called by routine of global LRU independently from memcg.
943	 * What we have to take care of here is validness of pc->mem_cgroup.
944	 *
945	 * Changes to pc->mem_cgroup happens when
946	 * 1. charge
947	 * 2. moving account
948	 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
949	 * It is added to LRU before charge.
950	 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
951	 * When moving account, the page is not on LRU. It's isolated.
952	 */
953
954	void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
955	{
956		struct page_cgroup *pc;
957		struct mem_cgroup_per_zone *mz;
958
959		if (mem_cgroup_disabled())
960			return;
961		pc = lookup_page_cgroup(page);
962		/* can happen while we handle swapcache. */
963		if (!TestClearPageCgroupAcctLRU(pc))
964			return;
965		VM_BUG_ON(!pc->mem_cgroup);
966		/*
967		 * We don't check PCG_USED bit. It's cleared when the "page" is finally
968		 * removed from global LRU.
969		 */
970		mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
971		/* huge page split is done under lru_lock. so, we have no races. */
972		MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
973		if (mem_cgroup_is_root(pc->mem_cgroup))
974			return;
975		VM_BUG_ON(list_empty(&pc->lru));
976		list_del_init(&pc->lru);
977	}
978
979	void mem_cgroup_del_lru(struct page *page)
980	{
981		mem_cgroup_del_lru_list(page, page_lru(page));
982	}
983
984	/*
985	 * Writeback is about to end against a page which has been marked for immediate
986	 * reclaim. If it still appears to be reclaimable, move it to the tail of the
987	 * inactive list.
988	 */
989	void mem_cgroup_rotate_reclaimable_page(struct page *page)
990	{
991		struct mem_cgroup_per_zone *mz;
992		struct page_cgroup *pc;
993		enum lru_list lru = page_lru(page);
994 994
995 if (mem_cgroup_disabled()) 995 if (mem_cgroup_disabled())
996 return; 996 return;
997 997
998 pc = lookup_page_cgroup(page); 998 pc = lookup_page_cgroup(page);
999 /* unused or root page is not rotated. */ 999 /* unused or root page is not rotated. */
1000 if (!PageCgroupUsed(pc)) 1000 if (!PageCgroupUsed(pc))
1001 return; 1001 return;
1002 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1002 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1003 smp_rmb(); 1003 smp_rmb();
1004 if (mem_cgroup_is_root(pc->mem_cgroup)) 1004 if (mem_cgroup_is_root(pc->mem_cgroup))
1005 return; 1005 return;
1006 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 1006 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1007 list_move_tail(&pc->lru, &mz->lists[lru]); 1007 list_move_tail(&pc->lru, &mz->lists[lru]);
1008 } 1008 }
1009 1009
1010 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 1010 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
1011 { 1011 {
1012 struct mem_cgroup_per_zone *mz; 1012 struct mem_cgroup_per_zone *mz;
1013 struct page_cgroup *pc; 1013 struct page_cgroup *pc;
1014 1014
1015 if (mem_cgroup_disabled()) 1015 if (mem_cgroup_disabled())
1016 return; 1016 return;
1017 1017
1018 pc = lookup_page_cgroup(page); 1018 pc = lookup_page_cgroup(page);
1019 /* unused or root page is not rotated. */ 1019 /* unused or root page is not rotated. */
1020 if (!PageCgroupUsed(pc)) 1020 if (!PageCgroupUsed(pc))
1021 return; 1021 return;
1022 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1022 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1023 smp_rmb(); 1023 smp_rmb();
1024 if (mem_cgroup_is_root(pc->mem_cgroup)) 1024 if (mem_cgroup_is_root(pc->mem_cgroup))
1025 return; 1025 return;
1026 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 1026 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1027 list_move(&pc->lru, &mz->lists[lru]); 1027 list_move(&pc->lru, &mz->lists[lru]);
1028 } 1028 }
1029 1029
1030 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) 1030 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
1031 { 1031 {
1032 struct page_cgroup *pc; 1032 struct page_cgroup *pc;
1033 struct mem_cgroup_per_zone *mz; 1033 struct mem_cgroup_per_zone *mz;
1034 1034
1035 if (mem_cgroup_disabled()) 1035 if (mem_cgroup_disabled())
1036 return; 1036 return;
1037 pc = lookup_page_cgroup(page); 1037 pc = lookup_page_cgroup(page);
1038 VM_BUG_ON(PageCgroupAcctLRU(pc)); 1038 VM_BUG_ON(PageCgroupAcctLRU(pc));
1039 if (!PageCgroupUsed(pc)) 1039 if (!PageCgroupUsed(pc))
1040 return; 1040 return;
1041 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1041 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1042 smp_rmb(); 1042 smp_rmb();
1043 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 1043 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1044 /* huge page split is done under lru_lock. so, we have no races. */ 1044 /* huge page split is done under lru_lock. so, we have no races. */
1045 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); 1045 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
1046 SetPageCgroupAcctLRU(pc); 1046 SetPageCgroupAcctLRU(pc);
1047 if (mem_cgroup_is_root(pc->mem_cgroup)) 1047 if (mem_cgroup_is_root(pc->mem_cgroup))
1048 return; 1048 return;
1049 list_add(&pc->lru, &mz->lists[lru]); 1049 list_add(&pc->lru, &mz->lists[lru]);
1050 } 1050 }
1051 1051
1052 /* 1052 /*
1053 * When handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed 1053 * When handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
1054 * while it's linked to the LRU because the page may be reused after it's fully 1054 * while it's linked to the LRU because the page may be reused after it's fully
1055 * uncharged. To handle that, unlink page_cgroup from the LRU when charging it again. 1055 * uncharged. To handle that, unlink page_cgroup from the LRU when charging it again.
1056 * It's done under lock_page and it is expected that zone->lru_lock is never held. 1056 * It's done under lock_page and it is expected that zone->lru_lock is never held.
1057 */ 1057 */
1058 static void mem_cgroup_lru_del_before_commit(struct page *page) 1058 static void mem_cgroup_lru_del_before_commit(struct page *page)
1059 { 1059 {
1060 unsigned long flags; 1060 unsigned long flags;
1061 struct zone *zone = page_zone(page); 1061 struct zone *zone = page_zone(page);
1062 struct page_cgroup *pc = lookup_page_cgroup(page); 1062 struct page_cgroup *pc = lookup_page_cgroup(page);
1063 1063
1064 /* 1064 /*
1065 * Doing this check without taking ->lru_lock seems wrong but this 1065 * Doing this check without taking ->lru_lock seems wrong but this
1066 * is safe. Because if page_cgroup's USED bit is unset, the page 1066 * is safe. Because if page_cgroup's USED bit is unset, the page
1067 * will not be added to any memcg's LRU. If page_cgroup's USED bit is 1067 * will not be added to any memcg's LRU. If page_cgroup's USED bit is
1068 * set, the commit after this will fail, anyway. 1068 * set, the commit after this will fail, anyway.
1069 * All of this charge/uncharge is done under some mutual exclusion. 1069 * All of this charge/uncharge is done under some mutual exclusion.
1070 * So, we don't need to take care of changes in the USED bit. 1070 * So, we don't need to take care of changes in the USED bit.
1071 */ 1071 */
1072 if (likely(!PageLRU(page))) 1072 if (likely(!PageLRU(page)))
1073 return; 1073 return;
1074 1074
1075 spin_lock_irqsave(&zone->lru_lock, flags); 1075 spin_lock_irqsave(&zone->lru_lock, flags);
1076 /* 1076 /*
1077 * Forget old LRU when this page_cgroup is *not* used. This Used bit 1077 * Forget old LRU when this page_cgroup is *not* used. This Used bit
1078 * is guarded by lock_page() because the page is SwapCache. 1078 * is guarded by lock_page() because the page is SwapCache.
1079 */ 1079 */
1080 if (!PageCgroupUsed(pc)) 1080 if (!PageCgroupUsed(pc))
1081 mem_cgroup_del_lru_list(page, page_lru(page)); 1081 mem_cgroup_del_lru_list(page, page_lru(page));
1082 spin_unlock_irqrestore(&zone->lru_lock, flags); 1082 spin_unlock_irqrestore(&zone->lru_lock, flags);
1083 } 1083 }
1084 1084
1085 static void mem_cgroup_lru_add_after_commit(struct page *page) 1085 static void mem_cgroup_lru_add_after_commit(struct page *page)
1086 { 1086 {
1087 unsigned long flags; 1087 unsigned long flags;
1088 struct zone *zone = page_zone(page); 1088 struct zone *zone = page_zone(page);
1089 struct page_cgroup *pc = lookup_page_cgroup(page); 1089 struct page_cgroup *pc = lookup_page_cgroup(page);
1090 1090
1091 /* take care of the case where the page is added to the LRU while we commit it */ 1091 /* take care of the case where the page is added to the LRU while we commit it */
1092 if (likely(!PageLRU(page))) 1092 if (likely(!PageLRU(page)))
1093 return; 1093 return;
1094 spin_lock_irqsave(&zone->lru_lock, flags); 1094 spin_lock_irqsave(&zone->lru_lock, flags);
1095 /* link when the page is linked to LRU but page_cgroup isn't */ 1095 /* link when the page is linked to LRU but page_cgroup isn't */
1096 if (PageLRU(page) && !PageCgroupAcctLRU(pc)) 1096 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
1097 mem_cgroup_add_lru_list(page, page_lru(page)); 1097 mem_cgroup_add_lru_list(page, page_lru(page));
1098 spin_unlock_irqrestore(&zone->lru_lock, flags); 1098 spin_unlock_irqrestore(&zone->lru_lock, flags);
1099 } 1099 }
1100 1100
1101 1101
1102 void mem_cgroup_move_lists(struct page *page, 1102 void mem_cgroup_move_lists(struct page *page,
1103 enum lru_list from, enum lru_list to) 1103 enum lru_list from, enum lru_list to)
1104 { 1104 {
1105 if (mem_cgroup_disabled()) 1105 if (mem_cgroup_disabled())
1106 return; 1106 return;
1107 mem_cgroup_del_lru_list(page, from); 1107 mem_cgroup_del_lru_list(page, from);
1108 mem_cgroup_add_lru_list(page, to); 1108 mem_cgroup_add_lru_list(page, to);
1109 } 1109 }
1110 1110
1111 /* 1111 /*
1112 * Checks whether given mem is same or in the root_mem's 1112 * Checks whether given mem is same or in the root_mem's
1113 * hierarchy subtree 1113 * hierarchy subtree
1114 */ 1114 */
1115 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem, 1115 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem,
1116 struct mem_cgroup *mem) 1116 struct mem_cgroup *mem)
1117 { 1117 {
1118 if (root_mem != mem) { 1118 if (root_mem != mem) {
1119 return (root_mem->use_hierarchy && 1119 return (root_mem->use_hierarchy &&
1120 css_is_ancestor(&mem->css, &root_mem->css)); 1120 css_is_ancestor(&mem->css, &root_mem->css));
1121 } 1121 }
1122 1122
1123 return true; 1123 return true;
1124 } 1124 }
1125 1125
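A minimal stand-alone illustration of the check above (user-space C, not part of this file; struct cg, is_ancestor(), and the example hierarchy are invented stand-ins for mem_cgroup and css_is_ancestor()): a group matches itself unconditionally, and matches a descendant only when the root has use_hierarchy enabled.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct cg {
	const char *name;
	struct cg *parent;
	bool use_hierarchy;
};

/* model of css_is_ancestor(): is "root" an ancestor of "cg"? */
static bool is_ancestor(struct cg *cg, struct cg *root)
{
	for (cg = cg->parent; cg; cg = cg->parent)
		if (cg == root)
			return true;
	return false;
}

/* model of mem_cgroup_same_or_subtree() */
static bool same_or_subtree(struct cg *root, struct cg *cg)
{
	if (root != cg)
		return root->use_hierarchy && is_ancestor(cg, root);
	return true;
}

int main(void)
{
	struct cg a = { "a", NULL, true };
	struct cg b = { "b", &a, true };

	/* prints "1 0": b is under a's subtree, a is not under b's */
	printf("%d %d\n", same_or_subtree(&a, &b), same_or_subtree(&b, &a));
	return 0;
}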
1126 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 1126 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
1127 { 1127 {
1128 int ret; 1128 int ret;
1129 struct mem_cgroup *curr = NULL; 1129 struct mem_cgroup *curr = NULL;
1130 struct task_struct *p; 1130 struct task_struct *p;
1131 1131
1132 p = find_lock_task_mm(task); 1132 p = find_lock_task_mm(task);
1133 if (!p) 1133 if (!p)
1134 return 0; 1134 return 0;
1135 curr = try_get_mem_cgroup_from_mm(p->mm); 1135 curr = try_get_mem_cgroup_from_mm(p->mm);
1136 task_unlock(p); 1136 task_unlock(p);
1137 if (!curr) 1137 if (!curr)
1138 return 0; 1138 return 0;
1139 /* 1139 /*
1140 * We should check use_hierarchy of "mem" not "curr". Because checking 1140 * We should check use_hierarchy of "mem" not "curr". Because checking
1141 * use_hierarchy of "curr" here makes this function true if hierarchy is 1141 * use_hierarchy of "curr" here makes this function true if hierarchy is
1142 * enabled in "curr" and "curr" is a child of "mem" in the *cgroup* 1142 * enabled in "curr" and "curr" is a child of "mem" in the *cgroup*
1143 * hierarchy (even if use_hierarchy is disabled in "mem"). 1143 * hierarchy (even if use_hierarchy is disabled in "mem").
1144 */ 1144 */
1145 ret = mem_cgroup_same_or_subtree(mem, curr); 1145 ret = mem_cgroup_same_or_subtree(mem, curr);
1146 css_put(&curr->css); 1146 css_put(&curr->css);
1147 return ret; 1147 return ret;
1148 } 1148 }
1149 1149
1150 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 1150 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
1151 { 1151 {
1152 unsigned long active; 1152 unsigned long active;
1153 unsigned long inactive; 1153 unsigned long inactive;
1154 unsigned long gb; 1154 unsigned long gb;
1155 unsigned long inactive_ratio; 1155 unsigned long inactive_ratio;
1156 1156
1157 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); 1157 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
1158 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); 1158 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
1159 1159
1160 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1160 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1161 if (gb) 1161 if (gb)
1162 inactive_ratio = int_sqrt(10 * gb); 1162 inactive_ratio = int_sqrt(10 * gb);
1163 else 1163 else
1164 inactive_ratio = 1; 1164 inactive_ratio = 1;
1165 1165
1166 if (present_pages) { 1166 if (present_pages) {
1167 present_pages[0] = inactive; 1167 present_pages[0] = inactive;
1168 present_pages[1] = active; 1168 present_pages[1] = active;
1169 } 1169 }
1170 1170
1171 return inactive_ratio; 1171 return inactive_ratio;
1172 } 1172 }
1173 1173
1174 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) 1174 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
1175 { 1175 {
1176 unsigned long active; 1176 unsigned long active;
1177 unsigned long inactive; 1177 unsigned long inactive;
1178 unsigned long present_pages[2]; 1178 unsigned long present_pages[2];
1179 unsigned long inactive_ratio; 1179 unsigned long inactive_ratio;
1180 1180
1181 inactive_ratio = calc_inactive_ratio(memcg, present_pages); 1181 inactive_ratio = calc_inactive_ratio(memcg, present_pages);
1182 1182
1183 inactive = present_pages[0]; 1183 inactive = present_pages[0];
1184 active = present_pages[1]; 1184 active = present_pages[1];
1185 1185
1186 if (inactive * inactive_ratio < active) 1186 if (inactive * inactive_ratio < active)
1187 return 1; 1187 return 1;
1188 1188
1189 return 0; 1189 return 0;
1190 } 1190 }
1191 1191
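The ratio computed above is int_sqrt(10 * size-in-GB), clamped to at least 1; the sketch below redoes that arithmetic in stand-alone C (not part of this file; the 4 KiB PAGE_SHIFT and the naive isqrt() are assumptions of the example, standing in for the kernel's definitions). With about 1 GiB of anon memory the ratio is 3, so the inactive list counts as "low" once it drops below roughly a quarter of the total.

#include <stdio.h>

#define PAGE_SHIFT 12	/* assume 4 KiB pages for the example */

/* crude integer square root, standing in for the kernel's int_sqrt() */
static unsigned long isqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

static unsigned long inactive_ratio(unsigned long inactive, unsigned long active)
{
	unsigned long gb = (inactive + active) >> (30 - PAGE_SHIFT);

	return gb ? isqrt(10 * gb) : 1;
}

int main(void)
{
	/* 1 GiB of anon: ratio 3, so inactive is "low" once it is under ~1/4 */
	unsigned long pages = 1UL << (30 - PAGE_SHIFT);
	unsigned long inactive = pages / 5, active = pages - inactive;
	unsigned long ratio = inactive_ratio(inactive, active);

	printf("ratio=%lu low=%d\n", ratio, inactive * ratio < active);
	return 0;
}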
1192 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) 1192 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
1193 { 1193 {
1194 unsigned long active; 1194 unsigned long active;
1195 unsigned long inactive; 1195 unsigned long inactive;
1196 1196
1197 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); 1197 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
1198 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); 1198 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
1199 1199
1200 return (active > inactive); 1200 return (active > inactive);
1201 } 1201 }
1202 1202
1203 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 1203 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1204 struct zone *zone) 1204 struct zone *zone)
1205 { 1205 {
1206 int nid = zone_to_nid(zone); 1206 int nid = zone_to_nid(zone);
1207 int zid = zone_idx(zone); 1207 int zid = zone_idx(zone);
1208 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 1208 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1209 1209
1210 return &mz->reclaim_stat; 1210 return &mz->reclaim_stat;
1211 } 1211 }
1212 1212
1213 struct zone_reclaim_stat * 1213 struct zone_reclaim_stat *
1214 mem_cgroup_get_reclaim_stat_from_page(struct page *page) 1214 mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1215 { 1215 {
1216 struct page_cgroup *pc; 1216 struct page_cgroup *pc;
1217 struct mem_cgroup_per_zone *mz; 1217 struct mem_cgroup_per_zone *mz;
1218 1218
1219 if (mem_cgroup_disabled()) 1219 if (mem_cgroup_disabled())
1220 return NULL; 1220 return NULL;
1221 1221
1222 pc = lookup_page_cgroup(page); 1222 pc = lookup_page_cgroup(page);
1223 if (!PageCgroupUsed(pc)) 1223 if (!PageCgroupUsed(pc))
1224 return NULL; 1224 return NULL;
1225 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1225 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1226 smp_rmb(); 1226 smp_rmb();
1227 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 1227 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1228 return &mz->reclaim_stat; 1228 return &mz->reclaim_stat;
1229 } 1229 }
1230 1230
1231 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 1231 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1232 struct list_head *dst, 1232 struct list_head *dst,
1233 unsigned long *scanned, int order, 1233 unsigned long *scanned, int order,
1234 int mode, struct zone *z, 1234 int mode, struct zone *z,
1235 struct mem_cgroup *mem_cont, 1235 struct mem_cgroup *mem_cont,
1236 int active, int file) 1236 int active, int file)
1237 { 1237 {
1238 unsigned long nr_taken = 0; 1238 unsigned long nr_taken = 0;
1239 struct page *page; 1239 struct page *page;
1240 unsigned long scan; 1240 unsigned long scan;
1241 LIST_HEAD(pc_list); 1241 LIST_HEAD(pc_list);
1242 struct list_head *src; 1242 struct list_head *src;
1243 struct page_cgroup *pc, *tmp; 1243 struct page_cgroup *pc, *tmp;
1244 int nid = zone_to_nid(z); 1244 int nid = zone_to_nid(z);
1245 int zid = zone_idx(z); 1245 int zid = zone_idx(z);
1246 struct mem_cgroup_per_zone *mz; 1246 struct mem_cgroup_per_zone *mz;
1247 int lru = LRU_FILE * file + active; 1247 int lru = LRU_FILE * file + active;
1248 int ret; 1248 int ret;
1249 1249
1250 BUG_ON(!mem_cont); 1250 BUG_ON(!mem_cont);
1251 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 1251 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
1252 src = &mz->lists[lru]; 1252 src = &mz->lists[lru];
1253 1253
1254 scan = 0; 1254 scan = 0;
1255 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 1255 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
1256 if (scan >= nr_to_scan) 1256 if (scan >= nr_to_scan)
1257 break; 1257 break;
1258 1258
1259 if (unlikely(!PageCgroupUsed(pc))) 1259 if (unlikely(!PageCgroupUsed(pc)))
1260 continue; 1260 continue;
1261 1261
1262 page = lookup_cgroup_page(pc); 1262 page = lookup_cgroup_page(pc);
1263 1263
1264 if (unlikely(!PageLRU(page))) 1264 if (unlikely(!PageLRU(page)))
1265 continue; 1265 continue;
1266 1266
1267 scan++; 1267 scan++;
1268 ret = __isolate_lru_page(page, mode, file); 1268 ret = __isolate_lru_page(page, mode, file);
1269 switch (ret) { 1269 switch (ret) {
1270 case 0: 1270 case 0:
1271 list_move(&page->lru, dst); 1271 list_move(&page->lru, dst);
1272 mem_cgroup_del_lru(page); 1272 mem_cgroup_del_lru(page);
1273 nr_taken += hpage_nr_pages(page); 1273 nr_taken += hpage_nr_pages(page);
1274 break; 1274 break;
1275 case -EBUSY: 1275 case -EBUSY:
1276 /* we don't affect global LRU but rotate in our LRU */ 1276 /* we don't affect global LRU but rotate in our LRU */
1277 mem_cgroup_rotate_lru_list(page, page_lru(page)); 1277 mem_cgroup_rotate_lru_list(page, page_lru(page));
1278 break; 1278 break;
1279 default: 1279 default:
1280 break; 1280 break;
1281 } 1281 }
1282 } 1282 }
1283 1283
1284 *scanned = scan; 1284 *scanned = scan;
1285 1285
1286 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, 1286 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
1287 0, 0, 0, mode); 1287 0, 0, 0, mode);
1288 1288
1289 return nr_taken; 1289 return nr_taken;
1290 } 1290 }
1291 1291
1292 #define mem_cgroup_from_res_counter(counter, member) \ 1292 #define mem_cgroup_from_res_counter(counter, member) \
1293 container_of(counter, struct mem_cgroup, member) 1293 container_of(counter, struct mem_cgroup, member)
1294 1294
1295 /** 1295 /**
1296 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1296 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1297 * @mem: the memory cgroup 1297 * @mem: the memory cgroup
1298 * 1298 *
1299 * Returns the maximum amount of memory @mem can be charged with, in 1299 * Returns the maximum amount of memory @mem can be charged with, in
1300 * pages. 1300 * pages.
1301 */ 1301 */
1302 static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) 1302 static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
1303 { 1303 {
1304 unsigned long long margin; 1304 unsigned long long margin;
1305 1305
1306 margin = res_counter_margin(&mem->res); 1306 margin = res_counter_margin(&mem->res);
1307 if (do_swap_account) 1307 if (do_swap_account)
1308 margin = min(margin, res_counter_margin(&mem->memsw)); 1308 margin = min(margin, res_counter_margin(&mem->memsw));
1309 return margin >> PAGE_SHIFT; 1309 return margin >> PAGE_SHIFT;
1310 } 1310 }
1311 1311
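A stand-alone rendition of the headroom calculation above (user-space C, not part of this file; struct counter, the 4 KiB page size, and the example numbers are invented for illustration): with swap accounting on, the chargeable space is the smaller of the two counters' remaining room, reported in pages.

#include <stdio.h>

#define PAGE_SHIFT 12	/* assume 4 KiB pages */

struct counter { unsigned long long usage, limit; };

/* model of res_counter_margin(): free room left under the limit, in bytes */
static unsigned long long counter_margin(const struct counter *c)
{
	return c->limit > c->usage ? c->limit - c->usage : 0;
}

/* model of mem_cgroup_margin() with swap accounting enabled */
static unsigned long margin_pages(const struct counter *res, const struct counter *memsw)
{
	unsigned long long margin = counter_margin(res);
	unsigned long long m2 = counter_margin(memsw);

	if (m2 < margin)
		margin = m2;
	return (unsigned long)(margin >> PAGE_SHIFT);
}

int main(void)
{
	struct counter res   = { 300ULL << 20, 512ULL << 20 };	/* 300 MB used of 512 MB */
	struct counter memsw = { 460ULL << 20, 512ULL << 20 };	/* swap already eats most of the rest */

	printf("%lu pages\n", margin_pages(&res, &memsw));	/* limited by memsw: 52 MB = 13312 pages */
	return 0;
}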
1312 int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1312 int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1313 { 1313 {
1314 struct cgroup *cgrp = memcg->css.cgroup; 1314 struct cgroup *cgrp = memcg->css.cgroup;
1315 1315
1316 /* root ? */ 1316 /* root ? */
1317 if (cgrp->parent == NULL) 1317 if (cgrp->parent == NULL)
1318 return vm_swappiness; 1318 return vm_swappiness;
1319 1319
1320 return memcg->swappiness; 1320 return memcg->swappiness;
1321 } 1321 }
1322 1322
1323 static void mem_cgroup_start_move(struct mem_cgroup *mem) 1323 static void mem_cgroup_start_move(struct mem_cgroup *mem)
1324 { 1324 {
1325 int cpu; 1325 int cpu;
1326 1326
1327 get_online_cpus(); 1327 get_online_cpus();
1328 spin_lock(&mem->pcp_counter_lock); 1328 spin_lock(&mem->pcp_counter_lock);
1329 for_each_online_cpu(cpu) 1329 for_each_online_cpu(cpu)
1330 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; 1330 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1331 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; 1331 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1332 spin_unlock(&mem->pcp_counter_lock); 1332 spin_unlock(&mem->pcp_counter_lock);
1333 put_online_cpus(); 1333 put_online_cpus();
1334 1334
1335 synchronize_rcu(); 1335 synchronize_rcu();
1336 } 1336 }
1337 1337
1338 static void mem_cgroup_end_move(struct mem_cgroup *mem) 1338 static void mem_cgroup_end_move(struct mem_cgroup *mem)
1339 { 1339 {
1340 int cpu; 1340 int cpu;
1341 1341
1342 if (!mem) 1342 if (!mem)
1343 return; 1343 return;
1344 get_online_cpus(); 1344 get_online_cpus();
1345 spin_lock(&mem->pcp_counter_lock); 1345 spin_lock(&mem->pcp_counter_lock);
1346 for_each_online_cpu(cpu) 1346 for_each_online_cpu(cpu)
1347 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; 1347 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1348 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; 1348 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1349 spin_unlock(&mem->pcp_counter_lock); 1349 spin_unlock(&mem->pcp_counter_lock);
1350 put_online_cpus(); 1350 put_online_cpus();
1351 } 1351 }
1352 /* 1352 /*
1353 * Two routines for checking whether "mem" is under move_account() or not. 1353 * Two routines for checking whether "mem" is under move_account() or not.
1354 * 1354 *
1355 * mem_cgroup_stealed() - checking whether a cgroup is mc.from or not. This is used 1355 * mem_cgroup_stealed() - checking whether a cgroup is mc.from or not. This is used
1356 * for avoiding race in accounting. If true, 1356 * for avoiding race in accounting. If true,
1357 * pc->mem_cgroup may be overwritten. 1357 * pc->mem_cgroup may be overwritten.
1358 * 1358 *
1359 * mem_cgroup_under_move() - checking whether a cgroup is mc.from or mc.to or 1359 * mem_cgroup_under_move() - checking whether a cgroup is mc.from or mc.to or
1360 * under the hierarchy of moving cgroups. This is for 1360 * under the hierarchy of moving cgroups. This is for
1361 * waiting at high memory pressure caused by "move". 1361 * waiting at high memory pressure caused by "move".
1362 */ 1362 */
1363 1363
1364 static bool mem_cgroup_stealed(struct mem_cgroup *mem) 1364 static bool mem_cgroup_stealed(struct mem_cgroup *mem)
1365 { 1365 {
1366 VM_BUG_ON(!rcu_read_lock_held()); 1366 VM_BUG_ON(!rcu_read_lock_held());
1367 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; 1367 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1368 } 1368 }
1369 1369
1370 static bool mem_cgroup_under_move(struct mem_cgroup *mem) 1370 static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1371 { 1371 {
1372 struct mem_cgroup *from; 1372 struct mem_cgroup *from;
1373 struct mem_cgroup *to; 1373 struct mem_cgroup *to;
1374 bool ret = false; 1374 bool ret = false;
1375 /* 1375 /*
1376 * Unlike task_move routines, we access mc.to, mc.from not under 1376 * Unlike task_move routines, we access mc.to, mc.from not under
1377 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1377 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1378 */ 1378 */
1379 spin_lock(&mc.lock); 1379 spin_lock(&mc.lock);
1380 from = mc.from; 1380 from = mc.from;
1381 to = mc.to; 1381 to = mc.to;
1382 if (!from) 1382 if (!from)
1383 goto unlock; 1383 goto unlock;
1384 1384
1385 ret = mem_cgroup_same_or_subtree(mem, from) 1385 ret = mem_cgroup_same_or_subtree(mem, from)
1386 || mem_cgroup_same_or_subtree(mem, to); 1386 || mem_cgroup_same_or_subtree(mem, to);
1387 unlock: 1387 unlock:
1388 spin_unlock(&mc.lock); 1388 spin_unlock(&mc.lock);
1389 return ret; 1389 return ret;
1390 } 1390 }
1391 1391
1392 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) 1392 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
1393 { 1393 {
1394 if (mc.moving_task && current != mc.moving_task) { 1394 if (mc.moving_task && current != mc.moving_task) {
1395 if (mem_cgroup_under_move(mem)) { 1395 if (mem_cgroup_under_move(mem)) {
1396 DEFINE_WAIT(wait); 1396 DEFINE_WAIT(wait);
1397 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1397 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1398 /* moving charge context might have finished. */ 1398 /* moving charge context might have finished. */
1399 if (mc.moving_task) 1399 if (mc.moving_task)
1400 schedule(); 1400 schedule();
1401 finish_wait(&mc.waitq, &wait); 1401 finish_wait(&mc.waitq, &wait);
1402 return true; 1402 return true;
1403 } 1403 }
1404 } 1404 }
1405 return false; 1405 return false;
1406 } 1406 }
1407 1407
1408 /** 1408 /**
1409 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1409 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1410 * @memcg: The memory cgroup that went over limit 1410 * @memcg: The memory cgroup that went over limit
1411 * @p: Task that is going to be killed 1411 * @p: Task that is going to be killed
1412 * 1412 *
1413 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1413 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1414 * enabled 1414 * enabled
1415 */ 1415 */
1416 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1416 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1417 { 1417 {
1418 struct cgroup *task_cgrp; 1418 struct cgroup *task_cgrp;
1419 struct cgroup *mem_cgrp; 1419 struct cgroup *mem_cgrp;
1420 /* 1420 /*
1421 * Need a buffer in BSS, can't rely on allocations. The code relies 1421 * Need a buffer in BSS, can't rely on allocations. The code relies
1422 * on the assumption that OOM is serialized for memory controller. 1422 * on the assumption that OOM is serialized for memory controller.
1423 * If this assumption is broken, revisit this code. 1423 * If this assumption is broken, revisit this code.
1424 */ 1424 */
1425 static char memcg_name[PATH_MAX]; 1425 static char memcg_name[PATH_MAX];
1426 int ret; 1426 int ret;
1427 1427
1428 if (!memcg || !p) 1428 if (!memcg || !p)
1429 return; 1429 return;
1430 1430
1431 1431
1432 rcu_read_lock(); 1432 rcu_read_lock();
1433 1433
1434 mem_cgrp = memcg->css.cgroup; 1434 mem_cgrp = memcg->css.cgroup;
1435 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); 1435 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1436 1436
1437 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); 1437 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1438 if (ret < 0) { 1438 if (ret < 0) {
1439 /* 1439 /*
1440 * Unfortunately, we are unable to convert to a useful name, 1440 * Unfortunately, we are unable to convert to a useful name,
1441 * but we'll still print out the usage information. 1441 * but we'll still print out the usage information.
1442 */ 1442 */
1443 rcu_read_unlock(); 1443 rcu_read_unlock();
1444 goto done; 1444 goto done;
1445 } 1445 }
1446 rcu_read_unlock(); 1446 rcu_read_unlock();
1447 1447
1448 printk(KERN_INFO "Task in %s killed", memcg_name); 1448 printk(KERN_INFO "Task in %s killed", memcg_name);
1449 1449
1450 rcu_read_lock(); 1450 rcu_read_lock();
1451 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1451 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1452 if (ret < 0) { 1452 if (ret < 0) {
1453 rcu_read_unlock(); 1453 rcu_read_unlock();
1454 goto done; 1454 goto done;
1455 } 1455 }
1456 rcu_read_unlock(); 1456 rcu_read_unlock();
1457 1457
1458 /* 1458 /*
1459 * Continues from above, so we don't need a KERN_ level 1459 * Continues from above, so we don't need a KERN_ level
1460 */ 1460 */
1461 printk(KERN_CONT " as a result of limit of %s\n", memcg_name); 1461 printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1462 done: 1462 done:
1463 1463
1464 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", 1464 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1465 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1465 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1466 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1466 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1467 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1467 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1468 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " 1468 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1469 "failcnt %llu\n", 1469 "failcnt %llu\n",
1470 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1470 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1471 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1471 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1472 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1472 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1473 } 1473 }
1474 1474
1475 /* 1475 /*
1476 * This function returns the number of memcgs under the hierarchy tree. Returns 1476 * This function returns the number of memcgs under the hierarchy tree. Returns
1477 * 1 (self count) if there are no children. 1477 * 1 (self count) if there are no children.
1478 */ 1478 */
1479 static int mem_cgroup_count_children(struct mem_cgroup *mem) 1479 static int mem_cgroup_count_children(struct mem_cgroup *mem)
1480 { 1480 {
1481 int num = 0; 1481 int num = 0;
1482 struct mem_cgroup *iter; 1482 struct mem_cgroup *iter;
1483 1483
1484 for_each_mem_cgroup_tree(iter, mem) 1484 for_each_mem_cgroup_tree(iter, mem)
1485 num++; 1485 num++;
1486 return num; 1486 return num;
1487 } 1487 }
1488 1488
1489 /* 1489 /*
1490 * Return the memory (and swap, if configured) limit for a memcg. 1490 * Return the memory (and swap, if configured) limit for a memcg.
1491 */ 1491 */
1492 u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1492 u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1493 { 1493 {
1494 u64 limit; 1494 u64 limit;
1495 u64 memsw; 1495 u64 memsw;
1496 1496
1497 limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1497 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1498 limit += total_swap_pages << PAGE_SHIFT; 1498 limit += total_swap_pages << PAGE_SHIFT;
1499 1499
1500 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1500 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1501 /* 1501 /*
1502 * If memsw is finite and limits the amount of swap space available 1502 * If memsw is finite and limits the amount of swap space available
1503 * to this memcg, return that limit. 1503 * to this memcg, return that limit.
1504 */ 1504 */
1505 return min(limit, memsw); 1505 return min(limit, memsw);
1506 } 1506 }
1507 1507
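The limit returned above is effectively min(memory limit + total swap, memory+swap limit); a stand-alone sketch with made-up numbers (not part of this file; the cgroupfs file names in the comments are only reminders of where these values come from in memcg v1):

#include <stdio.h>

int main(void)
{
	/* illustrative numbers only */
	unsigned long long mem_limit   = 512ULL << 20;	/* memory.limit_in_bytes */
	unsigned long long memsw_limit = 768ULL << 20;	/* memory.memsw.limit_in_bytes */
	unsigned long long total_swap  = 1024ULL << 20;	/* swap configured system-wide */

	/* model of mem_cgroup_get_limit(): memory limit plus all swap,
	 * clamped by the (possibly tighter) memory+swap limit */
	unsigned long long limit = mem_limit + total_swap;
	if (memsw_limit < limit)
		limit = memsw_limit;

	printf("effective limit: %llu MB\n", limit >> 20);	/* 768 MB here */
	return 0;
}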
1508 /* 1508 /*
1509 * Visit the first child (need not be the first child as per the ordering 1509 * Visit the first child (need not be the first child as per the ordering
1510 * of the cgroup list, since we track last_scanned_child) of @mem and use 1510 * of the cgroup list, since we track last_scanned_child) of @mem and use
1511 * that to reclaim free pages from. 1511 * that to reclaim free pages from.
1512 */ 1512 */
1513 static struct mem_cgroup * 1513 static struct mem_cgroup *
1514 mem_cgroup_select_victim(struct mem_cgroup *root_mem) 1514 mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1515 { 1515 {
1516 struct mem_cgroup *ret = NULL; 1516 struct mem_cgroup *ret = NULL;
1517 struct cgroup_subsys_state *css; 1517 struct cgroup_subsys_state *css;
1518 int nextid, found; 1518 int nextid, found;
1519 1519
1520 if (!root_mem->use_hierarchy) { 1520 if (!root_mem->use_hierarchy) {
1521 css_get(&root_mem->css); 1521 css_get(&root_mem->css);
1522 ret = root_mem; 1522 ret = root_mem;
1523 } 1523 }
1524 1524
1525 while (!ret) { 1525 while (!ret) {
1526 rcu_read_lock(); 1526 rcu_read_lock();
1527 nextid = root_mem->last_scanned_child + 1; 1527 nextid = root_mem->last_scanned_child + 1;
1528 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, 1528 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
1529 &found); 1529 &found);
1530 if (css && css_tryget(css)) 1530 if (css && css_tryget(css))
1531 ret = container_of(css, struct mem_cgroup, css); 1531 ret = container_of(css, struct mem_cgroup, css);
1532 1532
1533 rcu_read_unlock(); 1533 rcu_read_unlock();
1534 /* Updates scanning parameter */ 1534 /* Updates scanning parameter */
1535 if (!css) { 1535 if (!css) {
1536 /* this means start scan from ID:1 */ 1536 /* this means start scan from ID:1 */
1537 root_mem->last_scanned_child = 0; 1537 root_mem->last_scanned_child = 0;
1538 } else 1538 } else
1539 root_mem->last_scanned_child = found; 1539 root_mem->last_scanned_child = found;
1540 } 1540 }
1541 1541
1542 return ret; 1542 return ret;
1543 } 1543 }
1544 1544
1545 /** 1545 /**
1546 * test_mem_cgroup_node_reclaimable 1546 * test_mem_cgroup_node_reclaimable
1547 * @mem: the target memcg 1547 * @mem: the target memcg
1548 * @nid: the node ID to be checked. 1548 * @nid: the node ID to be checked.
1549 * @noswap : specify true here if the user wants file only information. 1549 * @noswap : specify true here if the user wants file only information.
1550 * 1550 *
1551 * This function returns whether the specified memcg contains any 1551 * This function returns whether the specified memcg contains any
1552 * reclaimable pages on a node. Returns true if there are any reclaimable 1552 * reclaimable pages on a node. Returns true if there are any reclaimable
1553 * pages in the node. 1553 * pages in the node.
1554 */ 1554 */
1555 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, 1555 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
1556 int nid, bool noswap) 1556 int nid, bool noswap)
1557 { 1557 {
1558 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE)) 1558 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE))
1559 return true; 1559 return true;
1560 if (noswap || !total_swap_pages) 1560 if (noswap || !total_swap_pages)
1561 return false; 1561 return false;
1562 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON)) 1562 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON))
1563 return true; 1563 return true;
1564 return false; 1564 return false;
1565 1565
1566 } 1566 }
1567 #if MAX_NUMNODES > 1 1567 #if MAX_NUMNODES > 1
1568 1568
1569 /* 1569 /*
1570 * Always updating the nodemask is not very good - even if we have an empty 1570 * Always updating the nodemask is not very good - even if we have an empty
1571 * list or the wrong list here, we can start from some node and traverse all 1571 * list or the wrong list here, we can start from some node and traverse all
1572 * nodes based on the zonelist. So update the list loosely once per 10 secs. 1572 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1573 * 1573 *
1574 */ 1574 */
1575 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) 1575 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
1576 { 1576 {
1577 int nid; 1577 int nid;
1578 /* 1578 /*
1579 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1579 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1580 * pagein/pageout changes since the last update. 1580 * pagein/pageout changes since the last update.
1581 */ 1581 */
1582 if (!atomic_read(&mem->numainfo_events)) 1582 if (!atomic_read(&mem->numainfo_events))
1583 return; 1583 return;
1584 if (atomic_inc_return(&mem->numainfo_updating) > 1) 1584 if (atomic_inc_return(&mem->numainfo_updating) > 1)
1585 return; 1585 return;
1586 1586
1587 /* make a nodemask where this memcg uses memory from */ 1587 /* make a nodemask where this memcg uses memory from */
1588 mem->scan_nodes = node_states[N_HIGH_MEMORY]; 1588 mem->scan_nodes = node_states[N_HIGH_MEMORY];
1589 1589
1590 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { 1590 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
1591 1591
1592 if (!test_mem_cgroup_node_reclaimable(mem, nid, false)) 1592 if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
1593 node_clear(nid, mem->scan_nodes); 1593 node_clear(nid, mem->scan_nodes);
1594 } 1594 }
1595 1595
1596 atomic_set(&mem->numainfo_events, 0); 1596 atomic_set(&mem->numainfo_events, 0);
1597 atomic_set(&mem->numainfo_updating, 0); 1597 atomic_set(&mem->numainfo_updating, 0);
1598 } 1598 }
1599 1599
1600 /* 1600 /*
1601 * Selecting a node to start reclaim from. Because what we need is just 1601 * Selecting a node to start reclaim from. Because what we need is just
1602 * reducing the usage counter, starting from anywhere is OK. Considering 1602 * reducing the usage counter, starting from anywhere is OK. Considering
1603 * memory reclaim from current node, there are pros. and cons. 1603 * memory reclaim from current node, there are pros. and cons.
1604 * 1604 *
1605 * Freeing memory from current node means freeing memory from a node which 1605 * Freeing memory from current node means freeing memory from a node which
1606 * we'll use or we've used. So, it may make LRU bad. And if several threads 1606 * we'll use or we've used. So, it may make LRU bad. And if several threads
1607 * hit limits, it will see a contention on a node. But freeing from remote 1607 * hit limits, it will see a contention on a node. But freeing from remote
1608 * node means more costs for memory reclaim because of memory latency. 1608 * node means more costs for memory reclaim because of memory latency.
1609 * 1609 *
1610 * Now, we use round-robin. Better algorithm is welcomed. 1610 * Now, we use round-robin. Better algorithm is welcomed.
1611 */ 1611 */
1612 int mem_cgroup_select_victim_node(struct mem_cgroup *mem) 1612 int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1613 { 1613 {
1614 int node; 1614 int node;
1615 1615
1616 mem_cgroup_may_update_nodemask(mem); 1616 mem_cgroup_may_update_nodemask(mem);
1617 node = mem->last_scanned_node; 1617 node = mem->last_scanned_node;
1618 1618
1619 node = next_node(node, mem->scan_nodes); 1619 node = next_node(node, mem->scan_nodes);
1620 if (node == MAX_NUMNODES) 1620 if (node == MAX_NUMNODES)
1621 node = first_node(mem->scan_nodes); 1621 node = first_node(mem->scan_nodes);
1622 /* 1622 /*
1623 * We call this when we hit limit, not when pages are added to LRU. 1623 * We call this when we hit limit, not when pages are added to LRU.
1624 * No LRU may hold pages because all pages are UNEVICTABLE or 1624 * No LRU may hold pages because all pages are UNEVICTABLE or
1625 * memcg is too small and all pages are not on LRU. In that case, 1625 * memcg is too small and all pages are not on LRU. In that case,
1626 * we use the current node. 1626 * we use the current node.
1627 */ 1627 */
1628 if (unlikely(node == MAX_NUMNODES)) 1628 if (unlikely(node == MAX_NUMNODES))
1629 node = numa_node_id(); 1629 node = numa_node_id();
1630 1630
1631 mem->last_scanned_node = node; 1631 mem->last_scanned_node = node;
1632 return node; 1632 return node;
1633 } 1633 }
1634 1634
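A stand-alone model of the round-robin selection above (user-space C, not part of this file; MAX_NODES, the bitmask representation, and the helper names are invented stand-ins for the kernel's nodemask API): walk to the next set bit after the last scanned node, wrap to the first set bit, and fall back to the local node when the mask is empty.

#include <stdio.h>

#define MAX_NODES 8

/* model of next_node(): first set bit strictly after "node", or MAX_NODES */
static int model_next_node(int node, unsigned int mask)
{
	for (int n = node + 1; n < MAX_NODES; n++)
		if (mask & (1u << n))
			return n;
	return MAX_NODES;
}

/* model of first_node() */
static int model_first_node(unsigned int mask)
{
	return model_next_node(-1, mask);
}

/* round-robin selection in the spirit of mem_cgroup_select_victim_node() */
static int select_victim_node(int *last_scanned, unsigned int scan_nodes, int this_node)
{
	int node = model_next_node(*last_scanned, scan_nodes);

	if (node == MAX_NODES)
		node = model_first_node(scan_nodes);
	if (node == MAX_NODES)		/* empty mask: fall back to the local node */
		node = this_node;
	*last_scanned = node;
	return node;
}

int main(void)
{
	unsigned int scan_nodes = (1u << 1) | (1u << 3);	/* reclaimable nodes 1 and 3 */
	int last = MAX_NODES;					/* nothing scanned yet */

	for (int i = 0; i < 4; i++)
		printf("%d ", select_victim_node(&last, scan_nodes, 0));
	printf("\n");						/* prints: 1 3 1 3 */
	return 0;
}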
1635 /* 1635 /*
1636 * Check all nodes for whether they contain reclaimable pages or not. 1636 * Check all nodes for whether they contain reclaimable pages or not.
1637 * For quick scan, we make use of scan_nodes. This will allow us to skip 1637 * For quick scan, we make use of scan_nodes. This will allow us to skip
1638 * unused nodes. But scan_nodes is lazily updated and may not contain 1638 * unused nodes. But scan_nodes is lazily updated and may not contain
1639 * enough new information. We need to do double check. 1639 * enough new information. We need to do double check.
1640 */ 1640 */
1641 bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) 1641 bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1642 { 1642 {
1643 int nid; 1643 int nid;
1644 1644
1645 /* 1645 /*
1646 * quick check...making use of scan_node. 1646 * quick check...making use of scan_node.
1647 * We can skip unused nodes. 1647 * We can skip unused nodes.
1648 */ 1648 */
1649 if (!nodes_empty(mem->scan_nodes)) { 1649 if (!nodes_empty(mem->scan_nodes)) {
1650 for (nid = first_node(mem->scan_nodes); 1650 for (nid = first_node(mem->scan_nodes);
1651 nid < MAX_NUMNODES; 1651 nid < MAX_NUMNODES;
1652 nid = next_node(nid, mem->scan_nodes)) { 1652 nid = next_node(nid, mem->scan_nodes)) {
1653 1653
1654 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) 1654 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
1655 return true; 1655 return true;
1656 } 1656 }
1657 } 1657 }
1658 /* 1658 /*
1659 * Check rest of nodes. 1659 * Check rest of nodes.
1660 */ 1660 */
1661 for_each_node_state(nid, N_HIGH_MEMORY) { 1661 for_each_node_state(nid, N_HIGH_MEMORY) {
1662 if (node_isset(nid, mem->scan_nodes)) 1662 if (node_isset(nid, mem->scan_nodes))
1663 continue; 1663 continue;
1664 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) 1664 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
1665 return true; 1665 return true;
1666 } 1666 }
1667 return false; 1667 return false;
1668 } 1668 }
1669 1669
1670 #else 1670 #else
1671 int mem_cgroup_select_victim_node(struct mem_cgroup *mem) 1671 int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1672 { 1672 {
1673 return 0; 1673 return 0;
1674 } 1674 }
1675 1675
1676 bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) 1676 bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1677 { 1677 {
1678 return test_mem_cgroup_node_reclaimable(mem, 0, noswap); 1678 return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
1679 } 1679 }
1680 #endif 1680 #endif
1681 1681
1682 static void __mem_cgroup_record_scanstat(unsigned long *stats, 1682 static void __mem_cgroup_record_scanstat(unsigned long *stats,
1683 struct memcg_scanrecord *rec) 1683 struct memcg_scanrecord *rec)
1684 { 1684 {
1685 1685
1686 stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1]; 1686 stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1];
1687 stats[SCAN_ANON] += rec->nr_scanned[0]; 1687 stats[SCAN_ANON] += rec->nr_scanned[0];
1688 stats[SCAN_FILE] += rec->nr_scanned[1]; 1688 stats[SCAN_FILE] += rec->nr_scanned[1];
1689 1689
1690 stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1]; 1690 stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1];
1691 stats[ROTATE_ANON] += rec->nr_rotated[0]; 1691 stats[ROTATE_ANON] += rec->nr_rotated[0];
1692 stats[ROTATE_FILE] += rec->nr_rotated[1]; 1692 stats[ROTATE_FILE] += rec->nr_rotated[1];
1693 1693
1694 stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1]; 1694 stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1];
1695 stats[FREED_ANON] += rec->nr_freed[0]; 1695 stats[FREED_ANON] += rec->nr_freed[0];
1696 stats[FREED_FILE] += rec->nr_freed[1]; 1696 stats[FREED_FILE] += rec->nr_freed[1];
1697 1697
1698 stats[ELAPSED] += rec->elapsed; 1698 stats[ELAPSED] += rec->elapsed;
1699 } 1699 }
1700 1700
1701 static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec) 1701 static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec)
1702 { 1702 {
1703 struct mem_cgroup *mem; 1703 struct mem_cgroup *mem;
1704 int context = rec->context; 1704 int context = rec->context;
1705 1705
1706 if (context >= NR_SCAN_CONTEXT) 1706 if (context >= NR_SCAN_CONTEXT)
1707 return; 1707 return;
1708 1708
1709 mem = rec->mem; 1709 mem = rec->mem;
1710 spin_lock(&mem->scanstat.lock); 1710 spin_lock(&mem->scanstat.lock);
1711 __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec); 1711 __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec);
1712 spin_unlock(&mem->scanstat.lock); 1712 spin_unlock(&mem->scanstat.lock);
1713 1713
1714 mem = rec->root; 1714 mem = rec->root;
1715 spin_lock(&mem->scanstat.lock); 1715 spin_lock(&mem->scanstat.lock);
1716 __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec); 1716 __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec);
1717 spin_unlock(&mem->scanstat.lock); 1717 spin_unlock(&mem->scanstat.lock);
1718 } 1718 }
1719 1719
1720 /* 1720 /*
1721 * Scan the hierarchy if needed to reclaim memory. We remember the last child 1721 * Scan the hierarchy if needed to reclaim memory. We remember the last child
1722 * we reclaimed from, so that we don't end up penalizing one child extensively 1722 * we reclaimed from, so that we don't end up penalizing one child extensively
1723 * based on its position in the children list. 1723 * based on its position in the children list.
1724 * 1724 *
1725 * root_mem is the original ancestor that we've been reclaiming from. 1725 * root_mem is the original ancestor that we've been reclaiming from.
1726 * 1726 *
1727 * We give up and return to the caller when we visit root_mem twice. 1727 * We give up and return to the caller when we visit root_mem twice.
1728 * (other groups can be removed while we're walking....) 1728 * (other groups can be removed while we're walking....)
1729 * 1729 *
1730 * If shrink==true, to avoid freeing too much, this returns immediately. 1730 * If shrink==true, to avoid freeing too much, this returns immediately.
1731 */ 1731 */
1732 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 1732 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1733 struct zone *zone, 1733 struct zone *zone,
1734 gfp_t gfp_mask, 1734 gfp_t gfp_mask,
1735 unsigned long reclaim_options, 1735 unsigned long reclaim_options,
1736 unsigned long *total_scanned) 1736 unsigned long *total_scanned)
1737 { 1737 {
1738 struct mem_cgroup *victim; 1738 struct mem_cgroup *victim;
1739 int ret, total = 0; 1739 int ret, total = 0;
1740 int loop = 0; 1740 int loop = 0;
1741 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; 1741 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1742 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; 1742 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1743 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; 1743 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1744 struct memcg_scanrecord rec; 1744 struct memcg_scanrecord rec;
1745 unsigned long excess; 1745 unsigned long excess;
1746 unsigned long scanned; 1746 unsigned long scanned;
1747 1747
1748 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; 1748 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
1749 1749
1750 /* If memsw_is_minimum==1, swap-out is of-no-use. */ 1750 /* If memsw_is_minimum==1, swap-out is of-no-use. */
1751 if (!check_soft && !shrink && root_mem->memsw_is_minimum) 1751 if (!check_soft && !shrink && root_mem->memsw_is_minimum)
1752 noswap = true; 1752 noswap = true;
1753 1753
1754 if (shrink) 1754 if (shrink)
1755 rec.context = SCAN_BY_SHRINK; 1755 rec.context = SCAN_BY_SHRINK;
1756 else if (check_soft) 1756 else if (check_soft)
1757 rec.context = SCAN_BY_SYSTEM; 1757 rec.context = SCAN_BY_SYSTEM;
1758 else 1758 else
1759 rec.context = SCAN_BY_LIMIT; 1759 rec.context = SCAN_BY_LIMIT;
1760 1760
1761 rec.root = root_mem; 1761 rec.root = root_mem;
1762 1762
1763 while (1) { 1763 while (1) {
1764 victim = mem_cgroup_select_victim(root_mem); 1764 victim = mem_cgroup_select_victim(root_mem);
1765 if (victim == root_mem) { 1765 if (victim == root_mem) {
1766 loop++; 1766 loop++;
1767 /* 1767 /*
1768 * We are not draining per cpu cached charges during 1768 * We are not draining per cpu cached charges during
1769 * soft limit reclaim because global reclaim doesn't 1769 * soft limit reclaim because global reclaim doesn't
1770 * care about charges. It tries to free some memory and 1770 * care about charges. It tries to free some memory and
1771 * draining the cached charges will not give it any. 1771 * draining the cached charges will not give it any.
1772 */ 1772 */
1773 if (!check_soft && loop >= 1) 1773 if (!check_soft && loop >= 1)
1774 drain_all_stock_async(root_mem); 1774 drain_all_stock_async(root_mem);
1775 if (loop >= 2) { 1775 if (loop >= 2) {
1776 /* 1776 /*
1777 * If we have not been able to reclaim 1777 * If we have not been able to reclaim
1778 * anything, it might because there are 1778 * anything, it might because there are
1779 * no reclaimable pages under this hierarchy 1779 * no reclaimable pages under this hierarchy
1780 */ 1780 */
1781 if (!check_soft || !total) { 1781 if (!check_soft || !total) {
1782 css_put(&victim->css); 1782 css_put(&victim->css);
1783 break; 1783 break;
1784 } 1784 }
1785 /* 1785 /*
1786 * We want to do more targeted reclaim. 1786 * We want to do more targeted reclaim.
1787 * excess >> 2 is not so excessive as to 1787 * excess >> 2 is not so excessive as to
1788 * reclaim too much, nor so little that we keep 1788 * reclaim too much, nor so little that we keep
1789 * coming back to reclaim from this cgroup 1789 * coming back to reclaim from this cgroup
1790 */ 1790 */
1791 if (total >= (excess >> 2) || 1791 if (total >= (excess >> 2) ||
1792 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { 1792 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1793 css_put(&victim->css); 1793 css_put(&victim->css);
1794 break; 1794 break;
1795 } 1795 }
1796 } 1796 }
1797 } 1797 }
1798 if (!mem_cgroup_reclaimable(victim, noswap)) { 1798 if (!mem_cgroup_reclaimable(victim, noswap)) {
1799 /* this cgroup's local usage == 0 */ 1799 /* this cgroup's local usage == 0 */
1800 css_put(&victim->css); 1800 css_put(&victim->css);
1801 continue; 1801 continue;
1802 } 1802 }
1803 rec.mem = victim; 1803 rec.mem = victim;
1804 rec.nr_scanned[0] = 0; 1804 rec.nr_scanned[0] = 0;
1805 rec.nr_scanned[1] = 0; 1805 rec.nr_scanned[1] = 0;
1806 rec.nr_rotated[0] = 0; 1806 rec.nr_rotated[0] = 0;
1807 rec.nr_rotated[1] = 0; 1807 rec.nr_rotated[1] = 0;
1808 rec.nr_freed[0] = 0; 1808 rec.nr_freed[0] = 0;
1809 rec.nr_freed[1] = 0; 1809 rec.nr_freed[1] = 0;
1810 rec.elapsed = 0; 1810 rec.elapsed = 0;
1811 /* we use swappiness of local cgroup */ 1811 /* we use swappiness of local cgroup */
1812 if (check_soft) { 1812 if (check_soft) {
1813 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1813 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1814 noswap, zone, &rec, &scanned); 1814 noswap, zone, &rec, &scanned);
1815 *total_scanned += scanned; 1815 *total_scanned += scanned;
1816 } else 1816 } else
1817 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1817 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1818 noswap, &rec); 1818 noswap, &rec);
1819 mem_cgroup_record_scanstat(&rec); 1819 mem_cgroup_record_scanstat(&rec);
1820 css_put(&victim->css); 1820 css_put(&victim->css);
1821 /* 1821 /*
1822 * When shrinking usage, we can't check whether we should stop here or 1822 * When shrinking usage, we can't check whether we should stop here or
1823 * reclaim more. That depends on the callers. last_scanned_child 1823 * reclaim more. That depends on the callers. last_scanned_child
1824 * will work well enough for keeping fairness under the tree. 1824 * will work well enough for keeping fairness under the tree.
1825 */ 1825 */
1826 if (shrink) 1826 if (shrink)
1827 return ret; 1827 return ret;
1828 total += ret; 1828 total += ret;
1829 if (check_soft) { 1829 if (check_soft) {
1830 if (!res_counter_soft_limit_excess(&root_mem->res)) 1830 if (!res_counter_soft_limit_excess(&root_mem->res))
1831 return total; 1831 return total;
1832 } else if (mem_cgroup_margin(root_mem)) 1832 } else if (mem_cgroup_margin(root_mem))
1833 return total; 1833 return total;
1834 } 1834 }
1835 return total; 1835 return total;
1836 } 1836 }
1837 1837
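Of the several exit conditions in the walk above, the targeted-reclaim bail-out is the least obvious; the stand-alone sketch below models just that test (not part of this file; MAX_RECLAIM_LOOPS and the numbers are illustrative stand-ins, and the !check_soft/!total break is deliberately left out): once the walk has come back to the root at least twice, stop as soon as a quarter of the excess has been reclaimed or too many passes have been made.

#include <stdbool.h>
#include <stdio.h>

#define MAX_RECLAIM_LOOPS 100	/* stands in for MEM_CGROUP_MAX_RECLAIM_LOOPS */

/*
 * Model of the bail-out test applied each time the walk comes back to the
 * root (loop >= 2): stop once a quarter of the excess has been reclaimed,
 * or after too many passes over the hierarchy.
 */
static bool should_stop(unsigned long total, unsigned long excess, int loop)
{
	if (loop < 2)
		return false;
	return total >= (excess >> 2) || loop > MAX_RECLAIM_LOOPS;
}

int main(void)
{
	unsigned long excess = 1000;	/* pages over the (soft) limit */

	printf("%d %d %d\n",
	       should_stop(100, excess, 2),	/* 0: less than excess/4 reclaimed */
	       should_stop(250, excess, 2),	/* 1: a quarter reclaimed, good enough */
	       should_stop(0,   excess, 101));	/* 1: give up after many passes */
	return 0;
}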
1838 /* 1838 /*
1839 * Check whether the OOM-Killer is already running under our hierarchy. 1839 * Check whether the OOM-Killer is already running under our hierarchy.
1840 * If someone is running, return false. 1840 * If someone is running, return false.
1841 * Has to be called with memcg_oom_lock 1841 * Has to be called with memcg_oom_lock
1842 */ 1842 */
1843 static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) 1843 static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1844 { 1844 {
1845 int lock_count = -1; 1845 int lock_count = -1;
1846 struct mem_cgroup *iter, *failed = NULL; 1846 struct mem_cgroup *iter, *failed = NULL;
1847 bool cond = true; 1847 bool cond = true;
1848 1848
1849 for_each_mem_cgroup_tree_cond(iter, mem, cond) { 1849 for_each_mem_cgroup_tree_cond(iter, mem, cond) {
1850 bool locked = iter->oom_lock; 1850 bool locked = iter->oom_lock;
1851 1851
1852 iter->oom_lock = true; 1852 iter->oom_lock = true;
1853 if (lock_count == -1) 1853 if (lock_count == -1)
1854 lock_count = iter->oom_lock; 1854 lock_count = iter->oom_lock;
1855 else if (lock_count != locked) { 1855 else if (lock_count != locked) {
1856 /* 1856 /*
1857 * this subtree of our hierarchy is already locked 1857 * this subtree of our hierarchy is already locked
1858 * so we cannot give a lock. 1858 * so we cannot give a lock.
1859 */ 1859 */
1860 lock_count = 0; 1860 lock_count = 0;
1861 failed = iter; 1861 failed = iter;
1862 cond = false; 1862 cond = false;
1863 } 1863 }
1864 } 1864 }
1865 1865
1866 if (!failed) 1866 if (!failed)
1867 goto done; 1867 goto done;
1868 1868
1869 /* 1869 /*
1870 * OK, we failed to lock the whole subtree so we have to clean up 1870 * OK, we failed to lock the whole subtree so we have to clean up
1871 * what we set up to the failing subtree 1871 * what we set up to the failing subtree
1872 */ 1872 */
1873 cond = true; 1873 cond = true;
1874 for_each_mem_cgroup_tree_cond(iter, mem, cond) { 1874 for_each_mem_cgroup_tree_cond(iter, mem, cond) {
1875 if (iter == failed) { 1875 if (iter == failed) {
1876 cond = false; 1876 cond = false;
1877 continue; 1877 continue;
1878 } 1878 }
1879 iter->oom_lock = false; 1879 iter->oom_lock = false;
1880 } 1880 }
1881 done: 1881 done:
1882 return lock_count; 1882 return lock_count;
1883 } 1883 }
1884 1884
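The lock/rollback walk above is easier to see on a flat array; this is a stand-alone model of its intent (user-space C, not part of this file; the array layout and names are invented, and the lock_count bookkeeping of the real function is simplified away): try to take the per-group flag across the whole subtree, and if some member already holds it, clear the flags taken so far and report failure.

#include <stdbool.h>
#include <stdio.h>

#define NR 4

/*
 * Model: locked[0..NR-1] are the subtree members in the order the iterator
 * visits them. Take every per-group flag; on the first one that is already
 * set, clear the ones taken so far and report failure.
 */
static bool oom_lock_subtree(bool locked[NR])
{
	int failed = -1;

	for (int i = 0; i < NR; i++) {
		if (locked[i]) {		/* someone in this subtree already holds it */
			failed = i;
			break;
		}
		locked[i] = true;
	}

	if (failed < 0)
		return true;

	for (int i = 0; i < failed; i++)	/* roll back what we set */
		locked[i] = false;
	return false;
}

int main(void)
{
	bool locked[NR] = { false, false, true, false };	/* member 2 is already locked */

	printf("got lock: %d\n", oom_lock_subtree(locked));	/* 0 */
	printf("flags: %d %d %d %d\n", locked[0], locked[1], locked[2], locked[3]);
	return 0;
}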
1885 /* 1885 /*
1886 * Has to be called with memcg_oom_lock 1886 * Has to be called with memcg_oom_lock
1887 */ 1887 */
1888 static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) 1888 static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1889 { 1889 {
1890 struct mem_cgroup *iter; 1890 struct mem_cgroup *iter;
1891 1891
1892 for_each_mem_cgroup_tree(iter, mem) 1892 for_each_mem_cgroup_tree(iter, mem)
1893 iter->oom_lock = false; 1893 iter->oom_lock = false;
1894 return 0; 1894 return 0;
1895 } 1895 }
1896 1896
1897 static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem) 1897 static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem)
1898 { 1898 {
1899 struct mem_cgroup *iter; 1899 struct mem_cgroup *iter;
1900 1900
1901 for_each_mem_cgroup_tree(iter, mem) 1901 for_each_mem_cgroup_tree(iter, mem)
1902 atomic_inc(&iter->under_oom); 1902 atomic_inc(&iter->under_oom);
1903 } 1903 }
1904 1904
1905 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) 1905 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
1906 { 1906 {
1907 struct mem_cgroup *iter; 1907 struct mem_cgroup *iter;
1908 1908
1909 /* 1909 /*
1910 * When a new child is created while the hierarchy is under oom, 1910 * When a new child is created while the hierarchy is under oom,
1911 * mem_cgroup_oom_lock() may not be called. We have to use 1911 * mem_cgroup_oom_lock() may not be called. We have to use
1912 * atomic_add_unless() here. 1912 * atomic_add_unless() here.
1913 */ 1913 */
1914 for_each_mem_cgroup_tree(iter, mem) 1914 for_each_mem_cgroup_tree(iter, mem)
1915 atomic_add_unless(&iter->under_oom, -1, 0); 1915 atomic_add_unless(&iter->under_oom, -1, 0);
1916 } 1916 }
1917 1917
1918 static DEFINE_SPINLOCK(memcg_oom_lock); 1918 static DEFINE_SPINLOCK(memcg_oom_lock);
1919 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1919 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1920 1920
1921 struct oom_wait_info { 1921 struct oom_wait_info {
1922 struct mem_cgroup *mem; 1922 struct mem_cgroup *mem;
1923 wait_queue_t wait; 1923 wait_queue_t wait;
1924 }; 1924 };
1925 1925
1926 static int memcg_oom_wake_function(wait_queue_t *wait, 1926 static int memcg_oom_wake_function(wait_queue_t *wait,
1927 unsigned mode, int sync, void *arg) 1927 unsigned mode, int sync, void *arg)
1928 { 1928 {
1929 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg, 1929 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg,
1930 *oom_wait_mem; 1930 *oom_wait_mem;
1931 struct oom_wait_info *oom_wait_info; 1931 struct oom_wait_info *oom_wait_info;
1932 1932
1933 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1933 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1934 oom_wait_mem = oom_wait_info->mem; 1934 oom_wait_mem = oom_wait_info->mem;
1935 1935
1936 /* 1936 /*
1937 * Both oom_wait_info->mem and wake_mem are stable under us, 1937 * Both oom_wait_info->mem and wake_mem are stable under us,
1938 * so we can use css_is_ancestor() without worrying about RCU. 1938 * so we can use css_is_ancestor() without worrying about RCU.
1939 */ 1939 */
1940 if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem) 1940 if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem)
1941 && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem)) 1941 && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem))
1942 return 0; 1942 return 0;
1943 return autoremove_wake_function(wait, mode, sync, arg); 1943 return autoremove_wake_function(wait, mode, sync, arg);
1944 } 1944 }
1945 1945
1946 static void memcg_wakeup_oom(struct mem_cgroup *mem) 1946 static void memcg_wakeup_oom(struct mem_cgroup *mem)
1947 { 1947 {
1948 /* for filtering, pass "mem" as argument. */ 1948 /* for filtering, pass "mem" as argument. */
1949 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); 1949 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
1950 } 1950 }
1951 1951
1952 static void memcg_oom_recover(struct mem_cgroup *mem) 1952 static void memcg_oom_recover(struct mem_cgroup *mem)
1953 { 1953 {
1954 if (mem && atomic_read(&mem->under_oom)) 1954 if (mem && atomic_read(&mem->under_oom))
1955 memcg_wakeup_oom(mem); 1955 memcg_wakeup_oom(mem);
1956 } 1956 }
1957 1957
1958 /* 1958 /*
1959 * Try to call the OOM killer. Returns false if we should exit the memory-reclaim loop. 1959 * Try to call the OOM killer. Returns false if we should exit the memory-reclaim loop.
1960 */ 1960 */
1961 bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) 1961 bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1962 { 1962 {
1963 struct oom_wait_info owait; 1963 struct oom_wait_info owait;
1964 bool locked, need_to_kill; 1964 bool locked, need_to_kill;
1965 1965
1966 owait.mem = mem; 1966 owait.mem = mem;
1967 owait.wait.flags = 0; 1967 owait.wait.flags = 0;
1968 owait.wait.func = memcg_oom_wake_function; 1968 owait.wait.func = memcg_oom_wake_function;
1969 owait.wait.private = current; 1969 owait.wait.private = current;
1970 INIT_LIST_HEAD(&owait.wait.task_list); 1970 INIT_LIST_HEAD(&owait.wait.task_list);
1971 need_to_kill = true; 1971 need_to_kill = true;
1972 mem_cgroup_mark_under_oom(mem); 1972 mem_cgroup_mark_under_oom(mem);
1973 1973
1974 /* At first, try to OOM lock hierarchy under mem.*/ 1974 /* At first, try to OOM lock hierarchy under mem.*/
1975 spin_lock(&memcg_oom_lock); 1975 spin_lock(&memcg_oom_lock);
1976 locked = mem_cgroup_oom_lock(mem); 1976 locked = mem_cgroup_oom_lock(mem);
1977 /* 1977 /*
1978 * Even if signal_pending(), we can't quit charge() loop without 1978 * Even if signal_pending(), we can't quit charge() loop without
1979 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL 1979 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
1980 * under OOM is always welcomed, use TASK_KILLABLE here. 1980 * under OOM is always welcomed, use TASK_KILLABLE here.
1981 */ 1981 */
1982 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1982 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1983 if (!locked || mem->oom_kill_disable) 1983 if (!locked || mem->oom_kill_disable)
1984 need_to_kill = false; 1984 need_to_kill = false;
1985 if (locked) 1985 if (locked)
1986 mem_cgroup_oom_notify(mem); 1986 mem_cgroup_oom_notify(mem);
1987 spin_unlock(&memcg_oom_lock); 1987 spin_unlock(&memcg_oom_lock);
1988 1988
1989 if (need_to_kill) { 1989 if (need_to_kill) {
1990 finish_wait(&memcg_oom_waitq, &owait.wait); 1990 finish_wait(&memcg_oom_waitq, &owait.wait);
1991 mem_cgroup_out_of_memory(mem, mask); 1991 mem_cgroup_out_of_memory(mem, mask);
1992 } else { 1992 } else {
1993 schedule(); 1993 schedule();
1994 finish_wait(&memcg_oom_waitq, &owait.wait); 1994 finish_wait(&memcg_oom_waitq, &owait.wait);
1995 } 1995 }
1996 spin_lock(&memcg_oom_lock); 1996 spin_lock(&memcg_oom_lock);
1997 if (locked) 1997 if (locked)
1998 mem_cgroup_oom_unlock(mem); 1998 mem_cgroup_oom_unlock(mem);
1999 memcg_wakeup_oom(mem); 1999 memcg_wakeup_oom(mem);
2000 spin_unlock(&memcg_oom_lock); 2000 spin_unlock(&memcg_oom_lock);
2001 2001
2002 mem_cgroup_unmark_under_oom(mem); 2002 mem_cgroup_unmark_under_oom(mem);
2003 2003
2004 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 2004 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
2005 return false; 2005 return false;
2006 /* Give chance to dying process */ 2006 /* Give chance to dying process */
2007 schedule_timeout(1); 2007 schedule_timeout(1);
2008 return true; 2008 return true;
2009 } 2009 }
2010 2010
2011 /* 2011 /*
2012 * Currently used to update mapped file statistics, but the routine can be 2012 * Currently used to update mapped file statistics, but the routine can be
2013 * generalized to update other statistics as well. 2013 * generalized to update other statistics as well.
2014 * 2014 *
2015 * Notes: Race condition 2015 * Notes: Race condition
2016 * 2016 *
2017 * We usually use page_cgroup_lock() for accessing page_cgroup member but 2017 * We usually use page_cgroup_lock() for accessing page_cgroup member but
2018 * it tends to be costly. But considering some conditions, we don't need 2018 * it tends to be costly. But considering some conditions, we don't need
2019 * to do so _always_. 2019 * to do so _always_.
2020 * 2020 *
2021 * Considering "charge", lock_page_cgroup() is not required because all 2021 * Considering "charge", lock_page_cgroup() is not required because all
2022 * file-stat operations happen after a page is attached to the radix-tree. There 2022 * file-stat operations happen after a page is attached to the radix-tree. There
2023 * is no race with "charge". 2023 * is no race with "charge".
2024 * 2024 *
2025 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup 2025 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
2026 * at "uncharge" intentionally. So, we always see a valid pc->mem_cgroup even 2026 * at "uncharge" intentionally. So, we always see a valid pc->mem_cgroup even
2027 * if there is a race with "uncharge". The statistics themselves are properly 2027 * if there is a race with "uncharge". The statistics themselves are properly
2028 * handled by the flags. 2028 * handled by the flags.
2029 * 2029 *
2030 * Considering "move", this is the only case where we see a race. To make the 2030 * Considering "move", this is the only case where we see a race. To make the
2031 * race window small, we check the MEM_CGROUP_ON_MOVE percpu value and detect 2031 * race window small, we check the MEM_CGROUP_ON_MOVE percpu value and detect
2032 * the possibility of a race condition. If there is one, we take a lock. 2032 * the possibility of a race condition. If there is one, we take a lock.
2033 */ 2033 */
2034 2034
2035 void mem_cgroup_update_page_stat(struct page *page, 2035 void mem_cgroup_update_page_stat(struct page *page,
2036 enum mem_cgroup_page_stat_item idx, int val) 2036 enum mem_cgroup_page_stat_item idx, int val)
2037 { 2037 {
2038 struct mem_cgroup *mem; 2038 struct mem_cgroup *mem;
2039 struct page_cgroup *pc = lookup_page_cgroup(page); 2039 struct page_cgroup *pc = lookup_page_cgroup(page);
2040 bool need_unlock = false; 2040 bool need_unlock = false;
2041 unsigned long uninitialized_var(flags); 2041 unsigned long uninitialized_var(flags);
2042 2042
2043 if (unlikely(!pc)) 2043 if (unlikely(!pc))
2044 return; 2044 return;
2045 2045
2046 rcu_read_lock(); 2046 rcu_read_lock();
2047 mem = pc->mem_cgroup; 2047 mem = pc->mem_cgroup;
2048 if (unlikely(!mem || !PageCgroupUsed(pc))) 2048 if (unlikely(!mem || !PageCgroupUsed(pc)))
2049 goto out; 2049 goto out;
2050 /* pc->mem_cgroup is unstable ? */ 2050 /* pc->mem_cgroup is unstable ? */
2051 if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { 2051 if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
2052 /* take a lock so that we can access pc->mem_cgroup safely */ 2052 /* take a lock so that we can access pc->mem_cgroup safely */
2053 move_lock_page_cgroup(pc, &flags); 2053 move_lock_page_cgroup(pc, &flags);
2054 need_unlock = true; 2054 need_unlock = true;
2055 mem = pc->mem_cgroup; 2055 mem = pc->mem_cgroup;
2056 if (!mem || !PageCgroupUsed(pc)) 2056 if (!mem || !PageCgroupUsed(pc))
2057 goto out; 2057 goto out;
2058 } 2058 }
2059 2059
2060 switch (idx) { 2060 switch (idx) {
2061 case MEMCG_NR_FILE_MAPPED: 2061 case MEMCG_NR_FILE_MAPPED:
2062 if (val > 0) 2062 if (val > 0)
2063 SetPageCgroupFileMapped(pc); 2063 SetPageCgroupFileMapped(pc);
2064 else if (!page_mapped(page)) 2064 else if (!page_mapped(page))
2065 ClearPageCgroupFileMapped(pc); 2065 ClearPageCgroupFileMapped(pc);
2066 idx = MEM_CGROUP_STAT_FILE_MAPPED; 2066 idx = MEM_CGROUP_STAT_FILE_MAPPED;
2067 break; 2067 break;
2068 default: 2068 default:
2069 BUG(); 2069 BUG();
2070 } 2070 }
2071 2071
2072 this_cpu_add(mem->stat->count[idx], val); 2072 this_cpu_add(mem->stat->count[idx], val);
2073 2073
2074 out: 2074 out:
2075 if (unlikely(need_unlock)) 2075 if (unlikely(need_unlock))
2076 move_unlock_page_cgroup(pc, &flags); 2076 move_unlock_page_cgroup(pc, &flags);
2077 rcu_read_unlock(); 2077 rcu_read_unlock();
2078 return; 2078 return;
2079 } 2079 }
2080 EXPORT_SYMBOL(mem_cgroup_update_page_stat); 2080 EXPORT_SYMBOL(mem_cgroup_update_page_stat);
2081 2081
2082 /* 2082 /*
2083 * Size of the first charge trial. "32" comes from vmscan.c's magic value. 2083 * Size of the first charge trial. "32" comes from vmscan.c's magic value.
2084 * TODO: it may be necessary to use bigger numbers on big iron. 2084 * TODO: it may be necessary to use bigger numbers on big iron.
2085 */ 2085 */
2086 #define CHARGE_BATCH 32U 2086 #define CHARGE_BATCH 32U
2087 struct memcg_stock_pcp { 2087 struct memcg_stock_pcp {
2088 struct mem_cgroup *cached; /* this is never the root cgroup */ 2088 struct mem_cgroup *cached; /* this is never the root cgroup */
2089 unsigned int nr_pages; 2089 unsigned int nr_pages;
2090 struct work_struct work; 2090 struct work_struct work;
2091 unsigned long flags; 2091 unsigned long flags;
2092 #define FLUSHING_CACHED_CHARGE (0) 2092 #define FLUSHING_CACHED_CHARGE (0)
2093 }; 2093 };
2094 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2094 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2095 static DEFINE_MUTEX(percpu_charge_mutex);
2096 2095
2097 /* 2096 /*
2098 * Try to consume stocked charge on this cpu. On success, one page is consumed 2097 * Try to consume stocked charge on this cpu. On success, one page is consumed
2099 * from the local stock and true is returned. If the stock is 0 or holds charges 2098 * from the local stock and true is returned. If the stock is 0 or holds charges
2100 * from a cgroup which is not the current target, false is returned. This stock 2099 * from a cgroup which is not the current target, false is returned. This stock
2101 * will be refilled. 2100 * will be refilled.
2102 */ 2101 */
2103 static bool consume_stock(struct mem_cgroup *mem) 2102 static bool consume_stock(struct mem_cgroup *mem)
2104 { 2103 {
2105 struct memcg_stock_pcp *stock; 2104 struct memcg_stock_pcp *stock;
2106 bool ret = true; 2105 bool ret = true;
2107 2106
2108 stock = &get_cpu_var(memcg_stock); 2107 stock = &get_cpu_var(memcg_stock);
2109 if (mem == stock->cached && stock->nr_pages) 2108 if (mem == stock->cached && stock->nr_pages)
2110 stock->nr_pages--; 2109 stock->nr_pages--;
2111 else /* need to call res_counter_charge */ 2110 else /* need to call res_counter_charge */
2112 ret = false; 2111 ret = false;
2113 put_cpu_var(memcg_stock); 2112 put_cpu_var(memcg_stock);
2114 return ret; 2113 return ret;
2115 } 2114 }
2116 2115
2117 /* 2116 /*
2118 * Return the stock cached in the percpu area to the res_counter and reset the cached information. 2117 * Return the stock cached in the percpu area to the res_counter and reset the cached information.
2119 */ 2118 */
2120 static void drain_stock(struct memcg_stock_pcp *stock) 2119 static void drain_stock(struct memcg_stock_pcp *stock)
2121 { 2120 {
2122 struct mem_cgroup *old = stock->cached; 2121 struct mem_cgroup *old = stock->cached;
2123 2122
2124 if (stock->nr_pages) { 2123 if (stock->nr_pages) {
2125 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2124 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2126 2125
2127 res_counter_uncharge(&old->res, bytes); 2126 res_counter_uncharge(&old->res, bytes);
2128 if (do_swap_account) 2127 if (do_swap_account)
2129 res_counter_uncharge(&old->memsw, bytes); 2128 res_counter_uncharge(&old->memsw, bytes);
2130 stock->nr_pages = 0; 2129 stock->nr_pages = 0;
2131 } 2130 }
2132 stock->cached = NULL; 2131 stock->cached = NULL;
2133 } 2132 }
2134 2133
2135 /* 2134 /*
2136 * This must be called with preemption disabled, or by 2135 * This must be called with preemption disabled, or by
2137 * a thread which is pinned to the local cpu. 2136 * a thread which is pinned to the local cpu.
2138 */ 2137 */
2139 static void drain_local_stock(struct work_struct *dummy) 2138 static void drain_local_stock(struct work_struct *dummy)
2140 { 2139 {
2141 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 2140 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
2142 drain_stock(stock); 2141 drain_stock(stock);
2143 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2142 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2144 } 2143 }
2145 2144
2146 /* 2145 /*
2147 * Cache charges (nr_pages) obtained from the res_counter in the local per-cpu area. 2146 * Cache charges (nr_pages) obtained from the res_counter in the local per-cpu area.
2148 * They will be consumed by the consume_stock() function later. 2147 * They will be consumed by the consume_stock() function later.
2149 */ 2148 */
2150 static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) 2149 static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
2151 { 2150 {
2152 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2151 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2153 2152
2154 if (stock->cached != mem) { /* reset if necessary */ 2153 if (stock->cached != mem) { /* reset if necessary */
2155 drain_stock(stock); 2154 drain_stock(stock);
2156 stock->cached = mem; 2155 stock->cached = mem;
2157 } 2156 }
2158 stock->nr_pages += nr_pages; 2157 stock->nr_pages += nr_pages;
2159 put_cpu_var(memcg_stock); 2158 put_cpu_var(memcg_stock);
2160 } 2159 }
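consume_stock(), drain_stock() and refill_stock() together form a small per-cpu cache of pre-charged pages sitting in front of the res_counter. The following is a rough, single-threaded userspace model of that flow, with a hypothetical struct counter standing in for res_counter; it is an illustration of the idea, not kernel code.

#include <stdbool.h>
#include <stdio.h>

/* stand-in for res_counter: pages charged against the group's limit */
struct counter { unsigned long charged; };

/* stand-in for one cpu's memcg_stock_pcp */
struct stock {
	struct counter *cached;		/* group the stock belongs to */
	unsigned int nr_pages;		/* pre-charged pages available */
};

/* consume one pre-charged page if the stock matches the target group */
static bool consume_stock(struct stock *s, struct counter *target)
{
	if (s->cached == target && s->nr_pages) {
		s->nr_pages--;
		return true;
	}
	return false;		/* caller must charge the counter directly */
}

/* give unused pre-charged pages back to the counter */
static void drain_stock(struct stock *s)
{
	if (s->cached && s->nr_pages) {
		s->cached->charged -= s->nr_pages;
		s->nr_pages = 0;
	}
	s->cached = NULL;
}

/* cache pages charged in one batch for later consume_stock() calls */
static void refill_stock(struct stock *s, struct counter *c, unsigned int nr)
{
	if (s->cached != c)	/* stock held another group: return it first */
		drain_stock(s);
	s->cached = c;
	s->nr_pages += nr;
}

int main(void)
{
	struct counter memcg = { 0 };
	struct stock s = { 0 };

	memcg.charged += 32;		/* batched charge, as with CHARGE_BATCH */
	refill_stock(&s, &memcg, 31);	/* keep 31 spare pages on this "cpu" */
	printf("consume: %d\n", consume_stock(&s, &memcg));	/* prints 1 */
	drain_stock(&s);
	printf("charged after drain: %lu\n", memcg.charged);	/* prints 2 */
	return 0;
}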
2161 2160
2162 /* 2161 /*
2163 * Drains all per-CPU charge caches for the given root_mem, i.e. the subtree 2162 * Drains all per-CPU charge caches for the given root_mem, i.e. the subtree
2164 * of the hierarchy under it. The sync flag says whether we should block 2163 * of the hierarchy under it. The sync flag says whether we should block
2165 * until the work is done. 2164 * until the work is done.
2166 */ 2165 */
2167 static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) 2166 static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
2168 { 2167 {
2169 int cpu, curcpu; 2168 int cpu, curcpu;
2170 2169
2171 /* Notify other cpus that system-wide "drain" is running */ 2170 /* Notify other cpus that system-wide "drain" is running */
2172 get_online_cpus(); 2171 get_online_cpus();
2173 /* 2172 /*
2174 * Get a hint for avoiding draining charges on the current cpu, 2173 * Get a hint for avoiding draining charges on the current cpu,
2175 * which must be exhausted by our charging. It is not required that 2174 * which must be exhausted by our charging. It is not required that
2176 * this be a precise check, so we use raw_smp_processor_id() instead of 2175 * this be a precise check, so we use raw_smp_processor_id() instead of
2177 * get_cpu()/put_cpu(). 2176 * get_cpu()/put_cpu().
2178 */ 2177 */
2179 curcpu = raw_smp_processor_id(); 2178 curcpu = raw_smp_processor_id();
2180 for_each_online_cpu(cpu) { 2179 for_each_online_cpu(cpu) {
2181 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2180 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2182 struct mem_cgroup *mem; 2181 struct mem_cgroup *mem;
2183 2182
2184 mem = stock->cached; 2183 mem = stock->cached;
2185 if (!mem || !stock->nr_pages) 2184 if (!mem || !stock->nr_pages)
2186 continue; 2185 continue;
2187 if (!mem_cgroup_same_or_subtree(root_mem, mem)) 2186 if (!mem_cgroup_same_or_subtree(root_mem, mem))
2188 continue; 2187 continue;
2189 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2188 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2190 if (cpu == curcpu) 2189 if (cpu == curcpu)
2191 drain_local_stock(&stock->work); 2190 drain_local_stock(&stock->work);
2192 else 2191 else
2193 schedule_work_on(cpu, &stock->work); 2192 schedule_work_on(cpu, &stock->work);
2194 } 2193 }
2195 } 2194 }
2196 2195
2197 if (!sync) 2196 if (!sync)
2198 goto out; 2197 goto out;
2199 2198
2200 for_each_online_cpu(cpu) { 2199 for_each_online_cpu(cpu) {
2201 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2200 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2202 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) 2201 if (mem_cgroup_same_or_subtree(root_mem, stock->cached) &&
2202 test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2203 flush_work(&stock->work); 2203 flush_work(&stock->work);
2204 } 2204 }
2205 out: 2205 out:
2206 put_online_cpus(); 2206 put_online_cpus();
2207 } 2207 }
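With percpu_charge_mutex gone, the only things preventing redundant drain work are the nr_pages and subtree checks above plus the atomic test-and-set of FLUSHING_CACHED_CHARGE. The sketch below models that guard with C11 atomics in userspace, to show that at most one work item is queued per cpu even when draining is requested repeatedly; the names and the scheduling stub are illustrative assumptions, not the kernel implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NCPUS 4

/* per-"cpu" stock; the atomic flag models FLUSHING_CACHED_CHARGE */
struct stock {
	unsigned int nr_pages;
	atomic_bool flushing;
};

static struct stock stocks[NCPUS];
static int works_scheduled;

/* stand-in for schedule_work_on(): just count what would be queued */
static void schedule_drain(int cpu)
{
	(void)cpu;
	works_scheduled++;
}

/*
 * Ask every cpu with a non-empty stock to drain. The atomic exchange on
 * the flushing flag plays the role of test_and_set_bit(): at most one
 * drain work item can be pending per cpu, no matter how many callers
 * request a drain, which is why no global mutex is needed.
 */
static void drain_all(void)
{
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		struct stock *s = &stocks[cpu];

		if (!s->nr_pages)
			continue;	/* nothing cached, don't queue work */
		if (!atomic_exchange(&s->flushing, true))
			schedule_drain(cpu);
	}
}

/* what the per-cpu work would do: drain the stock, then clear the flag */
static void drain_local(int cpu)
{
	stocks[cpu].nr_pages = 0;
	atomic_store(&stocks[cpu].flushing, false);
}

int main(void)
{
	stocks[1].nr_pages = 5;
	stocks[3].nr_pages = 7;

	drain_all();
	drain_all();	/* second request: the flags are already set */
	printf("works scheduled: %d\n", works_scheduled);	/* prints 2, not 4 */

	drain_local(1);
	drain_local(3);
	return 0;
}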
2208 2208
2209 /* 2209 /*
2210 * Tries to drain stocked charges on other cpus. This function is asynchronous 2210 * Tries to drain stocked charges on other cpus. This function is asynchronous
2211 * and just puts a work item per cpu for draining locally on each cpu. The 2211 * and just puts a work item per cpu for draining locally on each cpu. The
2212 * caller can expect some charges to be returned to the res_counter later, but 2212 * caller can expect some charges to be returned to the res_counter later, but
2213 * cannot wait for it. 2213 * cannot wait for it.
2214 */ 2214 */
2215 static void drain_all_stock_async(struct mem_cgroup *root_mem) 2215 static void drain_all_stock_async(struct mem_cgroup *root_mem)
2216 { 2216 {
2217 /*
2218 * If someone calls draining, avoid adding more kworker runs.
2219 */
2220 if (!mutex_trylock(&percpu_charge_mutex))
2221 return;
2222 drain_all_stock(root_mem, false); 2217 drain_all_stock(root_mem, false);
2223 mutex_unlock(&percpu_charge_mutex);
2224 } 2218 }
2225 2219
2226 /* This is a synchronous drain interface. */ 2220 /* This is a synchronous drain interface. */
2227 static void drain_all_stock_sync(struct mem_cgroup *root_mem) 2221 static void drain_all_stock_sync(struct mem_cgroup *root_mem)
2228 { 2222 {
2229 /* called when force_empty is called */ 2223 /* called when force_empty is called */
2230 mutex_lock(&percpu_charge_mutex);
2231 drain_all_stock(root_mem, true); 2224 drain_all_stock(root_mem, true);
2232 mutex_unlock(&percpu_charge_mutex);
2233 } 2225 }
2234 2226
2235 /* 2227 /*
2236 * This function drains the percpu counter values from a DEAD cpu and 2228 * This function drains the percpu counter values from a DEAD cpu and
2237 * moves them to the local cpu. Note that this function can be preempted. 2229 * moves them to the local cpu. Note that this function can be preempted.
2238 */ 2230 */
2239 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) 2231 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
2240 { 2232 {
2241 int i; 2233 int i;
2242 2234
2243 spin_lock(&mem->pcp_counter_lock); 2235 spin_lock(&mem->pcp_counter_lock);
2244 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { 2236 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
2245 long x = per_cpu(mem->stat->count[i], cpu); 2237 long x = per_cpu(mem->stat->count[i], cpu);
2246 2238
2247 per_cpu(mem->stat->count[i], cpu) = 0; 2239 per_cpu(mem->stat->count[i], cpu) = 0;
2248 mem->nocpu_base.count[i] += x; 2240 mem->nocpu_base.count[i] += x;
2249 } 2241 }
2250 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2242 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2251 unsigned long x = per_cpu(mem->stat->events[i], cpu); 2243 unsigned long x = per_cpu(mem->stat->events[i], cpu);
2252 2244
2253 per_cpu(mem->stat->events[i], cpu) = 0; 2245 per_cpu(mem->stat->events[i], cpu) = 0;
2254 mem->nocpu_base.events[i] += x; 2246 mem->nocpu_base.events[i] += x;
2255 } 2247 }
2256 /* need to clear ON_MOVE value, works as a kind of lock. */ 2248 /* need to clear ON_MOVE value, works as a kind of lock. */
2257 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; 2249 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
2258 spin_unlock(&mem->pcp_counter_lock); 2250 spin_unlock(&mem->pcp_counter_lock);
2259 } 2251 }
2260 2252
2261 static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) 2253 static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
2262 { 2254 {
2263 int idx = MEM_CGROUP_ON_MOVE; 2255 int idx = MEM_CGROUP_ON_MOVE;
2264 2256
2265 spin_lock(&mem->pcp_counter_lock); 2257 spin_lock(&mem->pcp_counter_lock);
2266 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; 2258 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
2267 spin_unlock(&mem->pcp_counter_lock); 2259 spin_unlock(&mem->pcp_counter_lock);
2268 } 2260 }
2269 2261
2270 static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, 2262 static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2271 unsigned long action, 2263 unsigned long action,
2272 void *hcpu) 2264 void *hcpu)
2273 { 2265 {
2274 int cpu = (unsigned long)hcpu; 2266 int cpu = (unsigned long)hcpu;
2275 struct memcg_stock_pcp *stock; 2267 struct memcg_stock_pcp *stock;
2276 struct mem_cgroup *iter; 2268 struct mem_cgroup *iter;
2277 2269
2278 if ((action == CPU_ONLINE)) { 2270 if ((action == CPU_ONLINE)) {
2279 for_each_mem_cgroup_all(iter) 2271 for_each_mem_cgroup_all(iter)
2280 synchronize_mem_cgroup_on_move(iter, cpu); 2272 synchronize_mem_cgroup_on_move(iter, cpu);
2281 return NOTIFY_OK; 2273 return NOTIFY_OK;
2282 } 2274 }
2283 2275
2284 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) 2276 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
2285 return NOTIFY_OK; 2277 return NOTIFY_OK;
2286 2278
2287 for_each_mem_cgroup_all(iter) 2279 for_each_mem_cgroup_all(iter)
2288 mem_cgroup_drain_pcp_counter(iter, cpu); 2280 mem_cgroup_drain_pcp_counter(iter, cpu);
2289 2281
2290 stock = &per_cpu(memcg_stock, cpu); 2282 stock = &per_cpu(memcg_stock, cpu);
2291 drain_stock(stock); 2283 drain_stock(stock);
2292 return NOTIFY_OK; 2284 return NOTIFY_OK;
2293 } 2285 }
2294 2286
2295 2287
2296 /* See __mem_cgroup_try_charge() for details */ 2288 /* See __mem_cgroup_try_charge() for details */
2297 enum { 2289 enum {
2298 CHARGE_OK, /* success */ 2290 CHARGE_OK, /* success */
2299 CHARGE_RETRY, /* need to retry but retry is not bad */ 2291 CHARGE_RETRY, /* need to retry but retry is not bad */
2300 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ 2292 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
2301 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and not enough res. */ 2293 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and not enough res. */
2302 CHARGE_OOM_DIE, /* the current is killed because of OOM */ 2294 CHARGE_OOM_DIE, /* the current is killed because of OOM */
2303 }; 2295 };
2304 2296
2305 static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, 2297 static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
2306 unsigned int nr_pages, bool oom_check) 2298 unsigned int nr_pages, bool oom_check)
2307 { 2299 {
2308 unsigned long csize = nr_pages * PAGE_SIZE; 2300 unsigned long csize = nr_pages * PAGE_SIZE;
2309 struct mem_cgroup *mem_over_limit; 2301 struct mem_cgroup *mem_over_limit;
2310 struct res_counter *fail_res; 2302 struct res_counter *fail_res;
2311 unsigned long flags = 0; 2303 unsigned long flags = 0;
2312 int ret; 2304 int ret;
2313 2305
2314 ret = res_counter_charge(&mem->res, csize, &fail_res); 2306 ret = res_counter_charge(&mem->res, csize, &fail_res);
2315 2307
2316 if (likely(!ret)) { 2308 if (likely(!ret)) {
2317 if (!do_swap_account) 2309 if (!do_swap_account)
2318 return CHARGE_OK; 2310 return CHARGE_OK;
2319 ret = res_counter_charge(&mem->memsw, csize, &fail_res); 2311 ret = res_counter_charge(&mem->memsw, csize, &fail_res);
2320 if (likely(!ret)) 2312 if (likely(!ret))
2321 return CHARGE_OK; 2313 return CHARGE_OK;
2322 2314
2323 res_counter_uncharge(&mem->res, csize); 2315 res_counter_uncharge(&mem->res, csize);
2324 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2316 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2325 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2317 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2326 } else 2318 } else
2327 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2319 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2328 /* 2320 /*
2329 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch 2321 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
2330 * of regular pages (CHARGE_BATCH), or a single regular page (1). 2322 * of regular pages (CHARGE_BATCH), or a single regular page (1).
2331 * 2323 *
2332 * Never reclaim on behalf of optional batching, retry with a 2324 * Never reclaim on behalf of optional batching, retry with a
2333 * single page instead. 2325 * single page instead.
2334 */ 2326 */
2335 if (nr_pages == CHARGE_BATCH) 2327 if (nr_pages == CHARGE_BATCH)
2336 return CHARGE_RETRY; 2328 return CHARGE_RETRY;
2337 2329
2338 if (!(gfp_mask & __GFP_WAIT)) 2330 if (!(gfp_mask & __GFP_WAIT))
2339 return CHARGE_WOULDBLOCK; 2331 return CHARGE_WOULDBLOCK;
2340 2332
2341 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 2333 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
2342 gfp_mask, flags, NULL); 2334 gfp_mask, flags, NULL);
2343 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2335 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2344 return CHARGE_RETRY; 2336 return CHARGE_RETRY;
2345 /* 2337 /*
2346 * Even though the limit is exceeded at this point, reclaim 2338 * Even though the limit is exceeded at this point, reclaim
2347 * may have been able to free some pages. Retry the charge 2339 * may have been able to free some pages. Retry the charge
2348 * before killing the task. 2340 * before killing the task.
2349 * 2341 *
2350 * Only for regular pages, though: huge pages are rather 2342 * Only for regular pages, though: huge pages are rather
2351 * unlikely to succeed so close to the limit, and we fall back 2343 * unlikely to succeed so close to the limit, and we fall back
2352 * to regular pages anyway in case of failure. 2344 * to regular pages anyway in case of failure.
2353 */ 2345 */
2354 if (nr_pages == 1 && ret) 2346 if (nr_pages == 1 && ret)
2355 return CHARGE_RETRY; 2347 return CHARGE_RETRY;
2356 2348
2357 /* 2349 /*
2358 * At task move, charge accounts can be doubly counted. So, it's 2350 * At task move, charge accounts can be doubly counted. So, it's
2359 * better to wait until the end of task_move if something is going on. 2351 * better to wait until the end of task_move if something is going on.
2360 */ 2352 */
2361 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2353 if (mem_cgroup_wait_acct_move(mem_over_limit))
2362 return CHARGE_RETRY; 2354 return CHARGE_RETRY;
2363 2355
2364 /* If we don't need to call the oom-killer at all, return immediately */ 2356 /* If we don't need to call the oom-killer at all, return immediately */
2365 if (!oom_check) 2357 if (!oom_check)
2366 return CHARGE_NOMEM; 2358 return CHARGE_NOMEM;
2367 /* check OOM */ 2359 /* check OOM */
2368 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) 2360 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
2369 return CHARGE_OOM_DIE; 2361 return CHARGE_OOM_DIE;
2370 2362
2371 return CHARGE_RETRY; 2363 return CHARGE_RETRY;
2372 } 2364 }
2373 2365
2374 /* 2366 /*
2375 * Unlike exported interface, "oom" parameter is added. if oom==true, 2367 * Unlike exported interface, "oom" parameter is added. if oom==true,
2376 * oom-killer can be invoked. 2368 * oom-killer can be invoked.
2377 */ 2369 */
2378 static int __mem_cgroup_try_charge(struct mm_struct *mm, 2370 static int __mem_cgroup_try_charge(struct mm_struct *mm,
2379 gfp_t gfp_mask, 2371 gfp_t gfp_mask,
2380 unsigned int nr_pages, 2372 unsigned int nr_pages,
2381 struct mem_cgroup **memcg, 2373 struct mem_cgroup **memcg,
2382 bool oom) 2374 bool oom)
2383 { 2375 {
2384 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2376 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2385 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2377 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2386 struct mem_cgroup *mem = NULL; 2378 struct mem_cgroup *mem = NULL;
2387 int ret; 2379 int ret;
2388 2380
2389 /* 2381 /*
2390 * Unlike the global VM's OOM-kill, we're not in a system-level memory 2382 * Unlike the global VM's OOM-kill, we're not in a system-level memory
2391 * shortage. So, allow a dying process to go ahead, in addition to 2383 * shortage. So, allow a dying process to go ahead, in addition to
2392 * MEMDIE processes. 2384 * MEMDIE processes.
2393 */ 2385 */
2394 if (unlikely(test_thread_flag(TIF_MEMDIE) 2386 if (unlikely(test_thread_flag(TIF_MEMDIE)
2395 || fatal_signal_pending(current))) 2387 || fatal_signal_pending(current)))
2396 goto bypass; 2388 goto bypass;
2397 2389
2398 /* 2390 /*
2399 * We always charge the cgroup the mm_struct belongs to. 2391 * We always charge the cgroup the mm_struct belongs to.
2400 * The mm_struct's mem_cgroup changes on task migration if the 2392 * The mm_struct's mem_cgroup changes on task migration if the
2401 * thread group leader migrates. It's possible that mm is not 2393 * thread group leader migrates. It's possible that mm is not
2402 * set, if so charge the init_mm (happens for pagecache usage). 2394 * set, if so charge the init_mm (happens for pagecache usage).
2403 */ 2395 */
2404 if (!*memcg && !mm) 2396 if (!*memcg && !mm)
2405 goto bypass; 2397 goto bypass;
2406 again: 2398 again:
2407 if (*memcg) { /* css should be a valid one */ 2399 if (*memcg) { /* css should be a valid one */
2408 mem = *memcg; 2400 mem = *memcg;
2409 VM_BUG_ON(css_is_removed(&mem->css)); 2401 VM_BUG_ON(css_is_removed(&mem->css));
2410 if (mem_cgroup_is_root(mem)) 2402 if (mem_cgroup_is_root(mem))
2411 goto done; 2403 goto done;
2412 if (nr_pages == 1 && consume_stock(mem)) 2404 if (nr_pages == 1 && consume_stock(mem))
2413 goto done; 2405 goto done;
2414 css_get(&mem->css); 2406 css_get(&mem->css);
2415 } else { 2407 } else {
2416 struct task_struct *p; 2408 struct task_struct *p;
2417 2409
2418 rcu_read_lock(); 2410 rcu_read_lock();
2419 p = rcu_dereference(mm->owner); 2411 p = rcu_dereference(mm->owner);
2420 /* 2412 /*
2421 * Because we don't have task_lock(), "p" can exit. 2413 * Because we don't have task_lock(), "p" can exit.
2422 * In that case, "mem" can point to root or p can be NULL due to a 2414 * In that case, "mem" can point to root or p can be NULL due to a
2423 * race with swapoff. Then, we have a small risk of mis-accounting. 2415 * race with swapoff. Then, we have a small risk of mis-accounting.
2424 * But this kind of mis-accounting by race always happens because 2416 * But this kind of mis-accounting by race always happens because
2425 * we don't have cgroup_mutex(). Taking it would be overkill, so we allow that 2417 * we don't have cgroup_mutex(). Taking it would be overkill, so we allow that
2426 * small race, here. 2418 * small race, here.
2427 * (*) swapoff et al. will charge against the mm_struct, not against 2419 * (*) swapoff et al. will charge against the mm_struct, not against
2428 * the task_struct. So, mm->owner can be NULL. 2420 * the task_struct. So, mm->owner can be NULL.
2429 */ 2421 */
2430 mem = mem_cgroup_from_task(p); 2422 mem = mem_cgroup_from_task(p);
2431 if (!mem || mem_cgroup_is_root(mem)) { 2423 if (!mem || mem_cgroup_is_root(mem)) {
2432 rcu_read_unlock(); 2424 rcu_read_unlock();
2433 goto done; 2425 goto done;
2434 } 2426 }
2435 if (nr_pages == 1 && consume_stock(mem)) { 2427 if (nr_pages == 1 && consume_stock(mem)) {
2436 /* 2428 /*
2437 * It seems dangerous to access memcg without css_get(). 2429 * It seems dangerous to access memcg without css_get().
2438 * But considering how consume_stock works, it's not 2430 * But considering how consume_stock works, it's not
2439 * necessary. If consume_stock succeeds, some charges 2431 * necessary. If consume_stock succeeds, some charges
2440 * from this memcg are cached on this cpu. So, we 2432 * from this memcg are cached on this cpu. So, we
2441 * don't need to call css_get()/css_tryget() before 2433 * don't need to call css_get()/css_tryget() before
2442 * calling consume_stock(). 2434 * calling consume_stock().
2443 */ 2435 */
2444 rcu_read_unlock(); 2436 rcu_read_unlock();
2445 goto done; 2437 goto done;
2446 } 2438 }
2447 /* after here, we may be blocked. we need to get refcnt */ 2439 /* after here, we may be blocked. we need to get refcnt */
2448 if (!css_tryget(&mem->css)) { 2440 if (!css_tryget(&mem->css)) {
2449 rcu_read_unlock(); 2441 rcu_read_unlock();
2450 goto again; 2442 goto again;
2451 } 2443 }
2452 rcu_read_unlock(); 2444 rcu_read_unlock();
2453 } 2445 }
2454 2446
2455 do { 2447 do {
2456 bool oom_check; 2448 bool oom_check;
2457 2449
2458 /* If killed, bypass charge */ 2450 /* If killed, bypass charge */
2459 if (fatal_signal_pending(current)) { 2451 if (fatal_signal_pending(current)) {
2460 css_put(&mem->css); 2452 css_put(&mem->css);
2461 goto bypass; 2453 goto bypass;
2462 } 2454 }
2463 2455
2464 oom_check = false; 2456 oom_check = false;
2465 if (oom && !nr_oom_retries) { 2457 if (oom && !nr_oom_retries) {
2466 oom_check = true; 2458 oom_check = true;
2467 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2459 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2468 } 2460 }
2469 2461
2470 ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); 2462 ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check);
2471 switch (ret) { 2463 switch (ret) {
2472 case CHARGE_OK: 2464 case CHARGE_OK:
2473 break; 2465 break;
2474 case CHARGE_RETRY: /* not in OOM situation but retry */ 2466 case CHARGE_RETRY: /* not in OOM situation but retry */
2475 batch = nr_pages; 2467 batch = nr_pages;
2476 css_put(&mem->css); 2468 css_put(&mem->css);
2477 mem = NULL; 2469 mem = NULL;
2478 goto again; 2470 goto again;
2479 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 2471 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
2480 css_put(&mem->css); 2472 css_put(&mem->css);
2481 goto nomem; 2473 goto nomem;
2482 case CHARGE_NOMEM: /* OOM routine works */ 2474 case CHARGE_NOMEM: /* OOM routine works */
2483 if (!oom) { 2475 if (!oom) {
2484 css_put(&mem->css); 2476 css_put(&mem->css);
2485 goto nomem; 2477 goto nomem;
2486 } 2478 }
2487 /* If oom, we never return -ENOMEM */ 2479 /* If oom, we never return -ENOMEM */
2488 nr_oom_retries--; 2480 nr_oom_retries--;
2489 break; 2481 break;
2490 case CHARGE_OOM_DIE: /* Killed by OOM Killer */ 2482 case CHARGE_OOM_DIE: /* Killed by OOM Killer */
2491 css_put(&mem->css); 2483 css_put(&mem->css);
2492 goto bypass; 2484 goto bypass;
2493 } 2485 }
2494 } while (ret != CHARGE_OK); 2486 } while (ret != CHARGE_OK);
2495 2487
2496 if (batch > nr_pages) 2488 if (batch > nr_pages)
2497 refill_stock(mem, batch - nr_pages); 2489 refill_stock(mem, batch - nr_pages);
2498 css_put(&mem->css); 2490 css_put(&mem->css);
2499 done: 2491 done:
2500 *memcg = mem; 2492 *memcg = mem;
2501 return 0; 2493 return 0;
2502 nomem: 2494 nomem:
2503 *memcg = NULL; 2495 *memcg = NULL;
2504 return -ENOMEM; 2496 return -ENOMEM;
2505 bypass: 2497 bypass:
2506 *memcg = NULL; 2498 *memcg = NULL;
2507 return 0; 2499 return 0;
2508 } 2500 }
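__mem_cgroup_try_charge() is essentially a retry loop around mem_cgroup_do_charge(), driven by the CHARGE_* codes, that first asks for a whole batch and then parks the surplus in the per-cpu stock. Below is a condensed userspace sketch of that control flow; the do_charge() stub just plays back canned results and is not the kernel function, and the names are invented for the sketch.

#include <stdio.h>

enum charge_res { CHARGE_OK, CHARGE_RETRY, CHARGE_NOMEM, CHARGE_WOULDBLOCK };

/* canned results standing in for res_counter_charge()/reclaim outcomes */
static const enum charge_res script[] = { CHARGE_RETRY, CHARGE_OK, CHARGE_OK };
static int step;

static enum charge_res do_charge(unsigned int batch)
{
	printf("do_charge(batch=%u) -> %d\n", batch, script[step]);
	return script[step++];
}

/*
 * Charge nr_pages, optimistically asking for a whole batch first so the
 * surplus can be cached per-cpu; fall back to charging exactly nr_pages
 * when the batch does not fit (CHARGE_RETRY), and give up on the other
 * error codes -- the shape of __mem_cgroup_try_charge()'s do/while loop.
 */
static int try_charge(unsigned int nr_pages, unsigned int batch)
{
	enum charge_res ret;

	do {
		ret = do_charge(batch);
		switch (ret) {
		case CHARGE_OK:
			break;
		case CHARGE_RETRY:	/* retry, but only for what we need */
			batch = nr_pages;
			continue;
		case CHARGE_WOULDBLOCK:
		case CHARGE_NOMEM:
			return -1;
		}
	} while (ret != CHARGE_OK);

	if (batch > nr_pages)
		printf("refill stock with %u spare pages\n", batch - nr_pages);
	return 0;
}

int main(void)
{
	try_charge(1, 32);	/* batch charge needs a retry: no surplus cached */
	try_charge(1, 32);	/* batch charge succeeds: surplus goes to the stock */
	return 0;
}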
2509 2501
2510 /* 2502 /*
2511 * Sometimes we have to undo a charge we got by try_charge(). 2503 * Sometimes we have to undo a charge we got by try_charge().
2512 * This function is for that: it does the uncharge and puts the css refcnt 2504 * This function is for that: it does the uncharge and puts the css refcnt
2513 * gotten by try_charge(). 2505 * gotten by try_charge().
2514 */ 2506 */
2515 static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, 2507 static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2516 unsigned int nr_pages) 2508 unsigned int nr_pages)
2517 { 2509 {
2518 if (!mem_cgroup_is_root(mem)) { 2510 if (!mem_cgroup_is_root(mem)) {
2519 unsigned long bytes = nr_pages * PAGE_SIZE; 2511 unsigned long bytes = nr_pages * PAGE_SIZE;
2520 2512
2521 res_counter_uncharge(&mem->res, bytes); 2513 res_counter_uncharge(&mem->res, bytes);
2522 if (do_swap_account) 2514 if (do_swap_account)
2523 res_counter_uncharge(&mem->memsw, bytes); 2515 res_counter_uncharge(&mem->memsw, bytes);
2524 } 2516 }
2525 } 2517 }
2526 2518
2527 /* 2519 /*
2528 * A helper function to get a mem_cgroup from an ID. Must be called under 2520 * A helper function to get a mem_cgroup from an ID. Must be called under
2529 * rcu_read_lock(). The caller must check css_is_removed() or similar if 2521 * rcu_read_lock(). The caller must check css_is_removed() or similar if
2530 * that is a concern. (dropping a refcnt from swap can be called against a removed 2522 * that is a concern. (dropping a refcnt from swap can be called against a removed
2531 * memcg.) 2523 * memcg.)
2532 */ 2524 */
2533 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2525 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2534 { 2526 {
2535 struct cgroup_subsys_state *css; 2527 struct cgroup_subsys_state *css;
2536 2528
2537 /* ID 0 is unused ID */ 2529 /* ID 0 is unused ID */
2538 if (!id) 2530 if (!id)
2539 return NULL; 2531 return NULL;
2540 css = css_lookup(&mem_cgroup_subsys, id); 2532 css = css_lookup(&mem_cgroup_subsys, id);
2541 if (!css) 2533 if (!css)
2542 return NULL; 2534 return NULL;
2543 return container_of(css, struct mem_cgroup, css); 2535 return container_of(css, struct mem_cgroup, css);
2544 } 2536 }
2545 2537
2546 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2538 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2547 { 2539 {
2548 struct mem_cgroup *mem = NULL; 2540 struct mem_cgroup *mem = NULL;
2549 struct page_cgroup *pc; 2541 struct page_cgroup *pc;
2550 unsigned short id; 2542 unsigned short id;
2551 swp_entry_t ent; 2543 swp_entry_t ent;
2552 2544
2553 VM_BUG_ON(!PageLocked(page)); 2545 VM_BUG_ON(!PageLocked(page));
2554 2546
2555 pc = lookup_page_cgroup(page); 2547 pc = lookup_page_cgroup(page);
2556 lock_page_cgroup(pc); 2548 lock_page_cgroup(pc);
2557 if (PageCgroupUsed(pc)) { 2549 if (PageCgroupUsed(pc)) {
2558 mem = pc->mem_cgroup; 2550 mem = pc->mem_cgroup;
2559 if (mem && !css_tryget(&mem->css)) 2551 if (mem && !css_tryget(&mem->css))
2560 mem = NULL; 2552 mem = NULL;
2561 } else if (PageSwapCache(page)) { 2553 } else if (PageSwapCache(page)) {
2562 ent.val = page_private(page); 2554 ent.val = page_private(page);
2563 id = lookup_swap_cgroup(ent); 2555 id = lookup_swap_cgroup(ent);
2564 rcu_read_lock(); 2556 rcu_read_lock();
2565 mem = mem_cgroup_lookup(id); 2557 mem = mem_cgroup_lookup(id);
2566 if (mem && !css_tryget(&mem->css)) 2558 if (mem && !css_tryget(&mem->css))
2567 mem = NULL; 2559 mem = NULL;
2568 rcu_read_unlock(); 2560 rcu_read_unlock();
2569 } 2561 }
2570 unlock_page_cgroup(pc); 2562 unlock_page_cgroup(pc);
2571 return mem; 2563 return mem;
2572 } 2564 }
2573 2565
2574 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 2566 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2575 struct page *page, 2567 struct page *page,
2576 unsigned int nr_pages, 2568 unsigned int nr_pages,
2577 struct page_cgroup *pc, 2569 struct page_cgroup *pc,
2578 enum charge_type ctype) 2570 enum charge_type ctype)
2579 { 2571 {
2580 lock_page_cgroup(pc); 2572 lock_page_cgroup(pc);
2581 if (unlikely(PageCgroupUsed(pc))) { 2573 if (unlikely(PageCgroupUsed(pc))) {
2582 unlock_page_cgroup(pc); 2574 unlock_page_cgroup(pc);
2583 __mem_cgroup_cancel_charge(mem, nr_pages); 2575 __mem_cgroup_cancel_charge(mem, nr_pages);
2584 return; 2576 return;
2585 } 2577 }
2586 /* 2578 /*
2587 * we don't need page_cgroup_lock for tail pages, because they are not 2579 * we don't need page_cgroup_lock for tail pages, because they are not
2588 * accessed by any other context at this point. 2580 * accessed by any other context at this point.
2589 */ 2581 */
2590 pc->mem_cgroup = mem; 2582 pc->mem_cgroup = mem;
2591 /* 2583 /*
2592 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2584 * We access a page_cgroup asynchronously without lock_page_cgroup().
2593 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 2585 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
2594 * is accessed after testing USED bit. To make pc->mem_cgroup visible 2586 * is accessed after testing USED bit. To make pc->mem_cgroup visible
2595 * before USED bit, we need memory barrier here. 2587 * before USED bit, we need memory barrier here.
2596 * See mem_cgroup_add_lru_list(), etc. 2588 * See mem_cgroup_add_lru_list(), etc.
2597 */ 2589 */
2598 smp_wmb(); 2590 smp_wmb();
2599 switch (ctype) { 2591 switch (ctype) {
2600 case MEM_CGROUP_CHARGE_TYPE_CACHE: 2592 case MEM_CGROUP_CHARGE_TYPE_CACHE:
2601 case MEM_CGROUP_CHARGE_TYPE_SHMEM: 2593 case MEM_CGROUP_CHARGE_TYPE_SHMEM:
2602 SetPageCgroupCache(pc); 2594 SetPageCgroupCache(pc);
2603 SetPageCgroupUsed(pc); 2595 SetPageCgroupUsed(pc);
2604 break; 2596 break;
2605 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2597 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2606 ClearPageCgroupCache(pc); 2598 ClearPageCgroupCache(pc);
2607 SetPageCgroupUsed(pc); 2599 SetPageCgroupUsed(pc);
2608 break; 2600 break;
2609 default: 2601 default:
2610 break; 2602 break;
2611 } 2603 }
2612 2604
2613 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); 2605 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
2614 unlock_page_cgroup(pc); 2606 unlock_page_cgroup(pc);
2615 /* 2607 /*
2616 * "charge_statistics" updated event counter. Then, check it. 2608 * "charge_statistics" updated event counter. Then, check it.
2617 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2609 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
2618 * if they exceeds softlimit. 2610 * if they exceeds softlimit.
2619 */ 2611 */
2620 memcg_check_events(mem, page); 2612 memcg_check_events(mem, page);
2621 } 2613 }
2622 2614
2623 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2615 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2624 2616
2625 #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ 2617 #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
2626 (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) 2618 (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
2627 /* 2619 /*
2628 * Because tail pages are not marked as "used", set it. We're under 2620 * Because tail pages are not marked as "used", set it. We're under
2629 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2621 * zone->lru_lock, 'splitting on pmd' and compound_lock.
2630 */ 2622 */
2631 void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) 2623 void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
2632 { 2624 {
2633 struct page_cgroup *head_pc = lookup_page_cgroup(head); 2625 struct page_cgroup *head_pc = lookup_page_cgroup(head);
2634 struct page_cgroup *tail_pc = lookup_page_cgroup(tail); 2626 struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
2635 unsigned long flags; 2627 unsigned long flags;
2636 2628
2637 if (mem_cgroup_disabled()) 2629 if (mem_cgroup_disabled())
2638 return; 2630 return;
2639 /* 2631 /*
2640 * We have no races with charge/uncharge but will have races with 2632 * We have no races with charge/uncharge but will have races with
2641 * page state accounting. 2633 * page state accounting.
2642 */ 2634 */
2643 move_lock_page_cgroup(head_pc, &flags); 2635 move_lock_page_cgroup(head_pc, &flags);
2644 2636
2645 tail_pc->mem_cgroup = head_pc->mem_cgroup; 2637 tail_pc->mem_cgroup = head_pc->mem_cgroup;
2646 smp_wmb(); /* see __commit_charge() */ 2638 smp_wmb(); /* see __commit_charge() */
2647 if (PageCgroupAcctLRU(head_pc)) { 2639 if (PageCgroupAcctLRU(head_pc)) {
2648 enum lru_list lru; 2640 enum lru_list lru;
2649 struct mem_cgroup_per_zone *mz; 2641 struct mem_cgroup_per_zone *mz;
2650 2642
2651 /* 2643 /*
2652 * LRU flags cannot be copied because we need to add the tail 2644 * LRU flags cannot be copied because we need to add the tail
2653 * page to LRU by a generic call and our hook will be called. 2645 * page to LRU by a generic call and our hook will be called.
2654 * We hold lru_lock, so reduce the counter directly. 2646 * We hold lru_lock, so reduce the counter directly.
2655 */ 2647 */
2656 lru = page_lru(head); 2648 lru = page_lru(head);
2657 mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); 2649 mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
2658 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 2650 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
2659 } 2651 }
2660 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 2652 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2661 move_unlock_page_cgroup(head_pc, &flags); 2653 move_unlock_page_cgroup(head_pc, &flags);
2662 } 2654 }
2663 #endif 2655 #endif
2664 2656
2665 /** 2657 /**
2666 * mem_cgroup_move_account - move account of the page 2658 * mem_cgroup_move_account - move account of the page
2667 * @page: the page 2659 * @page: the page
2668 * @nr_pages: number of regular pages (>1 for huge pages) 2660 * @nr_pages: number of regular pages (>1 for huge pages)
2669 * @pc: page_cgroup of the page. 2661 * @pc: page_cgroup of the page.
2670 * @from: mem_cgroup which the page is moved from. 2662 * @from: mem_cgroup which the page is moved from.
2671 * @to: mem_cgroup which the page is moved to. @from != @to. 2663 * @to: mem_cgroup which the page is moved to. @from != @to.
2672 * @uncharge: whether we should call uncharge and css_put against @from. 2664 * @uncharge: whether we should call uncharge and css_put against @from.
2673 * 2665 *
2674 * The caller must confirm following. 2666 * The caller must confirm following.
2675 * - page is not on LRU (isolate_page() is useful.) 2667 * - page is not on LRU (isolate_page() is useful.)
2676 * - compound_lock is held when nr_pages > 1 2668 * - compound_lock is held when nr_pages > 1
2677 * 2669 *
2678 * This function doesn't do "charge" nor css_get to the new cgroup. That should be 2670 * This function doesn't do "charge" nor css_get to the new cgroup. That should be
2679 * done by the caller (__mem_cgroup_try_charge would be useful). If @uncharge is 2671 * done by the caller (__mem_cgroup_try_charge would be useful). If @uncharge is
2680 * true, this function does "uncharge" from the old cgroup, but it doesn't if 2672 * true, this function does "uncharge" from the old cgroup, but it doesn't if
2681 * @uncharge is false, so the caller should do "uncharge". 2673 * @uncharge is false, so the caller should do "uncharge".
2682 */ 2674 */
2683 static int mem_cgroup_move_account(struct page *page, 2675 static int mem_cgroup_move_account(struct page *page,
2684 unsigned int nr_pages, 2676 unsigned int nr_pages,
2685 struct page_cgroup *pc, 2677 struct page_cgroup *pc,
2686 struct mem_cgroup *from, 2678 struct mem_cgroup *from,
2687 struct mem_cgroup *to, 2679 struct mem_cgroup *to,
2688 bool uncharge) 2680 bool uncharge)
2689 { 2681 {
2690 unsigned long flags; 2682 unsigned long flags;
2691 int ret; 2683 int ret;
2692 2684
2693 VM_BUG_ON(from == to); 2685 VM_BUG_ON(from == to);
2694 VM_BUG_ON(PageLRU(page)); 2686 VM_BUG_ON(PageLRU(page));
2695 /* 2687 /*
2696 * The page is isolated from LRU. So, collapse function 2688 * The page is isolated from LRU. So, collapse function
2697 * will not handle this page. But page splitting can happen. 2689 * will not handle this page. But page splitting can happen.
2698 * Do this check under compound_page_lock(). The caller should 2690 * Do this check under compound_page_lock(). The caller should
2699 * hold it. 2691 * hold it.
2700 */ 2692 */
2701 ret = -EBUSY; 2693 ret = -EBUSY;
2702 if (nr_pages > 1 && !PageTransHuge(page)) 2694 if (nr_pages > 1 && !PageTransHuge(page))
2703 goto out; 2695 goto out;
2704 2696
2705 lock_page_cgroup(pc); 2697 lock_page_cgroup(pc);
2706 2698
2707 ret = -EINVAL; 2699 ret = -EINVAL;
2708 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 2700 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2709 goto unlock; 2701 goto unlock;
2710 2702
2711 move_lock_page_cgroup(pc, &flags); 2703 move_lock_page_cgroup(pc, &flags);
2712 2704
2713 if (PageCgroupFileMapped(pc)) { 2705 if (PageCgroupFileMapped(pc)) {
2714 /* Update mapped_file data for mem_cgroup */ 2706 /* Update mapped_file data for mem_cgroup */
2715 preempt_disable(); 2707 preempt_disable();
2716 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2708 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2717 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2709 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2718 preempt_enable(); 2710 preempt_enable();
2719 } 2711 }
2720 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); 2712 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
2721 if (uncharge) 2713 if (uncharge)
2722 /* This is not "cancel", but cancel_charge does all we need. */ 2714 /* This is not "cancel", but cancel_charge does all we need. */
2723 __mem_cgroup_cancel_charge(from, nr_pages); 2715 __mem_cgroup_cancel_charge(from, nr_pages);
2724 2716
2725 /* caller should have done css_get */ 2717 /* caller should have done css_get */
2726 pc->mem_cgroup = to; 2718 pc->mem_cgroup = to;
2727 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); 2719 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
2728 /* 2720 /*
2729 * We charge against "to" which may not have any tasks. Then, "to" 2721 * We charge against "to" which may not have any tasks. Then, "to"
2730 * can be under rmdir(). But in current implementation, caller of 2722 * can be under rmdir(). But in current implementation, caller of
2731 * this function is just force_empty() and move charge, so it's 2723 * this function is just force_empty() and move charge, so it's
2732 * guaranteed that "to" is never removed. So, we don't check rmdir 2724 * guaranteed that "to" is never removed. So, we don't check rmdir
2733 * status here. 2725 * status here.
2734 */ 2726 */
2735 move_unlock_page_cgroup(pc, &flags); 2727 move_unlock_page_cgroup(pc, &flags);
2736 ret = 0; 2728 ret = 0;
2737 unlock: 2729 unlock:
2738 unlock_page_cgroup(pc); 2730 unlock_page_cgroup(pc);
2739 /* 2731 /*
2740 * check events 2732 * check events
2741 */ 2733 */
2742 memcg_check_events(to, page); 2734 memcg_check_events(to, page);
2743 memcg_check_events(from, page); 2735 memcg_check_events(from, page);
2744 out: 2736 out:
2745 return ret; 2737 return ret;
2746 } 2738 }
2747 2739
2748 /* 2740 /*
2749 * move charges to its parent. 2741 * move charges to its parent.
2750 */ 2742 */
2751 2743
2752 static int mem_cgroup_move_parent(struct page *page, 2744 static int mem_cgroup_move_parent(struct page *page,
2753 struct page_cgroup *pc, 2745 struct page_cgroup *pc,
2754 struct mem_cgroup *child, 2746 struct mem_cgroup *child,
2755 gfp_t gfp_mask) 2747 gfp_t gfp_mask)
2756 { 2748 {
2757 struct cgroup *cg = child->css.cgroup; 2749 struct cgroup *cg = child->css.cgroup;
2758 struct cgroup *pcg = cg->parent; 2750 struct cgroup *pcg = cg->parent;
2759 struct mem_cgroup *parent; 2751 struct mem_cgroup *parent;
2760 unsigned int nr_pages; 2752 unsigned int nr_pages;
2761 unsigned long uninitialized_var(flags); 2753 unsigned long uninitialized_var(flags);
2762 int ret; 2754 int ret;
2763 2755
2764 /* Is ROOT ? */ 2756 /* Is ROOT ? */
2765 if (!pcg) 2757 if (!pcg)
2766 return -EINVAL; 2758 return -EINVAL;
2767 2759
2768 ret = -EBUSY; 2760 ret = -EBUSY;
2769 if (!get_page_unless_zero(page)) 2761 if (!get_page_unless_zero(page))
2770 goto out; 2762 goto out;
2771 if (isolate_lru_page(page)) 2763 if (isolate_lru_page(page))
2772 goto put; 2764 goto put;
2773 2765
2774 nr_pages = hpage_nr_pages(page); 2766 nr_pages = hpage_nr_pages(page);
2775 2767
2776 parent = mem_cgroup_from_cont(pcg); 2768 parent = mem_cgroup_from_cont(pcg);
2777 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); 2769 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
2778 if (ret || !parent) 2770 if (ret || !parent)
2779 goto put_back; 2771 goto put_back;
2780 2772
2781 if (nr_pages > 1) 2773 if (nr_pages > 1)
2782 flags = compound_lock_irqsave(page); 2774 flags = compound_lock_irqsave(page);
2783 2775
2784 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); 2776 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
2785 if (ret) 2777 if (ret)
2786 __mem_cgroup_cancel_charge(parent, nr_pages); 2778 __mem_cgroup_cancel_charge(parent, nr_pages);
2787 2779
2788 if (nr_pages > 1) 2780 if (nr_pages > 1)
2789 compound_unlock_irqrestore(page, flags); 2781 compound_unlock_irqrestore(page, flags);
2790 put_back: 2782 put_back:
2791 putback_lru_page(page); 2783 putback_lru_page(page);
2792 put: 2784 put:
2793 put_page(page); 2785 put_page(page);
2794 out: 2786 out:
2795 return ret; 2787 return ret;
2796 } 2788 }
2797 2789
2798 /* 2790 /*
2799 * Charge the memory controller for page usage. 2791 * Charge the memory controller for page usage.
2800 * Return 2792 * Return
2801 * 0 if the charge was successful 2793 * 0 if the charge was successful
2802 * < 0 if the cgroup is over its limit 2794 * < 0 if the cgroup is over its limit
2803 */ 2795 */
2804 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 2796 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2805 gfp_t gfp_mask, enum charge_type ctype) 2797 gfp_t gfp_mask, enum charge_type ctype)
2806 { 2798 {
2807 struct mem_cgroup *mem = NULL; 2799 struct mem_cgroup *mem = NULL;
2808 unsigned int nr_pages = 1; 2800 unsigned int nr_pages = 1;
2809 struct page_cgroup *pc; 2801 struct page_cgroup *pc;
2810 bool oom = true; 2802 bool oom = true;
2811 int ret; 2803 int ret;
2812 2804
2813 if (PageTransHuge(page)) { 2805 if (PageTransHuge(page)) {
2814 nr_pages <<= compound_order(page); 2806 nr_pages <<= compound_order(page);
2815 VM_BUG_ON(!PageTransHuge(page)); 2807 VM_BUG_ON(!PageTransHuge(page));
2816 /* 2808 /*
2817 * Never OOM-kill a process for a huge page. The 2809 * Never OOM-kill a process for a huge page. The
2818 * fault handler will fall back to regular pages. 2810 * fault handler will fall back to regular pages.
2819 */ 2811 */
2820 oom = false; 2812 oom = false;
2821 } 2813 }
2822 2814
2823 pc = lookup_page_cgroup(page); 2815 pc = lookup_page_cgroup(page);
2824 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ 2816 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
2825 2817
2826 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); 2818 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom);
2827 if (ret || !mem) 2819 if (ret || !mem)
2828 return ret; 2820 return ret;
2829 2821
2830 __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); 2822 __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype);
2831 return 0; 2823 return 0;
2832 } 2824 }
2833 2825
2834 int mem_cgroup_newpage_charge(struct page *page, 2826 int mem_cgroup_newpage_charge(struct page *page,
2835 struct mm_struct *mm, gfp_t gfp_mask) 2827 struct mm_struct *mm, gfp_t gfp_mask)
2836 { 2828 {
2837 if (mem_cgroup_disabled()) 2829 if (mem_cgroup_disabled())
2838 return 0; 2830 return 0;
2839 /* 2831 /*
2840 * If already mapped, we don't have to account. 2832 * If already mapped, we don't have to account.
2841 * If page cache, page->mapping has address_space. 2833 * If page cache, page->mapping has address_space.
2842 * But page->mapping may hold a stale anon_vma pointer; 2834 * But page->mapping may hold a stale anon_vma pointer;
2843 * detect it by the PageAnon() check. A newly-mapped anon page's 2835 * detect it by the PageAnon() check. A newly-mapped anon page's
2844 * page->mapping is NULL. 2836 * page->mapping is NULL.
2845 */ 2837 */
2846 if (page_mapped(page) || (page->mapping && !PageAnon(page))) 2838 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
2847 return 0; 2839 return 0;
2848 if (unlikely(!mm)) 2840 if (unlikely(!mm))
2849 mm = &init_mm; 2841 mm = &init_mm;
2850 return mem_cgroup_charge_common(page, mm, gfp_mask, 2842 return mem_cgroup_charge_common(page, mm, gfp_mask,
2851 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2843 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2852 } 2844 }
2853 2845
2854 static void 2846 static void
2855 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2847 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2856 enum charge_type ctype); 2848 enum charge_type ctype);
2857 2849
2858 static void 2850 static void
2859 __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, 2851 __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
2860 enum charge_type ctype) 2852 enum charge_type ctype)
2861 { 2853 {
2862 struct page_cgroup *pc = lookup_page_cgroup(page); 2854 struct page_cgroup *pc = lookup_page_cgroup(page);
2863 /* 2855 /*
2864 * In some cases (SwapCache, FUSE's splice_buf->radixtree) the page 2856 * In some cases (SwapCache, FUSE's splice_buf->radixtree) the page
2865 * is already on the LRU, which means it may be on some other 2857 * is already on the LRU, which means it may be on some other
2866 * page_cgroup's LRU. Take care of it. 2858 * page_cgroup's LRU. Take care of it.
2867 */ 2859 */
2868 mem_cgroup_lru_del_before_commit(page); 2860 mem_cgroup_lru_del_before_commit(page);
2869 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); 2861 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
2870 mem_cgroup_lru_add_after_commit(page); 2862 mem_cgroup_lru_add_after_commit(page);
2871 return; 2863 return;
2872 } 2864 }
2873 2865
2874 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2866 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2875 gfp_t gfp_mask) 2867 gfp_t gfp_mask)
2876 { 2868 {
2877 struct mem_cgroup *mem = NULL; 2869 struct mem_cgroup *mem = NULL;
2878 int ret; 2870 int ret;
2879 2871
2880 if (mem_cgroup_disabled()) 2872 if (mem_cgroup_disabled())
2881 return 0; 2873 return 0;
2882 if (PageCompound(page)) 2874 if (PageCompound(page))
2883 return 0; 2875 return 0;
2884 /* 2876 /*
2885 * Corner case handling. This is usually called from 2877 * Corner case handling. This is usually called from
2886 * add_to_page_cache(). But some FS (shmem) precharges the page before 2878 * add_to_page_cache(). But some FS (shmem) precharges the page before
2887 * calling it and then calls add_to_page_cache() with GFP_NOWAIT. 2879 * calling it and then calls add_to_page_cache() with GFP_NOWAIT.
2888 * 2880 *
2889 * In the GFP_NOWAIT case the page may already be charged before 2881 * In the GFP_NOWAIT case the page may already be charged before
2890 * add_to_page_cache() (see shmem.c); check it here and avoid charging 2882 * add_to_page_cache() (see shmem.c); check it here and avoid charging
2891 * twice. (This works but pays a slightly larger cost.) 2883 * twice. (This works but pays a slightly larger cost.)
2892 * And when the page is SwapCache, it should take swap information 2884 * And when the page is SwapCache, it should take swap information
2893 * into account. This is under lock_page() now. 2885 * into account. This is under lock_page() now.
2894 */ 2886 */
2895 if (!(gfp_mask & __GFP_WAIT)) { 2887 if (!(gfp_mask & __GFP_WAIT)) {
2896 struct page_cgroup *pc; 2888 struct page_cgroup *pc;
2897 2889
2898 pc = lookup_page_cgroup(page); 2890 pc = lookup_page_cgroup(page);
2899 if (!pc) 2891 if (!pc)
2900 return 0; 2892 return 0;
2901 lock_page_cgroup(pc); 2893 lock_page_cgroup(pc);
2902 if (PageCgroupUsed(pc)) { 2894 if (PageCgroupUsed(pc)) {
2903 unlock_page_cgroup(pc); 2895 unlock_page_cgroup(pc);
2904 return 0; 2896 return 0;
2905 } 2897 }
2906 unlock_page_cgroup(pc); 2898 unlock_page_cgroup(pc);
2907 } 2899 }
2908 2900
2909 if (unlikely(!mm)) 2901 if (unlikely(!mm))
2910 mm = &init_mm; 2902 mm = &init_mm;
2911 2903
2912 if (page_is_file_cache(page)) { 2904 if (page_is_file_cache(page)) {
2913 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); 2905 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true);
2914 if (ret || !mem) 2906 if (ret || !mem)
2915 return ret; 2907 return ret;
2916 2908
2917 /* 2909 /*
2918 * FUSE reuses pages without going through the final 2910 * FUSE reuses pages without going through the final
2919 * put that would remove them from the LRU list, make 2911 * put that would remove them from the LRU list, make
2920 * sure that they get relinked properly. 2912 * sure that they get relinked properly.
2921 */ 2913 */
2922 __mem_cgroup_commit_charge_lrucare(page, mem, 2914 __mem_cgroup_commit_charge_lrucare(page, mem,
2923 MEM_CGROUP_CHARGE_TYPE_CACHE); 2915 MEM_CGROUP_CHARGE_TYPE_CACHE);
2924 return ret; 2916 return ret;
2925 } 2917 }
2926 /* shmem */ 2918 /* shmem */
2927 if (PageSwapCache(page)) { 2919 if (PageSwapCache(page)) {
2928 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2920 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2929 if (!ret) 2921 if (!ret)
2930 __mem_cgroup_commit_charge_swapin(page, mem, 2922 __mem_cgroup_commit_charge_swapin(page, mem,
2931 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2923 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2932 } else 2924 } else
2933 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 2925 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2934 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2926 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2935 2927
2936 return ret; 2928 return ret;
2937 } 2929 }
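/*
 * Usage sketch (illustrative; the helper below is assumed, not a real caller):
 * how the charge entry points above are used. Real callers are the anonymous
 * fault path (mem_cgroup_newpage_charge) and add_to_page_cache()
 * (mem_cgroup_cache_charge).
 */
static int example_charge_new_anon(struct page *page, struct mm_struct *mm)
{
	/* Account the page to mm's memcg: 0 on success, <0 if over the limit. */
	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
		return -ENOMEM;
	/* ... map the page; if that fails, call mem_cgroup_uncharge_page(page) ... */
	return 0;
}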
2938 2930
2939 /* 2931 /*
2940 * During swap-in (try_charge -> commit or cancel) the page is locked. 2932 * During swap-in (try_charge -> commit or cancel) the page is locked.
2941 * When try_charge() returns successfully, one refcnt to the memcg (without 2933 * When try_charge() returns successfully, one refcnt to the memcg (without
2942 * a struct page_cgroup) is acquired. This refcnt will be consumed by 2934 * a struct page_cgroup) is acquired. This refcnt will be consumed by
2943 * "commit()" or removed by "cancel()". 2935 * "commit()" or removed by "cancel()".
2944 */ 2936 */
2945 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2937 int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2946 struct page *page, 2938 struct page *page,
2947 gfp_t mask, struct mem_cgroup **ptr) 2939 gfp_t mask, struct mem_cgroup **ptr)
2948 { 2940 {
2949 struct mem_cgroup *mem; 2941 struct mem_cgroup *mem;
2950 int ret; 2942 int ret;
2951 2943
2952 *ptr = NULL; 2944 *ptr = NULL;
2953 2945
2954 if (mem_cgroup_disabled()) 2946 if (mem_cgroup_disabled())
2955 return 0; 2947 return 0;
2956 2948
2957 if (!do_swap_account) 2949 if (!do_swap_account)
2958 goto charge_cur_mm; 2950 goto charge_cur_mm;
2959 /* 2951 /*
2960 * A racing thread's fault, or swapoff, may have already updated 2952 * A racing thread's fault, or swapoff, may have already updated
2961 * the pte, and even removed page from swap cache: in those cases 2953 * the pte, and even removed page from swap cache: in those cases
2962 * do_swap_page()'s pte_same() test will fail; but there's also a 2954 * do_swap_page()'s pte_same() test will fail; but there's also a
2963 * KSM case which does need to charge the page. 2955 * KSM case which does need to charge the page.
2964 */ 2956 */
2965 if (!PageSwapCache(page)) 2957 if (!PageSwapCache(page))
2966 goto charge_cur_mm; 2958 goto charge_cur_mm;
2967 mem = try_get_mem_cgroup_from_page(page); 2959 mem = try_get_mem_cgroup_from_page(page);
2968 if (!mem) 2960 if (!mem)
2969 goto charge_cur_mm; 2961 goto charge_cur_mm;
2970 *ptr = mem; 2962 *ptr = mem;
2971 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); 2963 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
2972 css_put(&mem->css); 2964 css_put(&mem->css);
2973 return ret; 2965 return ret;
2974 charge_cur_mm: 2966 charge_cur_mm:
2975 if (unlikely(!mm)) 2967 if (unlikely(!mm))
2976 mm = &init_mm; 2968 mm = &init_mm;
2977 return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); 2969 return __mem_cgroup_try_charge(mm, mask, 1, ptr, true);
2978 } 2970 }
2979 2971
2980 static void 2972 static void
2981 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2973 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2982 enum charge_type ctype) 2974 enum charge_type ctype)
2983 { 2975 {
2984 if (mem_cgroup_disabled()) 2976 if (mem_cgroup_disabled())
2985 return; 2977 return;
2986 if (!ptr) 2978 if (!ptr)
2987 return; 2979 return;
2988 cgroup_exclude_rmdir(&ptr->css); 2980 cgroup_exclude_rmdir(&ptr->css);
2989 2981
2990 __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); 2982 __mem_cgroup_commit_charge_lrucare(page, ptr, ctype);
2991 /* 2983 /*
2992 * Now the swap is in memory. This means the page may be 2984 * Now the swap is in memory. This means the page may be
2993 * counted both as mem and swap (a double count). 2985 * counted both as mem and swap (a double count).
2994 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 2986 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
2995 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 2987 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
2996 * may call delete_from_swap_cache() before we reach here. 2988 * may call delete_from_swap_cache() before we reach here.
2997 */ 2989 */
2998 if (do_swap_account && PageSwapCache(page)) { 2990 if (do_swap_account && PageSwapCache(page)) {
2999 swp_entry_t ent = {.val = page_private(page)}; 2991 swp_entry_t ent = {.val = page_private(page)};
3000 unsigned short id; 2992 unsigned short id;
3001 struct mem_cgroup *memcg; 2993 struct mem_cgroup *memcg;
3002 2994
3003 id = swap_cgroup_record(ent, 0); 2995 id = swap_cgroup_record(ent, 0);
3004 rcu_read_lock(); 2996 rcu_read_lock();
3005 memcg = mem_cgroup_lookup(id); 2997 memcg = mem_cgroup_lookup(id);
3006 if (memcg) { 2998 if (memcg) {
3007 /* 2999 /*
3008 * The recorded memcg can be an obsolete one. So, avoid 3000 * The recorded memcg can be an obsolete one. So, avoid
3009 * calling css_tryget. 3001 * calling css_tryget.
3010 */ 3002 */
3011 if (!mem_cgroup_is_root(memcg)) 3003 if (!mem_cgroup_is_root(memcg))
3012 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 3004 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
3013 mem_cgroup_swap_statistics(memcg, false); 3005 mem_cgroup_swap_statistics(memcg, false);
3014 mem_cgroup_put(memcg); 3006 mem_cgroup_put(memcg);
3015 } 3007 }
3016 rcu_read_unlock(); 3008 rcu_read_unlock();
3017 } 3009 }
3018 /* 3010 /*
3019 * At swapin, we may charge a cgroup which has no tasks, so 3011 * At swapin, we may charge a cgroup which has no tasks, so
3020 * rmdir()->pre_destroy() can be called while we do this charge. 3012 * rmdir()->pre_destroy() can be called while we do this charge.
3021 * In that case, we need to call pre_destroy() again. Check it here. 3013 * In that case, we need to call pre_destroy() again. Check it here.
3022 */ 3014 */
3023 cgroup_release_and_wakeup_rmdir(&ptr->css); 3015 cgroup_release_and_wakeup_rmdir(&ptr->css);
3024 } 3016 }
3025 3017
3026 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 3018 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
3027 { 3019 {
3028 __mem_cgroup_commit_charge_swapin(page, ptr, 3020 __mem_cgroup_commit_charge_swapin(page, ptr,
3029 MEM_CGROUP_CHARGE_TYPE_MAPPED); 3021 MEM_CGROUP_CHARGE_TYPE_MAPPED);
3030 } 3022 }
3031 3023
3032 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 3024 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
3033 { 3025 {
3034 if (mem_cgroup_disabled()) 3026 if (mem_cgroup_disabled())
3035 return; 3027 return;
3036 if (!mem) 3028 if (!mem)
3037 return; 3029 return;
3038 __mem_cgroup_cancel_charge(mem, 1); 3030 __mem_cgroup_cancel_charge(mem, 1);
3039 } 3031 }
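/*
 * Usage sketch (illustrative; example_swapin_charge() and example_map() are
 * assumed names): the three-step swap-in protocol documented above. The
 * caller must hold lock_page() across the sequence.
 */
static int example_swapin_charge(struct mm_struct *mm, struct page *page)
{
	struct mem_cgroup *ptr;
	int err;

	/* Step 1: reserve the charge; on success a memcg refcnt is held via ptr. */
	err = mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr);
	if (err)
		return err;

	if (example_map(page) == 0) {
		/* Step 2a: consume the reserved charge once the page is mapped. */
		mem_cgroup_commit_charge_swapin(page, ptr);
		return 0;
	}
	/* Step 2b: give the reserved charge (and the refcnt) back. */
	mem_cgroup_cancel_charge_swapin(ptr);
	return -EAGAIN;
}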
3040 3032
3041 static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, 3033 static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
3042 unsigned int nr_pages, 3034 unsigned int nr_pages,
3043 const enum charge_type ctype) 3035 const enum charge_type ctype)
3044 { 3036 {
3045 struct memcg_batch_info *batch = NULL; 3037 struct memcg_batch_info *batch = NULL;
3046 bool uncharge_memsw = true; 3038 bool uncharge_memsw = true;
3047 3039
3048 /* If swapout, usage of swap doesn't decrease */ 3040 /* If swapout, usage of swap doesn't decrease */
3049 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 3041 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
3050 uncharge_memsw = false; 3042 uncharge_memsw = false;
3051 3043
3052 batch = &current->memcg_batch; 3044 batch = &current->memcg_batch;
3053 /* 3045 /*
3054 * Usually, we do css_get() when we remember a memcg pointer. 3046 * Usually, we do css_get() when we remember a memcg pointer.
3055 * But in this case, we keep res->usage until the end of a series of 3047 * But in this case, we keep res->usage until the end of a series of
3056 * uncharges, so it is ok to ignore the memcg's refcnt. 3048 * uncharges, so it is ok to ignore the memcg's refcnt.
3057 */ 3049 */
3058 if (!batch->memcg) 3050 if (!batch->memcg)
3059 batch->memcg = mem; 3051 batch->memcg = mem;
3060 /* 3052 /*
3061 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 3053 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
3062 * In those cases, pages are freed continuously and can be expected to 3054 * In those cases, pages are freed continuously and can be expected to
3063 * be in the same cgroup, so we have a chance to coalesce uncharges. 3055 * be in the same cgroup, so we have a chance to coalesce uncharges.
3064 * But we uncharge one by one if the task was killed by OOM (TIF_MEMDIE) 3056 * But we uncharge one by one if the task was killed by OOM (TIF_MEMDIE)
3065 * because we want to uncharge as soon as possible. 3057 * because we want to uncharge as soon as possible.
3066 */ 3058 */
3067 3059
3068 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 3060 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
3069 goto direct_uncharge; 3061 goto direct_uncharge;
3070 3062
3071 if (nr_pages > 1) 3063 if (nr_pages > 1)
3072 goto direct_uncharge; 3064 goto direct_uncharge;
3073 3065
3074 /* 3066 /*
3075 * In the typical case, batch->memcg == mem. This means we can 3067 * In the typical case, batch->memcg == mem. This means we can
3076 * merge a series of uncharges into a single res_counter uncharge. 3068 * merge a series of uncharges into a single res_counter uncharge.
3077 * If not, we uncharge the res_counter one by one. 3069 * If not, we uncharge the res_counter one by one.
3078 */ 3070 */
3079 if (batch->memcg != mem) 3071 if (batch->memcg != mem)
3080 goto direct_uncharge; 3072 goto direct_uncharge;
3081 /* remember freed charge and uncharge it later */ 3073 /* remember freed charge and uncharge it later */
3082 batch->nr_pages++; 3074 batch->nr_pages++;
3083 if (uncharge_memsw) 3075 if (uncharge_memsw)
3084 batch->memsw_nr_pages++; 3076 batch->memsw_nr_pages++;
3085 return; 3077 return;
3086 direct_uncharge: 3078 direct_uncharge:
3087 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); 3079 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE);
3088 if (uncharge_memsw) 3080 if (uncharge_memsw)
3089 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); 3081 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE);
3090 if (unlikely(batch->memcg != mem)) 3082 if (unlikely(batch->memcg != mem))
3091 memcg_oom_recover(mem); 3083 memcg_oom_recover(mem);
3092 return; 3084 return;
3093 } 3085 }
3094 3086
3095 /* 3087 /*
3096 * uncharge if !page_mapped(page) 3088 * uncharge if !page_mapped(page)
3097 */ 3089 */
3098 static struct mem_cgroup * 3090 static struct mem_cgroup *
3099 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 3091 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3100 { 3092 {
3101 struct mem_cgroup *mem = NULL; 3093 struct mem_cgroup *mem = NULL;
3102 unsigned int nr_pages = 1; 3094 unsigned int nr_pages = 1;
3103 struct page_cgroup *pc; 3095 struct page_cgroup *pc;
3104 3096
3105 if (mem_cgroup_disabled()) 3097 if (mem_cgroup_disabled())
3106 return NULL; 3098 return NULL;
3107 3099
3108 if (PageSwapCache(page)) 3100 if (PageSwapCache(page))
3109 return NULL; 3101 return NULL;
3110 3102
3111 if (PageTransHuge(page)) { 3103 if (PageTransHuge(page)) {
3112 nr_pages <<= compound_order(page); 3104 nr_pages <<= compound_order(page);
3113 VM_BUG_ON(!PageTransHuge(page)); 3105 VM_BUG_ON(!PageTransHuge(page));
3114 } 3106 }
3115 /* 3107 /*
3116 * Check if our page_cgroup is valid 3108 * Check if our page_cgroup is valid
3117 */ 3109 */
3118 pc = lookup_page_cgroup(page); 3110 pc = lookup_page_cgroup(page);
3119 if (unlikely(!pc || !PageCgroupUsed(pc))) 3111 if (unlikely(!pc || !PageCgroupUsed(pc)))
3120 return NULL; 3112 return NULL;
3121 3113
3122 lock_page_cgroup(pc); 3114 lock_page_cgroup(pc);
3123 3115
3124 mem = pc->mem_cgroup; 3116 mem = pc->mem_cgroup;
3125 3117
3126 if (!PageCgroupUsed(pc)) 3118 if (!PageCgroupUsed(pc))
3127 goto unlock_out; 3119 goto unlock_out;
3128 3120
3129 switch (ctype) { 3121 switch (ctype) {
3130 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 3122 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
3131 case MEM_CGROUP_CHARGE_TYPE_DROP: 3123 case MEM_CGROUP_CHARGE_TYPE_DROP:
3132 /* See mem_cgroup_prepare_migration() */ 3124 /* See mem_cgroup_prepare_migration() */
3133 if (page_mapped(page) || PageCgroupMigration(pc)) 3125 if (page_mapped(page) || PageCgroupMigration(pc))
3134 goto unlock_out; 3126 goto unlock_out;
3135 break; 3127 break;
3136 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 3128 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
3137 if (!PageAnon(page)) { /* Shared memory */ 3129 if (!PageAnon(page)) { /* Shared memory */
3138 if (page->mapping && !page_is_file_cache(page)) 3130 if (page->mapping && !page_is_file_cache(page))
3139 goto unlock_out; 3131 goto unlock_out;
3140 } else if (page_mapped(page)) /* Anon */ 3132 } else if (page_mapped(page)) /* Anon */
3141 goto unlock_out; 3133 goto unlock_out;
3142 break; 3134 break;
3143 default: 3135 default:
3144 break; 3136 break;
3145 } 3137 }
3146 3138
3147 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); 3139 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages);
3148 3140
3149 ClearPageCgroupUsed(pc); 3141 ClearPageCgroupUsed(pc);
3150 /* 3142 /*
3151 * pc->mem_cgroup is not cleared here. It will be accessed when it's 3143 * pc->mem_cgroup is not cleared here. It will be accessed when it's
3152 * freed from the LRU. This is safe because an uncharged page is expected 3144 * freed from the LRU. This is safe because an uncharged page is expected
3153 * not to be reused (it is freed soon). The exception is SwapCache, which 3145 * not to be reused (it is freed soon). The exception is SwapCache, which
3154 * is handled by special functions. 3146 * is handled by special functions.
3155 */ 3147 */
3156 3148
3157 unlock_page_cgroup(pc); 3149 unlock_page_cgroup(pc);
3158 /* 3150 /*
3159 * even after unlock, we have mem->res.usage here and this memcg 3151 * even after unlock, we have mem->res.usage here and this memcg
3160 * will never be freed. 3152 * will never be freed.
3161 */ 3153 */
3162 memcg_check_events(mem, page); 3154 memcg_check_events(mem, page);
3163 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { 3155 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
3164 mem_cgroup_swap_statistics(mem, true); 3156 mem_cgroup_swap_statistics(mem, true);
3165 mem_cgroup_get(mem); 3157 mem_cgroup_get(mem);
3166 } 3158 }
3167 if (!mem_cgroup_is_root(mem)) 3159 if (!mem_cgroup_is_root(mem))
3168 mem_cgroup_do_uncharge(mem, nr_pages, ctype); 3160 mem_cgroup_do_uncharge(mem, nr_pages, ctype);
3169 3161
3170 return mem; 3162 return mem;
3171 3163
3172 unlock_out: 3164 unlock_out:
3173 unlock_page_cgroup(pc); 3165 unlock_page_cgroup(pc);
3174 return NULL; 3166 return NULL;
3175 } 3167 }
3176 3168
3177 void mem_cgroup_uncharge_page(struct page *page) 3169 void mem_cgroup_uncharge_page(struct page *page)
3178 { 3170 {
3179 /* early check. */ 3171 /* early check. */
3180 if (page_mapped(page)) 3172 if (page_mapped(page))
3181 return; 3173 return;
3182 if (page->mapping && !PageAnon(page)) 3174 if (page->mapping && !PageAnon(page))
3183 return; 3175 return;
3184 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 3176 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
3185 } 3177 }
3186 3178
3187 void mem_cgroup_uncharge_cache_page(struct page *page) 3179 void mem_cgroup_uncharge_cache_page(struct page *page)
3188 { 3180 {
3189 VM_BUG_ON(page_mapped(page)); 3181 VM_BUG_ON(page_mapped(page));
3190 VM_BUG_ON(page->mapping); 3182 VM_BUG_ON(page->mapping);
3191 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 3183 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
3192 } 3184 }
3193 3185
3194 /* 3186 /*
3195 * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate. 3187 * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.
3196 * In those cases, pages are freed continuously and we can expect them to 3188 * In those cases, pages are freed continuously and we can expect them to
3197 * be in the same memcg. Each of these callers itself limits the number of 3189 * be in the same memcg. Each of these callers itself limits the number of
3198 * pages freed at once and calls uncharge_start/end() properly. 3190 * pages freed at once and calls uncharge_start/end() properly.
3199 * This may be called multiple (2) times in one context. 3191 * This may be called multiple (2) times in one context.
3200 */ 3192 */
3201 3193
3202 void mem_cgroup_uncharge_start(void) 3194 void mem_cgroup_uncharge_start(void)
3203 { 3195 {
3204 current->memcg_batch.do_batch++; 3196 current->memcg_batch.do_batch++;
3205 /* We can nest. */ 3197 /* We can nest. */
3206 if (current->memcg_batch.do_batch == 1) { 3198 if (current->memcg_batch.do_batch == 1) {
3207 current->memcg_batch.memcg = NULL; 3199 current->memcg_batch.memcg = NULL;
3208 current->memcg_batch.nr_pages = 0; 3200 current->memcg_batch.nr_pages = 0;
3209 current->memcg_batch.memsw_nr_pages = 0; 3201 current->memcg_batch.memsw_nr_pages = 0;
3210 } 3202 }
3211 } 3203 }
3212 3204
3213 void mem_cgroup_uncharge_end(void) 3205 void mem_cgroup_uncharge_end(void)
3214 { 3206 {
3215 struct memcg_batch_info *batch = &current->memcg_batch; 3207 struct memcg_batch_info *batch = &current->memcg_batch;
3216 3208
3217 if (!batch->do_batch) 3209 if (!batch->do_batch)
3218 return; 3210 return;
3219 3211
3220 batch->do_batch--; 3212 batch->do_batch--;
3221 if (batch->do_batch) /* If stacked, do nothing. */ 3213 if (batch->do_batch) /* If stacked, do nothing. */
3222 return; 3214 return;
3223 3215
3224 if (!batch->memcg) 3216 if (!batch->memcg)
3225 return; 3217 return;
3226 /* 3218 /*
3227 * This "batch->memcg" is valid without any css_get/put etc... 3219 * This "batch->memcg" is valid without any css_get/put etc...
3228 * because we hide charges behind us. 3220 * because we hide charges behind us.
3229 */ 3221 */
3230 if (batch->nr_pages) 3222 if (batch->nr_pages)
3231 res_counter_uncharge(&batch->memcg->res, 3223 res_counter_uncharge(&batch->memcg->res,
3232 batch->nr_pages * PAGE_SIZE); 3224 batch->nr_pages * PAGE_SIZE);
3233 if (batch->memsw_nr_pages) 3225 if (batch->memsw_nr_pages)
3234 res_counter_uncharge(&batch->memcg->memsw, 3226 res_counter_uncharge(&batch->memcg->memsw,
3235 batch->memsw_nr_pages * PAGE_SIZE); 3227 batch->memsw_nr_pages * PAGE_SIZE);
3236 memcg_oom_recover(batch->memcg); 3228 memcg_oom_recover(batch->memcg);
3237 /* forget this pointer (for sanity check) */ 3229 /* forget this pointer (for sanity check) */
3238 batch->memcg = NULL; 3230 batch->memcg = NULL;
3239 } 3231 }
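/*
 * Usage sketch (illustrative; example_drop_pages() is an assumed helper):
 * bracketing a free loop with uncharge_start/end, as described above,
 * coalesces the per-page uncharges of one memcg into a single res_counter
 * update.
 */
static void example_drop_pages(struct page **pages, int nr)
{
	int i;

	mem_cgroup_uncharge_start();		/* begin batching in current->memcg_batch */
	for (i = 0; i < nr; i++)
		mem_cgroup_uncharge_page(pages[i]);
	mem_cgroup_uncharge_end();		/* flush the coalesced charge, if any */
}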
3240 3232
3241 #ifdef CONFIG_SWAP 3233 #ifdef CONFIG_SWAP
3242 /* 3234 /*
3243 * Called after __delete_from_swap_cache() to drop the "page" account. 3235 * Called after __delete_from_swap_cache() to drop the "page" account.
3244 * The memcg information is recorded in the swap_cgroup of "ent". 3236 * The memcg information is recorded in the swap_cgroup of "ent".
3245 */ 3237 */
3246 void 3238 void
3247 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 3239 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3248 { 3240 {
3249 struct mem_cgroup *memcg; 3241 struct mem_cgroup *memcg;
3250 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 3242 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
3251 3243
3252 if (!swapout) /* this was a swap cache but the swap is unused ! */ 3244 if (!swapout) /* this was a swap cache but the swap is unused ! */
3253 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 3245 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3254 3246
3255 memcg = __mem_cgroup_uncharge_common(page, ctype); 3247 memcg = __mem_cgroup_uncharge_common(page, ctype);
3256 3248
3257 /* 3249 /*
3258 * record memcg information, if swapout && memcg != NULL, 3250 * record memcg information, if swapout && memcg != NULL,
3259 * mem_cgroup_get() was called in uncharge(). 3251 * mem_cgroup_get() was called in uncharge().
3260 */ 3252 */
3261 if (do_swap_account && swapout && memcg) 3253 if (do_swap_account && swapout && memcg)
3262 swap_cgroup_record(ent, css_id(&memcg->css)); 3254 swap_cgroup_record(ent, css_id(&memcg->css));
3263 } 3255 }
3264 #endif 3256 #endif
3265 3257
3266 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3258 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3267 /* 3259 /*
3268 * called from swap_entry_free(). remove record in swap_cgroup and 3260 * called from swap_entry_free(). remove record in swap_cgroup and
3269 * uncharge "memsw" account. 3261 * uncharge "memsw" account.
3270 */ 3262 */
3271 void mem_cgroup_uncharge_swap(swp_entry_t ent) 3263 void mem_cgroup_uncharge_swap(swp_entry_t ent)
3272 { 3264 {
3273 struct mem_cgroup *memcg; 3265 struct mem_cgroup *memcg;
3274 unsigned short id; 3266 unsigned short id;
3275 3267
3276 if (!do_swap_account) 3268 if (!do_swap_account)
3277 return; 3269 return;
3278 3270
3279 id = swap_cgroup_record(ent, 0); 3271 id = swap_cgroup_record(ent, 0);
3280 rcu_read_lock(); 3272 rcu_read_lock();
3281 memcg = mem_cgroup_lookup(id); 3273 memcg = mem_cgroup_lookup(id);
3282 if (memcg) { 3274 if (memcg) {
3283 /* 3275 /*
3284 * We uncharge this because swap is freed. 3276 * We uncharge this because swap is freed.
3285 * This memcg can be an obsolete one. We avoid calling css_tryget. 3277 * This memcg can be an obsolete one. We avoid calling css_tryget.
3286 */ 3278 */
3287 if (!mem_cgroup_is_root(memcg)) 3279 if (!mem_cgroup_is_root(memcg))
3288 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 3280 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
3289 mem_cgroup_swap_statistics(memcg, false); 3281 mem_cgroup_swap_statistics(memcg, false);
3290 mem_cgroup_put(memcg); 3282 mem_cgroup_put(memcg);
3291 } 3283 }
3292 rcu_read_unlock(); 3284 rcu_read_unlock();
3293 } 3285 }
3294 3286
3295 /** 3287 /**
3296 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3288 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
3297 * @entry: swap entry to be moved 3289 * @entry: swap entry to be moved
3298 * @from: mem_cgroup which the entry is moved from 3290 * @from: mem_cgroup which the entry is moved from
3299 * @to: mem_cgroup which the entry is moved to 3291 * @to: mem_cgroup which the entry is moved to
3300 * @need_fixup: whether we should fixup res_counters and refcounts. 3292 * @need_fixup: whether we should fixup res_counters and refcounts.
3301 * 3293 *
3302 * It succeeds only when the swap_cgroup's record for this entry is the same 3294 * It succeeds only when the swap_cgroup's record for this entry is the same
3303 * as the mem_cgroup's id of @from. 3295 * as the mem_cgroup's id of @from.
3304 * 3296 *
3305 * Returns 0 on success, -EINVAL on failure. 3297 * Returns 0 on success, -EINVAL on failure.
3306 * 3298 *
3307 * The caller must have charged to @to, IOW, called res_counter_charge() about 3299 * The caller must have charged to @to, IOW, called res_counter_charge() about
3308 * both res and memsw, and called css_get(). 3300 * both res and memsw, and called css_get().
3309 */ 3301 */
3310 static int mem_cgroup_move_swap_account(swp_entry_t entry, 3302 static int mem_cgroup_move_swap_account(swp_entry_t entry,
3311 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3303 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
3312 { 3304 {
3313 unsigned short old_id, new_id; 3305 unsigned short old_id, new_id;
3314 3306
3315 old_id = css_id(&from->css); 3307 old_id = css_id(&from->css);
3316 new_id = css_id(&to->css); 3308 new_id = css_id(&to->css);
3317 3309
3318 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3310 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3319 mem_cgroup_swap_statistics(from, false); 3311 mem_cgroup_swap_statistics(from, false);
3320 mem_cgroup_swap_statistics(to, true); 3312 mem_cgroup_swap_statistics(to, true);
3321 /* 3313 /*
3322 * This function is only called from task migration context now. 3314 * This function is only called from task migration context now.
3323 * It postpones res_counter and refcount handling till the end 3315 * It postpones res_counter and refcount handling till the end
3324 * of task migration(mem_cgroup_clear_mc()) for performance 3316 * of task migration(mem_cgroup_clear_mc()) for performance
3325 * improvement. But we cannot postpone mem_cgroup_get(to) 3317 * improvement. But we cannot postpone mem_cgroup_get(to)
3326 * because if the process that has been moved to @to does 3318 * because if the process that has been moved to @to does
3327 * swap-in, the refcount of @to might be decreased to 0. 3319 * swap-in, the refcount of @to might be decreased to 0.
3328 */ 3320 */
3329 mem_cgroup_get(to); 3321 mem_cgroup_get(to);
3330 if (need_fixup) { 3322 if (need_fixup) {
3331 if (!mem_cgroup_is_root(from)) 3323 if (!mem_cgroup_is_root(from))
3332 res_counter_uncharge(&from->memsw, PAGE_SIZE); 3324 res_counter_uncharge(&from->memsw, PAGE_SIZE);
3333 mem_cgroup_put(from); 3325 mem_cgroup_put(from);
3334 /* 3326 /*
3335 * we charged both to->res and to->memsw, so we should 3327 * we charged both to->res and to->memsw, so we should
3336 * uncharge to->res. 3328 * uncharge to->res.
3337 */ 3329 */
3338 if (!mem_cgroup_is_root(to)) 3330 if (!mem_cgroup_is_root(to))
3339 res_counter_uncharge(&to->res, PAGE_SIZE); 3331 res_counter_uncharge(&to->res, PAGE_SIZE);
3340 } 3332 }
3341 return 0; 3333 return 0;
3342 } 3334 }
3343 return -EINVAL; 3335 return -EINVAL;
3344 } 3336 }
3345 #else 3337 #else
3346 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3338 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3347 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3339 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
3348 { 3340 {
3349 return -EINVAL; 3341 return -EINVAL;
3350 } 3342 }
3351 #endif 3343 #endif
3352 3344
3353 /* 3345 /*
3354 * Before starting migration, account PAGE_SIZE to the mem_cgroup that the 3346 * Before starting migration, account PAGE_SIZE to the mem_cgroup that the
3355 * old page belongs to. 3347 * old page belongs to.
3356 */ 3348 */
3357 int mem_cgroup_prepare_migration(struct page *page, 3349 int mem_cgroup_prepare_migration(struct page *page,
3358 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) 3350 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
3359 { 3351 {
3360 struct mem_cgroup *mem = NULL; 3352 struct mem_cgroup *mem = NULL;
3361 struct page_cgroup *pc; 3353 struct page_cgroup *pc;
3362 enum charge_type ctype; 3354 enum charge_type ctype;
3363 int ret = 0; 3355 int ret = 0;
3364 3356
3365 *ptr = NULL; 3357 *ptr = NULL;
3366 3358
3367 VM_BUG_ON(PageTransHuge(page)); 3359 VM_BUG_ON(PageTransHuge(page));
3368 if (mem_cgroup_disabled()) 3360 if (mem_cgroup_disabled())
3369 return 0; 3361 return 0;
3370 3362
3371 pc = lookup_page_cgroup(page); 3363 pc = lookup_page_cgroup(page);
3372 lock_page_cgroup(pc); 3364 lock_page_cgroup(pc);
3373 if (PageCgroupUsed(pc)) { 3365 if (PageCgroupUsed(pc)) {
3374 mem = pc->mem_cgroup; 3366 mem = pc->mem_cgroup;
3375 css_get(&mem->css); 3367 css_get(&mem->css);
3376 /* 3368 /*
3377 * When migrating an anonymous page, its mapcount goes down 3369 * When migrating an anonymous page, its mapcount goes down
3378 * to 0 and uncharge() will be called. But, even if it's fully 3370 * to 0 and uncharge() will be called. But, even if it's fully
3379 * unmapped, migration may fail and this page has to be 3371 * unmapped, migration may fail and this page has to be
3380 * charged again. We set MIGRATION flag here and delay uncharge 3372 * charged again. We set MIGRATION flag here and delay uncharge
3381 * until end_migration() is called 3373 * until end_migration() is called
3382 * 3374 *
3383 * Corner Case Thinking 3375 * Corner Case Thinking
3384 * A) 3376 * A)
3385 * When the old page was mapped as Anon and it is unmapped and freed 3377 * When the old page was mapped as Anon and it is unmapped and freed
3386 * while migration was ongoing. 3378 * while migration was ongoing.
3387 * If unmap finds the old page, uncharge() of it will be delayed 3379 * If unmap finds the old page, uncharge() of it will be delayed
3388 * until end_migration(). If unmap finds a new page, it's 3380 * until end_migration(). If unmap finds a new page, it's
3389 * uncharged when the mapcount goes from 1 to 0. If the unmap code 3381 * uncharged when the mapcount goes from 1 to 0. If the unmap code
3390 * finds a swap_migration_entry, the new page will not be mapped 3382 * finds a swap_migration_entry, the new page will not be mapped
3391 * and end_migration() will find it (mapcount == 0). 3383 * and end_migration() will find it (mapcount == 0).
3392 * 3384 *
3393 * B) 3385 * B)
3394 * When the old page was mapped but migration fails, the kernel 3386 * When the old page was mapped but migration fails, the kernel
3395 * remaps it. A charge for it is kept by the MIGRATION flag even 3387 * remaps it. A charge for it is kept by the MIGRATION flag even
3396 * if the mapcount goes down to 0. We can remap it successfully 3388 * if the mapcount goes down to 0. We can remap it successfully
3397 * without charging it again. 3389 * without charging it again.
3398 * 3390 *
3399 * C) 3391 * C)
3400 * The "old" page is under lock_page() until the end of 3392 * The "old" page is under lock_page() until the end of
3401 * migration, so, the old page itself will not be swapped-out. 3393 * migration, so, the old page itself will not be swapped-out.
3402 * If the new page is swapped out before end_migration(), our 3394 * If the new page is swapped out before end_migration(), our
3403 * hook into the usual swap-out path will catch the event. 3395 * hook into the usual swap-out path will catch the event.
3404 */ 3396 */
3405 if (PageAnon(page)) 3397 if (PageAnon(page))
3406 SetPageCgroupMigration(pc); 3398 SetPageCgroupMigration(pc);
3407 } 3399 }
3408 unlock_page_cgroup(pc); 3400 unlock_page_cgroup(pc);
3409 /* 3401 /*
3410 * If the page is not charged at this point, 3402 * If the page is not charged at this point,
3411 * we return here. 3403 * we return here.
3412 */ 3404 */
3413 if (!mem) 3405 if (!mem)
3414 return 0; 3406 return 0;
3415 3407
3416 *ptr = mem; 3408 *ptr = mem;
3417 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); 3409 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
3418 css_put(&mem->css);/* drop extra refcnt */ 3410 css_put(&mem->css);/* drop extra refcnt */
3419 if (ret || *ptr == NULL) { 3411 if (ret || *ptr == NULL) {
3420 if (PageAnon(page)) { 3412 if (PageAnon(page)) {
3421 lock_page_cgroup(pc); 3413 lock_page_cgroup(pc);
3422 ClearPageCgroupMigration(pc); 3414 ClearPageCgroupMigration(pc);
3423 unlock_page_cgroup(pc); 3415 unlock_page_cgroup(pc);
3424 /* 3416 /*
3425 * The old page may be fully unmapped while we kept it. 3417 * The old page may be fully unmapped while we kept it.
3426 */ 3418 */
3427 mem_cgroup_uncharge_page(page); 3419 mem_cgroup_uncharge_page(page);
3428 } 3420 }
3429 return -ENOMEM; 3421 return -ENOMEM;
3430 } 3422 }
3431 /* 3423 /*
3432 * We charge the new page before it is used/mapped. So, even if unlock_page() 3424 * We charge the new page before it is used/mapped. So, even if unlock_page()
3433 * is called before end_migration, we can catch all events on this new 3425 * is called before end_migration, we can catch all events on this new
3434 * page. In case the new page is migrated but not remapped, the new page's 3426 * page. In case the new page is migrated but not remapped, the new page's
3435 * mapcount will finally be 0 and we call uncharge in end_migration(). 3427 * mapcount will finally be 0 and we call uncharge in end_migration().
3436 */ 3428 */
3437 pc = lookup_page_cgroup(newpage); 3429 pc = lookup_page_cgroup(newpage);
3438 if (PageAnon(page)) 3430 if (PageAnon(page))
3439 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 3431 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
3440 else if (page_is_file_cache(page)) 3432 else if (page_is_file_cache(page))
3441 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 3433 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3442 else 3434 else
3443 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3435 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3444 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); 3436 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
3445 return ret; 3437 return ret;
3446 } 3438 }
3447 3439
3448 /* remove redundant charge if migration failed */ 3440 /* remove redundant charge if migration failed */
3449 void mem_cgroup_end_migration(struct mem_cgroup *mem, 3441 void mem_cgroup_end_migration(struct mem_cgroup *mem,
3450 struct page *oldpage, struct page *newpage, bool migration_ok) 3442 struct page *oldpage, struct page *newpage, bool migration_ok)
3451 { 3443 {
3452 struct page *used, *unused; 3444 struct page *used, *unused;
3453 struct page_cgroup *pc; 3445 struct page_cgroup *pc;
3454 3446
3455 if (!mem) 3447 if (!mem)
3456 return; 3448 return;
3457 /* blocks rmdir() */ 3449 /* blocks rmdir() */
3458 cgroup_exclude_rmdir(&mem->css); 3450 cgroup_exclude_rmdir(&mem->css);
3459 if (!migration_ok) { 3451 if (!migration_ok) {
3460 used = oldpage; 3452 used = oldpage;
3461 unused = newpage; 3453 unused = newpage;
3462 } else { 3454 } else {
3463 used = newpage; 3455 used = newpage;
3464 unused = oldpage; 3456 unused = oldpage;
3465 } 3457 }
3466 /* 3458 /*
3467 * We disallowed uncharging pages under migration because the mapcount 3459 * We disallowed uncharging pages under migration because the mapcount
3468 * of the page temporarily goes down to zero. 3460 * of the page temporarily goes down to zero.
3469 * Clear the flag and check whether the page should be charged. 3461 * Clear the flag and check whether the page should be charged.
3470 */ 3462 */
3471 pc = lookup_page_cgroup(oldpage); 3463 pc = lookup_page_cgroup(oldpage);
3472 lock_page_cgroup(pc); 3464 lock_page_cgroup(pc);
3473 ClearPageCgroupMigration(pc); 3465 ClearPageCgroupMigration(pc);
3474 unlock_page_cgroup(pc); 3466 unlock_page_cgroup(pc);
3475 3467
3476 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); 3468 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
3477 3469
3478 /* 3470 /*
3479 * If the page is file cache, the radix-tree replacement is atomic 3471 * If the page is file cache, the radix-tree replacement is atomic
3480 * and we can skip this check. When it was an Anon page, its mapcount 3472 * and we can skip this check. When it was an Anon page, its mapcount
3481 * went down to 0. But because we added the MIGRATION flag, it is not 3473 * went down to 0. But because we added the MIGRATION flag, it is not
3482 * uncharged yet. There are several cases, but the page->mapcount check 3474 * uncharged yet. There are several cases, but the page->mapcount check
3483 * and the USED bit check in mem_cgroup_uncharge_page() are enough. 3475 * and the USED bit check in mem_cgroup_uncharge_page() are enough.
3484 * (See prepare_charge() also.) 3476 * (See prepare_charge() also.)
3485 */ 3477 */
3486 if (PageAnon(used)) 3478 if (PageAnon(used))
3487 mem_cgroup_uncharge_page(used); 3479 mem_cgroup_uncharge_page(used);
3488 /* 3480 /*
3489 * At migration, we may charge account against cgroup which has no 3481 * At migration, we may charge account against cgroup which has no
3490 * tasks. 3482 * tasks.
3491 * So, rmdir()->pre_destroy() can be called while we do this charge. 3483 * So, rmdir()->pre_destroy() can be called while we do this charge.
3492 * In that case, we need to call pre_destroy() again. check it here. 3484 * In that case, we need to call pre_destroy() again. check it here.
3493 */ 3485 */
3494 cgroup_release_and_wakeup_rmdir(&mem->css); 3486 cgroup_release_and_wakeup_rmdir(&mem->css);
3495 } 3487 }
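/*
 * Usage sketch (illustrative; example_migrate() stands in for the real
 * migration work): how the prepare/end hooks above pair up around one page
 * migration attempt.
 */
static int example_migrate_one_page(struct page *page, struct page *newpage)
{
	struct mem_cgroup *mem = NULL;
	int rc;

	/* Charge the new page and mark the old one with the MIGRATION flag. */
	rc = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
	if (rc)
		return rc;			/* -ENOMEM: give up on this page */

	rc = example_migrate(page, newpage);

	/* Uncharge whichever of the two pages ended up unused. */
	mem_cgroup_end_migration(mem, page, newpage, rc == 0);
	return rc;
}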
3496 3488
3497 /* 3489 /*
3498 * A call to try to shrink memory usage on charge failure at shmem's swapin. 3490 * A call to try to shrink memory usage on charge failure at shmem's swapin.
3499 * Calling hierarchical_reclaim is not enough because we should update 3491 * Calling hierarchical_reclaim is not enough because we should update
3500 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. 3492 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
3501 * Moreover considering hierarchy, we should reclaim from the mem_over_limit, 3493 * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
3502 * not from the memcg which this page would be charged to. 3494 * not from the memcg which this page would be charged to.
3503 * try_charge_swapin does all of this work properly. 3495 * try_charge_swapin does all of this work properly.
3504 */ 3496 */
3505 int mem_cgroup_shmem_charge_fallback(struct page *page, 3497 int mem_cgroup_shmem_charge_fallback(struct page *page,
3506 struct mm_struct *mm, 3498 struct mm_struct *mm,
3507 gfp_t gfp_mask) 3499 gfp_t gfp_mask)
3508 { 3500 {
3509 struct mem_cgroup *mem; 3501 struct mem_cgroup *mem;
3510 int ret; 3502 int ret;
3511 3503
3512 if (mem_cgroup_disabled()) 3504 if (mem_cgroup_disabled())
3513 return 0; 3505 return 0;
3514 3506
3515 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 3507 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
3516 if (!ret) 3508 if (!ret)
3517 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ 3509 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
3518 3510
3519 return ret; 3511 return ret;
3520 } 3512 }
3521 3513
3522 #ifdef CONFIG_DEBUG_VM 3514 #ifdef CONFIG_DEBUG_VM
3523 static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3515 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3524 { 3516 {
3525 struct page_cgroup *pc; 3517 struct page_cgroup *pc;
3526 3518
3527 pc = lookup_page_cgroup(page); 3519 pc = lookup_page_cgroup(page);
3528 if (likely(pc) && PageCgroupUsed(pc)) 3520 if (likely(pc) && PageCgroupUsed(pc))
3529 return pc; 3521 return pc;
3530 return NULL; 3522 return NULL;
3531 } 3523 }
3532 3524
3533 bool mem_cgroup_bad_page_check(struct page *page) 3525 bool mem_cgroup_bad_page_check(struct page *page)
3534 { 3526 {
3535 if (mem_cgroup_disabled()) 3527 if (mem_cgroup_disabled())
3536 return false; 3528 return false;
3537 3529
3538 return lookup_page_cgroup_used(page) != NULL; 3530 return lookup_page_cgroup_used(page) != NULL;
3539 } 3531 }
3540 3532
3541 void mem_cgroup_print_bad_page(struct page *page) 3533 void mem_cgroup_print_bad_page(struct page *page)
3542 { 3534 {
3543 struct page_cgroup *pc; 3535 struct page_cgroup *pc;
3544 3536
3545 pc = lookup_page_cgroup_used(page); 3537 pc = lookup_page_cgroup_used(page);
3546 if (pc) { 3538 if (pc) {
3547 int ret = -1; 3539 int ret = -1;
3548 char *path; 3540 char *path;
3549 3541
3550 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", 3542 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
3551 pc, pc->flags, pc->mem_cgroup); 3543 pc, pc->flags, pc->mem_cgroup);
3552 3544
3553 path = kmalloc(PATH_MAX, GFP_KERNEL); 3545 path = kmalloc(PATH_MAX, GFP_KERNEL);
3554 if (path) { 3546 if (path) {
3555 rcu_read_lock(); 3547 rcu_read_lock();
3556 ret = cgroup_path(pc->mem_cgroup->css.cgroup, 3548 ret = cgroup_path(pc->mem_cgroup->css.cgroup,
3557 path, PATH_MAX); 3549 path, PATH_MAX);
3558 rcu_read_unlock(); 3550 rcu_read_unlock();
3559 } 3551 }
3560 3552
3561 printk(KERN_CONT "(%s)\n", 3553 printk(KERN_CONT "(%s)\n",
3562 (ret < 0) ? "cannot get the path" : path); 3554 (ret < 0) ? "cannot get the path" : path);
3563 kfree(path); 3555 kfree(path);
3564 } 3556 }
3565 } 3557 }
3566 #endif 3558 #endif
3567 3559
3568 static DEFINE_MUTEX(set_limit_mutex); 3560 static DEFINE_MUTEX(set_limit_mutex);
3569 3561
3570 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3562 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3571 unsigned long long val) 3563 unsigned long long val)
3572 { 3564 {
3573 int retry_count; 3565 int retry_count;
3574 u64 memswlimit, memlimit; 3566 u64 memswlimit, memlimit;
3575 int ret = 0; 3567 int ret = 0;
3576 int children = mem_cgroup_count_children(memcg); 3568 int children = mem_cgroup_count_children(memcg);
3577 u64 curusage, oldusage; 3569 u64 curusage, oldusage;
3578 int enlarge; 3570 int enlarge;
3579 3571
3580 /* 3572 /*
3581 * To keep hierarchical_reclaim simple, how long we should retry 3573 * To keep hierarchical_reclaim simple, how long we should retry
3582 * depends on the caller. We set our retry-count to be a function 3574 * depends on the caller. We set our retry-count to be a function
3583 * of the number of children which we should visit in this loop. 3575 * of the number of children which we should visit in this loop.
3584 */ 3576 */
3585 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 3577 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
3586 3578
3587 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3579 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3588 3580
3589 enlarge = 0; 3581 enlarge = 0;
3590 while (retry_count) { 3582 while (retry_count) {
3591 if (signal_pending(current)) { 3583 if (signal_pending(current)) {
3592 ret = -EINTR; 3584 ret = -EINTR;
3593 break; 3585 break;
3594 } 3586 }
3595 /* 3587 /*
3596 * Rather than hiding all this in some function, I do it in an 3588 * Rather than hiding all this in some function, I do it in an
3597 * open-coded manner so you can see what it really does. 3589 * open-coded manner so you can see what it really does.
3598 * We have to guarantee mem->res.limit < mem->memsw.limit. 3590 * We have to guarantee mem->res.limit < mem->memsw.limit.
3599 */ 3591 */
3600 mutex_lock(&set_limit_mutex); 3592 mutex_lock(&set_limit_mutex);
3601 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3593 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3602 if (memswlimit < val) { 3594 if (memswlimit < val) {
3603 ret = -EINVAL; 3595 ret = -EINVAL;
3604 mutex_unlock(&set_limit_mutex); 3596 mutex_unlock(&set_limit_mutex);
3605 break; 3597 break;
3606 } 3598 }
3607 3599
3608 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3600 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3609 if (memlimit < val) 3601 if (memlimit < val)
3610 enlarge = 1; 3602 enlarge = 1;
3611 3603
3612 ret = res_counter_set_limit(&memcg->res, val); 3604 ret = res_counter_set_limit(&memcg->res, val);
3613 if (!ret) { 3605 if (!ret) {
3614 if (memswlimit == val) 3606 if (memswlimit == val)
3615 memcg->memsw_is_minimum = true; 3607 memcg->memsw_is_minimum = true;
3616 else 3608 else
3617 memcg->memsw_is_minimum = false; 3609 memcg->memsw_is_minimum = false;
3618 } 3610 }
3619 mutex_unlock(&set_limit_mutex); 3611 mutex_unlock(&set_limit_mutex);
3620 3612
3621 if (!ret) 3613 if (!ret)
3622 break; 3614 break;
3623 3615
3624 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3616 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3625 MEM_CGROUP_RECLAIM_SHRINK, 3617 MEM_CGROUP_RECLAIM_SHRINK,
3626 NULL); 3618 NULL);
3627 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3619 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3628 /* Usage is reduced ? */ 3620 /* Usage is reduced ? */
3629 if (curusage >= oldusage) 3621 if (curusage >= oldusage)
3630 retry_count--; 3622 retry_count--;
3631 else 3623 else
3632 oldusage = curusage; 3624 oldusage = curusage;
3633 } 3625 }
3634 if (!ret && enlarge) 3626 if (!ret && enlarge)
3635 memcg_oom_recover(memcg); 3627 memcg_oom_recover(memcg);
3636 3628
3637 return ret; 3629 return ret;
3638 } 3630 }
3639 3631
3640 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3632 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3641 unsigned long long val) 3633 unsigned long long val)
3642 { 3634 {
3643 int retry_count; 3635 int retry_count;
3644 u64 memlimit, memswlimit, oldusage, curusage; 3636 u64 memlimit, memswlimit, oldusage, curusage;
3645 int children = mem_cgroup_count_children(memcg); 3637 int children = mem_cgroup_count_children(memcg);
3646 int ret = -EBUSY; 3638 int ret = -EBUSY;
3647 int enlarge = 0; 3639 int enlarge = 0;
3648 3640
3649 /* see mem_cgroup_resize_res_limit */ 3641 /* see mem_cgroup_resize_res_limit */
3650 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 3642 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
3651 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3643 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3652 while (retry_count) { 3644 while (retry_count) {
3653 if (signal_pending(current)) { 3645 if (signal_pending(current)) {
3654 ret = -EINTR; 3646 ret = -EINTR;
3655 break; 3647 break;
3656 } 3648 }
3657 /* 3649 /*
3658 * Rather than hiding all this in some function, I do it in an 3650 * Rather than hiding all this in some function, I do it in an
3659 * open-coded manner so you can see what it really does. 3651 * open-coded manner so you can see what it really does.
3660 * We have to guarantee mem->res.limit < mem->memsw.limit. 3652 * We have to guarantee mem->res.limit < mem->memsw.limit.
3661 */ 3653 */
3662 mutex_lock(&set_limit_mutex); 3654 mutex_lock(&set_limit_mutex);
3663 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3655 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3664 if (memlimit > val) { 3656 if (memlimit > val) {
3665 ret = -EINVAL; 3657 ret = -EINVAL;
3666 mutex_unlock(&set_limit_mutex); 3658 mutex_unlock(&set_limit_mutex);
3667 break; 3659 break;
3668 } 3660 }
3669 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3661 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3670 if (memswlimit < val) 3662 if (memswlimit < val)
3671 enlarge = 1; 3663 enlarge = 1;
3672 ret = res_counter_set_limit(&memcg->memsw, val); 3664 ret = res_counter_set_limit(&memcg->memsw, val);
3673 if (!ret) { 3665 if (!ret) {
3674 if (memlimit == val) 3666 if (memlimit == val)
3675 memcg->memsw_is_minimum = true; 3667 memcg->memsw_is_minimum = true;
3676 else 3668 else
3677 memcg->memsw_is_minimum = false; 3669 memcg->memsw_is_minimum = false;
3678 } 3670 }
3679 mutex_unlock(&set_limit_mutex); 3671 mutex_unlock(&set_limit_mutex);
3680 3672
3681 if (!ret) 3673 if (!ret)
3682 break; 3674 break;
3683 3675
3684 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3676 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3685 MEM_CGROUP_RECLAIM_NOSWAP | 3677 MEM_CGROUP_RECLAIM_NOSWAP |
3686 MEM_CGROUP_RECLAIM_SHRINK, 3678 MEM_CGROUP_RECLAIM_SHRINK,
3687 NULL); 3679 NULL);
3688 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3680 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3689 /* Usage is reduced ? */ 3681 /* Usage is reduced ? */
3690 if (curusage >= oldusage) 3682 if (curusage >= oldusage)
3691 retry_count--; 3683 retry_count--;
3692 else 3684 else
3693 oldusage = curusage; 3685 oldusage = curusage;
3694 } 3686 }
3695 if (!ret && enlarge) 3687 if (!ret && enlarge)
3696 memcg_oom_recover(memcg); 3688 memcg_oom_recover(memcg);
3697 return ret; 3689 return ret;
3698 } 3690 }
3699 3691
3700 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3692 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3701 gfp_t gfp_mask, 3693 gfp_t gfp_mask,
3702 unsigned long *total_scanned) 3694 unsigned long *total_scanned)
3703 { 3695 {
3704 unsigned long nr_reclaimed = 0; 3696 unsigned long nr_reclaimed = 0;
3705 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3697 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
3706 unsigned long reclaimed; 3698 unsigned long reclaimed;
3707 int loop = 0; 3699 int loop = 0;
3708 struct mem_cgroup_tree_per_zone *mctz; 3700 struct mem_cgroup_tree_per_zone *mctz;
3709 unsigned long long excess; 3701 unsigned long long excess;
3710 unsigned long nr_scanned; 3702 unsigned long nr_scanned;
3711 3703
3712 if (order > 0) 3704 if (order > 0)
3713 return 0; 3705 return 0;
3714 3706
3715 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 3707 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
3716 /* 3708 /*
3717 * This loop can run for a while, especially if mem_cgroups continuously 3709 * This loop can run for a while, especially if mem_cgroups continuously
3718 * keep exceeding their soft limit and putting the system under 3710 * keep exceeding their soft limit and putting the system under
3719 * pressure. 3711 * pressure.
3720 */ 3712 */
3721 do { 3713 do {
3722 if (next_mz) 3714 if (next_mz)
3723 mz = next_mz; 3715 mz = next_mz;
3724 else 3716 else
3725 mz = mem_cgroup_largest_soft_limit_node(mctz); 3717 mz = mem_cgroup_largest_soft_limit_node(mctz);
3726 if (!mz) 3718 if (!mz)
3727 break; 3719 break;
3728 3720
3729 nr_scanned = 0; 3721 nr_scanned = 0;
3730 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 3722 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
3731 gfp_mask, 3723 gfp_mask,
3732 MEM_CGROUP_RECLAIM_SOFT, 3724 MEM_CGROUP_RECLAIM_SOFT,
3733 &nr_scanned); 3725 &nr_scanned);
3734 nr_reclaimed += reclaimed; 3726 nr_reclaimed += reclaimed;
3735 *total_scanned += nr_scanned; 3727 *total_scanned += nr_scanned;
3736 spin_lock(&mctz->lock); 3728 spin_lock(&mctz->lock);
3737 3729
3738 /* 3730 /*
3739 * If we failed to reclaim anything from this memory cgroup 3731 * If we failed to reclaim anything from this memory cgroup
3740 * it is time to move on to the next cgroup 3732 * it is time to move on to the next cgroup
3741 */ 3733 */
3742 next_mz = NULL; 3734 next_mz = NULL;
3743 if (!reclaimed) { 3735 if (!reclaimed) {
3744 do { 3736 do {
3745 /* 3737 /*
3746 * Loop until we find yet another one. 3738 * Loop until we find yet another one.
3747 * 3739 *
3748 * By the time we get the soft_limit lock 3740 * By the time we get the soft_limit lock
3749 * again, someone might have added the 3741 * again, someone might have added the
3750 * group back on the RB tree. Iterate to 3742 * group back on the RB tree. Iterate to
3751 * make sure we get a different mem. 3743 * make sure we get a different mem.
3752 * mem_cgroup_largest_soft_limit_node returns 3744 * mem_cgroup_largest_soft_limit_node returns
3753 * NULL if no other cgroup is present on 3745 * NULL if no other cgroup is present on
3754 * the tree 3746 * the tree
3755 */ 3747 */
3756 next_mz = 3748 next_mz =
3757 __mem_cgroup_largest_soft_limit_node(mctz); 3749 __mem_cgroup_largest_soft_limit_node(mctz);
3758 if (next_mz == mz) 3750 if (next_mz == mz)
3759 css_put(&next_mz->mem->css); 3751 css_put(&next_mz->mem->css);
3760 else /* next_mz == NULL or other memcg */ 3752 else /* next_mz == NULL or other memcg */
3761 break; 3753 break;
3762 } while (1); 3754 } while (1);
3763 } 3755 }
3764 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 3756 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
3765 excess = res_counter_soft_limit_excess(&mz->mem->res); 3757 excess = res_counter_soft_limit_excess(&mz->mem->res);
3766 /* 3758 /*
3767 * One school of thought says that we should not add 3759 * One school of thought says that we should not add
3768 * back the node to the tree if reclaim returns 0. 3760 * back the node to the tree if reclaim returns 0.
3769 * But our reclaim could return 0 simply because, due 3761 * But our reclaim could return 0 simply because, due
3770 * to priority, we are exposing a smaller subset of 3762 * to priority, we are exposing a smaller subset of
3771 * memory to reclaim from. Consider this as a longer 3763 * memory to reclaim from. Consider this as a longer
3772 * term TODO. 3764 * term TODO.
3773 */ 3765 */
3774 /* If excess == 0, no tree ops */ 3766 /* If excess == 0, no tree ops */
3775 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); 3767 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
3776 spin_unlock(&mctz->lock); 3768 spin_unlock(&mctz->lock);
3777 css_put(&mz->mem->css); 3769 css_put(&mz->mem->css);
3778 loop++; 3770 loop++;
3779 /* 3771 /*
3780 * Could not reclaim anything and there are no more 3772 * Could not reclaim anything and there are no more
3781 * mem cgroups to try or we seem to be looping without 3773 * mem cgroups to try or we seem to be looping without
3782 * reclaiming anything. 3774 * reclaiming anything.
3783 */ 3775 */
3784 if (!nr_reclaimed && 3776 if (!nr_reclaimed &&
3785 (next_mz == NULL || 3777 (next_mz == NULL ||
3786 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3778 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3787 break; 3779 break;
3788 } while (!nr_reclaimed); 3780 } while (!nr_reclaimed);
3789 if (next_mz) 3781 if (next_mz)
3790 css_put(&next_mz->mem->css); 3782 css_put(&next_mz->mem->css);
3791 return nr_reclaimed; 3783 return nr_reclaimed;
3792 } 3784 }
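A minimal user-space sketch of the loop above: pick the group with the largest soft-limit excess, reclaim from it, and keep looping only while nothing was freed. The group array and the pick_largest()/reclaim() helpers are invented stand-ins for the RB-tree node and for mem_cgroup_largest_soft_limit_node()/mem_cgroup_hierarchical_reclaim().

/* Sketch only (not kernel code): soft-limit style reclaim loop
 * over a toy set of groups.
 */
#include <stdio.h>

struct group { const char *name; long excess; };

static struct group groups[] = {
	{ "A", 300 }, { "B", 120 }, { "C", 0 },
};

static struct group *pick_largest(void)
{
	struct group *best = NULL;
	for (unsigned i = 0; i < sizeof(groups)/sizeof(groups[0]); i++)
		if (groups[i].excess > 0 &&
		    (!best || groups[i].excess > best->excess))
			best = &groups[i];
	return best;
}

/* Pretend to reclaim up to 100 units from a group. */
static long reclaim(struct group *g)
{
	long freed = g->excess < 100 ? g->excess : 100;
	g->excess -= freed;
	return freed;
}

int main(void)
{
	long total = 0;
	int loop = 0;

	do {
		struct group *g = pick_largest();
		if (!g)
			break;		/* nothing is over its soft limit */
		total += reclaim(g);	/* group stays eligible while excess > 0 */
		loop++;
	} while (!total && loop < 4);	/* keep trying only while nothing was freed */

	printf("reclaimed %ld units in %d loops\n", total, loop);
	return 0;
}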
3793 3785
3794 /* 3786 /*
3795 * This routine traverses the page_cgroups in the given list and drops them all. 3787 * This routine traverses the page_cgroups in the given list and drops them all.
3796 * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup. 3788 * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup.
3797 */ 3789 */
3798 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 3790 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3799 int node, int zid, enum lru_list lru) 3791 int node, int zid, enum lru_list lru)
3800 { 3792 {
3801 struct zone *zone; 3793 struct zone *zone;
3802 struct mem_cgroup_per_zone *mz; 3794 struct mem_cgroup_per_zone *mz;
3803 struct page_cgroup *pc, *busy; 3795 struct page_cgroup *pc, *busy;
3804 unsigned long flags, loop; 3796 unsigned long flags, loop;
3805 struct list_head *list; 3797 struct list_head *list;
3806 int ret = 0; 3798 int ret = 0;
3807 3799
3808 zone = &NODE_DATA(node)->node_zones[zid]; 3800 zone = &NODE_DATA(node)->node_zones[zid];
3809 mz = mem_cgroup_zoneinfo(mem, node, zid); 3801 mz = mem_cgroup_zoneinfo(mem, node, zid);
3810 list = &mz->lists[lru]; 3802 list = &mz->lists[lru];
3811 3803
3812 loop = MEM_CGROUP_ZSTAT(mz, lru); 3804 loop = MEM_CGROUP_ZSTAT(mz, lru);
3813 /* give some margin against EBUSY etc...*/ 3805 /* give some margin against EBUSY etc...*/
3814 loop += 256; 3806 loop += 256;
3815 busy = NULL; 3807 busy = NULL;
3816 while (loop--) { 3808 while (loop--) {
3817 struct page *page; 3809 struct page *page;
3818 3810
3819 ret = 0; 3811 ret = 0;
3820 spin_lock_irqsave(&zone->lru_lock, flags); 3812 spin_lock_irqsave(&zone->lru_lock, flags);
3821 if (list_empty(list)) { 3813 if (list_empty(list)) {
3822 spin_unlock_irqrestore(&zone->lru_lock, flags); 3814 spin_unlock_irqrestore(&zone->lru_lock, flags);
3823 break; 3815 break;
3824 } 3816 }
3825 pc = list_entry(list->prev, struct page_cgroup, lru); 3817 pc = list_entry(list->prev, struct page_cgroup, lru);
3826 if (busy == pc) { 3818 if (busy == pc) {
3827 list_move(&pc->lru, list); 3819 list_move(&pc->lru, list);
3828 busy = NULL; 3820 busy = NULL;
3829 spin_unlock_irqrestore(&zone->lru_lock, flags); 3821 spin_unlock_irqrestore(&zone->lru_lock, flags);
3830 continue; 3822 continue;
3831 } 3823 }
3832 spin_unlock_irqrestore(&zone->lru_lock, flags); 3824 spin_unlock_irqrestore(&zone->lru_lock, flags);
3833 3825
3834 page = lookup_cgroup_page(pc); 3826 page = lookup_cgroup_page(pc);
3835 3827
3836 ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); 3828 ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
3837 if (ret == -ENOMEM) 3829 if (ret == -ENOMEM)
3838 break; 3830 break;
3839 3831
3840 if (ret == -EBUSY || ret == -EINVAL) { 3832 if (ret == -EBUSY || ret == -EINVAL) {
3841 /* found lock contention or "pc" is obsolete. */ 3833 /* found lock contention or "pc" is obsolete. */
3842 busy = pc; 3834 busy = pc;
3843 cond_resched(); 3835 cond_resched();
3844 } else 3836 } else
3845 busy = NULL; 3837 busy = NULL;
3846 } 3838 }
3847 3839
3848 if (!ret && !list_empty(list)) 3840 if (!ret && !list_empty(list))
3849 return -EBUSY; 3841 return -EBUSY;
3850 return ret; 3842 return ret;
3851 } 3843 }
3852 3844
3853 /* 3845 /*
3854 * make the mem_cgroup's charge 0 if there is no task. 3846 * make the mem_cgroup's charge 0 if there is no task.
3855 * This enables deleting this mem_cgroup. 3847 * This enables deleting this mem_cgroup.
3856 */ 3848 */
3857 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 3849 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
3858 { 3850 {
3859 int ret; 3851 int ret;
3860 int node, zid, shrink; 3852 int node, zid, shrink;
3861 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 3853 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3862 struct cgroup *cgrp = mem->css.cgroup; 3854 struct cgroup *cgrp = mem->css.cgroup;
3863 3855
3864 css_get(&mem->css); 3856 css_get(&mem->css);
3865 3857
3866 shrink = 0; 3858 shrink = 0;
3867 /* should free all ? */ 3859 /* should free all ? */
3868 if (free_all) 3860 if (free_all)
3869 goto try_to_free; 3861 goto try_to_free;
3870 move_account: 3862 move_account:
3871 do { 3863 do {
3872 ret = -EBUSY; 3864 ret = -EBUSY;
3873 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 3865 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3874 goto out; 3866 goto out;
3875 ret = -EINTR; 3867 ret = -EINTR;
3876 if (signal_pending(current)) 3868 if (signal_pending(current))
3877 goto out; 3869 goto out;
3878 /* This is for making all *used* pages be on the LRU. */ 3870 /* This is for making all *used* pages be on the LRU. */
3879 lru_add_drain_all(); 3871 lru_add_drain_all();
3880 drain_all_stock_sync(mem); 3872 drain_all_stock_sync(mem);
3881 ret = 0; 3873 ret = 0;
3882 mem_cgroup_start_move(mem); 3874 mem_cgroup_start_move(mem);
3883 for_each_node_state(node, N_HIGH_MEMORY) { 3875 for_each_node_state(node, N_HIGH_MEMORY) {
3884 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3876 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3885 enum lru_list l; 3877 enum lru_list l;
3886 for_each_lru(l) { 3878 for_each_lru(l) {
3887 ret = mem_cgroup_force_empty_list(mem, 3879 ret = mem_cgroup_force_empty_list(mem,
3888 node, zid, l); 3880 node, zid, l);
3889 if (ret) 3881 if (ret)
3890 break; 3882 break;
3891 } 3883 }
3892 } 3884 }
3893 if (ret) 3885 if (ret)
3894 break; 3886 break;
3895 } 3887 }
3896 mem_cgroup_end_move(mem); 3888 mem_cgroup_end_move(mem);
3897 memcg_oom_recover(mem); 3889 memcg_oom_recover(mem);
3898 /* it seems the parent cgroup doesn't have enough memory */ 3890 /* it seems the parent cgroup doesn't have enough memory */
3899 if (ret == -ENOMEM) 3891 if (ret == -ENOMEM)
3900 goto try_to_free; 3892 goto try_to_free;
3901 cond_resched(); 3893 cond_resched();
3902 /* "ret" should also be checked to ensure all lists are empty. */ 3894 /* "ret" should also be checked to ensure all lists are empty. */
3903 } while (mem->res.usage > 0 || ret); 3895 } while (mem->res.usage > 0 || ret);
3904 out: 3896 out:
3905 css_put(&mem->css); 3897 css_put(&mem->css);
3906 return ret; 3898 return ret;
3907 3899
3908 try_to_free: 3900 try_to_free:
3909 /* returns EBUSY if there is a task or if we come here twice. */ 3901 /* returns EBUSY if there is a task or if we come here twice. */
3910 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 3902 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
3911 ret = -EBUSY; 3903 ret = -EBUSY;
3912 goto out; 3904 goto out;
3913 } 3905 }
3914 /* we call try-to-free pages to make this cgroup empty */ 3906 /* we call try-to-free pages to make this cgroup empty */
3915 lru_add_drain_all(); 3907 lru_add_drain_all();
3916 /* try to free all pages in this cgroup */ 3908 /* try to free all pages in this cgroup */
3917 shrink = 1; 3909 shrink = 1;
3918 while (nr_retries && mem->res.usage > 0) { 3910 while (nr_retries && mem->res.usage > 0) {
3919 struct memcg_scanrecord rec; 3911 struct memcg_scanrecord rec;
3920 int progress; 3912 int progress;
3921 3913
3922 if (signal_pending(current)) { 3914 if (signal_pending(current)) {
3923 ret = -EINTR; 3915 ret = -EINTR;
3924 goto out; 3916 goto out;
3925 } 3917 }
3926 rec.context = SCAN_BY_SHRINK; 3918 rec.context = SCAN_BY_SHRINK;
3927 rec.mem = mem; 3919 rec.mem = mem;
3928 rec.root = mem; 3920 rec.root = mem;
3929 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 3921 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
3930 false, &rec); 3922 false, &rec);
3931 if (!progress) { 3923 if (!progress) {
3932 nr_retries--; 3924 nr_retries--;
3933 /* maybe some writeback is necessary */ 3925 /* maybe some writeback is necessary */
3934 congestion_wait(BLK_RW_ASYNC, HZ/10); 3926 congestion_wait(BLK_RW_ASYNC, HZ/10);
3935 } 3927 }
3936 3928
3937 } 3929 }
3938 lru_add_drain(); 3930 lru_add_drain();
3939 /* try move_account...there may be some *locked* pages. */ 3931 /* try move_account...there may be some *locked* pages. */
3940 goto move_account; 3932 goto move_account;
3941 } 3933 }
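The try_to_free path above boils down to a bounded retry loop: attempt reclaim, and whenever an attempt makes no progress, burn one retry and wait briefly for writeback. A compact stand-alone sketch of that pattern, with try_reclaim() and usleep() as stand-ins for try_to_free_mem_cgroup_pages() and congestion_wait():

/* Sketch only (not kernel code) of the bounded retry-with-backoff loop. */
#include <stdio.h>
#include <unistd.h>

static long usage = 5;		/* pretend pages are still charged */

static int try_reclaim(void)
{
	static int stall = 2;	/* first two attempts make no progress */
	if (stall) {
		stall--;
		return 0;
	}
	usage--;
	return 1;
}

int main(void)
{
	int nr_retries = 5;

	while (nr_retries && usage > 0) {
		if (!try_reclaim()) {
			nr_retries--;
			usleep(100 * 1000);	/* roughly congestion_wait(HZ/10) */
		}
	}
	printf("usage=%ld, retries left=%d\n", usage, nr_retries);
	return 0;
}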
3942 3934
3943 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 3935 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3944 { 3936 {
3945 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 3937 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3946 } 3938 }
3947 3939
3948 3940
3949 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 3941 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
3950 { 3942 {
3951 return mem_cgroup_from_cont(cont)->use_hierarchy; 3943 return mem_cgroup_from_cont(cont)->use_hierarchy;
3952 } 3944 }
3953 3945
3954 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 3946 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3955 u64 val) 3947 u64 val)
3956 { 3948 {
3957 int retval = 0; 3949 int retval = 0;
3958 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3950 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3959 struct cgroup *parent = cont->parent; 3951 struct cgroup *parent = cont->parent;
3960 struct mem_cgroup *parent_mem = NULL; 3952 struct mem_cgroup *parent_mem = NULL;
3961 3953
3962 if (parent) 3954 if (parent)
3963 parent_mem = mem_cgroup_from_cont(parent); 3955 parent_mem = mem_cgroup_from_cont(parent);
3964 3956
3965 cgroup_lock(); 3957 cgroup_lock();
3966 /* 3958 /*
3967 * If parent's use_hierarchy is set, we can't make any modifications 3959 * If parent's use_hierarchy is set, we can't make any modifications
3968 * in the child subtrees. If it is unset, then the change can 3960 * in the child subtrees. If it is unset, then the change can
3969 * occur, provided the current cgroup has no children. 3961 * occur, provided the current cgroup has no children.
3970 * 3962 *
3971 * For the root cgroup, parent_mem is NULL, we allow value to be 3963 * For the root cgroup, parent_mem is NULL, we allow value to be
3972 * set if there are no children. 3964 * set if there are no children.
3973 */ 3965 */
3974 if ((!parent_mem || !parent_mem->use_hierarchy) && 3966 if ((!parent_mem || !parent_mem->use_hierarchy) &&
3975 (val == 1 || val == 0)) { 3967 (val == 1 || val == 0)) {
3976 if (list_empty(&cont->children)) 3968 if (list_empty(&cont->children))
3977 mem->use_hierarchy = val; 3969 mem->use_hierarchy = val;
3978 else 3970 else
3979 retval = -EBUSY; 3971 retval = -EBUSY;
3980 } else 3972 } else
3981 retval = -EINVAL; 3973 retval = -EINVAL;
3982 cgroup_unlock(); 3974 cgroup_unlock();
3983 3975
3984 return retval; 3976 return retval;
3985 } 3977 }
3986 3978
3987 3979
3988 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, 3980 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
3989 enum mem_cgroup_stat_index idx) 3981 enum mem_cgroup_stat_index idx)
3990 { 3982 {
3991 struct mem_cgroup *iter; 3983 struct mem_cgroup *iter;
3992 long val = 0; 3984 long val = 0;
3993 3985
3994 /* Per-cpu values can be negative, use a signed accumulator */ 3986 /* Per-cpu values can be negative, use a signed accumulator */
3995 for_each_mem_cgroup_tree(iter, mem) 3987 for_each_mem_cgroup_tree(iter, mem)
3996 val += mem_cgroup_read_stat(iter, idx); 3988 val += mem_cgroup_read_stat(iter, idx);
3997 3989
3998 if (val < 0) /* race ? */ 3990 if (val < 0) /* race ? */
3999 val = 0; 3991 val = 0;
4000 return val; 3992 return val;
4001 } 3993 }
4002 3994
4003 static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 3995 static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
4004 { 3996 {
4005 u64 val; 3997 u64 val;
4006 3998
4007 if (!mem_cgroup_is_root(mem)) { 3999 if (!mem_cgroup_is_root(mem)) {
4008 if (!swap) 4000 if (!swap)
4009 return res_counter_read_u64(&mem->res, RES_USAGE); 4001 return res_counter_read_u64(&mem->res, RES_USAGE);
4010 else 4002 else
4011 return res_counter_read_u64(&mem->memsw, RES_USAGE); 4003 return res_counter_read_u64(&mem->memsw, RES_USAGE);
4012 } 4004 }
4013 4005
4014 val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); 4006 val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE);
4015 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); 4007 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS);
4016 4008
4017 if (swap) 4009 if (swap)
4018 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 4010 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
4019 4011
4020 return val << PAGE_SHIFT; 4012 return val << PAGE_SHIFT;
4021 } 4013 }
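For the root cgroup, mem_cgroup_usage() above assembles the usage from per-cpu page counters and shifts the page count into bytes. A trivial illustration of that conversion, assuming 4 KiB pages (the PAGE_SHIFT of 12 is an assumption of this sketch):

/* Sketch only: page counts to bytes via a shift. */
#include <stdio.h>
#include <stdint.h>

#define SKETCH_PAGE_SHIFT 12	/* assumed 4 KiB pages; the kernel uses PAGE_SHIFT */

int main(void)
{
	uint64_t cache_pages = 300, rss_pages = 700, swap_pages = 25;
	uint64_t mem   = (cache_pages + rss_pages) << SKETCH_PAGE_SHIFT;
	uint64_t memsw = (cache_pages + rss_pages + swap_pages) << SKETCH_PAGE_SHIFT;

	printf("usage=%llu bytes, usage+swap=%llu bytes\n",
	       (unsigned long long)mem, (unsigned long long)memsw);
	return 0;
}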
4022 4014
4023 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 4015 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
4024 { 4016 {
4025 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4017 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4026 u64 val; 4018 u64 val;
4027 int type, name; 4019 int type, name;
4028 4020
4029 type = MEMFILE_TYPE(cft->private); 4021 type = MEMFILE_TYPE(cft->private);
4030 name = MEMFILE_ATTR(cft->private); 4022 name = MEMFILE_ATTR(cft->private);
4031 switch (type) { 4023 switch (type) {
4032 case _MEM: 4024 case _MEM:
4033 if (name == RES_USAGE) 4025 if (name == RES_USAGE)
4034 val = mem_cgroup_usage(mem, false); 4026 val = mem_cgroup_usage(mem, false);
4035 else 4027 else
4036 val = res_counter_read_u64(&mem->res, name); 4028 val = res_counter_read_u64(&mem->res, name);
4037 break; 4029 break;
4038 case _MEMSWAP: 4030 case _MEMSWAP:
4039 if (name == RES_USAGE) 4031 if (name == RES_USAGE)
4040 val = mem_cgroup_usage(mem, true); 4032 val = mem_cgroup_usage(mem, true);
4041 else 4033 else
4042 val = res_counter_read_u64(&mem->memsw, name); 4034 val = res_counter_read_u64(&mem->memsw, name);
4043 break; 4035 break;
4044 default: 4036 default:
4045 BUG(); 4037 BUG();
4046 break; 4038 break;
4047 } 4039 }
4048 return val; 4040 return val;
4049 } 4041 }
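mem_cgroup_read() above, like the write and reset handlers that follow, decodes cft->private with MEMFILE_TYPE()/MEMFILE_ATTR(), i.e. a resource type and an attribute packed into one integer. The real macros are defined earlier in this file; the encoding below is only a plausible stand-in with an assumed 16-bit split:

/* Illustrative only: pack/unpack a (type, attribute) pair the way the
 * MEMFILE_* helpers conceptually do.  The field width is an assumption
 * of this sketch.
 */
#include <stdio.h>

#define SKETCH_TYPE_SHIFT	16
#define SKETCH_PRIVATE(type, attr)	(((type) << SKETCH_TYPE_SHIFT) | (attr))
#define SKETCH_TYPE(val)	((val) >> SKETCH_TYPE_SHIFT)
#define SKETCH_ATTR(val)	((val) & ((1 << SKETCH_TYPE_SHIFT) - 1))

enum { T_MEM, T_MEMSWAP };			/* stand-ins for _MEM / _MEMSWAP */
enum { A_USAGE, A_LIMIT, A_MAX_USAGE, A_FAILCNT };

int main(void)
{
	int private = SKETCH_PRIVATE(T_MEMSWAP, A_LIMIT);

	printf("type=%d attr=%d\n", SKETCH_TYPE(private), SKETCH_ATTR(private));
	return 0;
}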
4050 /* 4042 /*
4051 * The user of this function is... 4043 * The user of this function is...
4052 * RES_LIMIT. 4044 * RES_LIMIT.
4053 */ 4045 */
4054 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 4046 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
4055 const char *buffer) 4047 const char *buffer)
4056 { 4048 {
4057 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4049 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4058 int type, name; 4050 int type, name;
4059 unsigned long long val; 4051 unsigned long long val;
4060 int ret; 4052 int ret;
4061 4053
4062 type = MEMFILE_TYPE(cft->private); 4054 type = MEMFILE_TYPE(cft->private);
4063 name = MEMFILE_ATTR(cft->private); 4055 name = MEMFILE_ATTR(cft->private);
4064 switch (name) { 4056 switch (name) {
4065 case RES_LIMIT: 4057 case RES_LIMIT:
4066 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 4058 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
4067 ret = -EINVAL; 4059 ret = -EINVAL;
4068 break; 4060 break;
4069 } 4061 }
4070 /* This function does all the necessary parsing...reuse it */ 4062 /* This function does all the necessary parsing...reuse it */
4071 ret = res_counter_memparse_write_strategy(buffer, &val); 4063 ret = res_counter_memparse_write_strategy(buffer, &val);
4072 if (ret) 4064 if (ret)
4073 break; 4065 break;
4074 if (type == _MEM) 4066 if (type == _MEM)
4075 ret = mem_cgroup_resize_limit(memcg, val); 4067 ret = mem_cgroup_resize_limit(memcg, val);
4076 else 4068 else
4077 ret = mem_cgroup_resize_memsw_limit(memcg, val); 4069 ret = mem_cgroup_resize_memsw_limit(memcg, val);
4078 break; 4070 break;
4079 case RES_SOFT_LIMIT: 4071 case RES_SOFT_LIMIT:
4080 ret = res_counter_memparse_write_strategy(buffer, &val); 4072 ret = res_counter_memparse_write_strategy(buffer, &val);
4081 if (ret) 4073 if (ret)
4082 break; 4074 break;
4083 /* 4075 /*
4084 * For memsw, soft limits are hard to implement in terms 4076 * For memsw, soft limits are hard to implement in terms
4085 * of semantics; for now, we support soft limits for 4077 * of semantics; for now, we support soft limits for
4086 * control without swap 4078 * control without swap
4087 */ 4079 */
4088 if (type == _MEM) 4080 if (type == _MEM)
4089 ret = res_counter_set_soft_limit(&memcg->res, val); 4081 ret = res_counter_set_soft_limit(&memcg->res, val);
4090 else 4082 else
4091 ret = -EINVAL; 4083 ret = -EINVAL;
4092 break; 4084 break;
4093 default: 4085 default:
4094 ret = -EINVAL; /* should be BUG() ? */ 4086 ret = -EINVAL; /* should be BUG() ? */
4095 break; 4087 break;
4096 } 4088 }
4097 return ret; 4089 return ret;
4098 } 4090 }
4099 4091
4100 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 4092 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
4101 unsigned long long *mem_limit, unsigned long long *memsw_limit) 4093 unsigned long long *mem_limit, unsigned long long *memsw_limit)
4102 { 4094 {
4103 struct cgroup *cgroup; 4095 struct cgroup *cgroup;
4104 unsigned long long min_limit, min_memsw_limit, tmp; 4096 unsigned long long min_limit, min_memsw_limit, tmp;
4105 4097
4106 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 4098 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4107 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4099 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4108 cgroup = memcg->css.cgroup; 4100 cgroup = memcg->css.cgroup;
4109 if (!memcg->use_hierarchy) 4101 if (!memcg->use_hierarchy)
4110 goto out; 4102 goto out;
4111 4103
4112 while (cgroup->parent) { 4104 while (cgroup->parent) {
4113 cgroup = cgroup->parent; 4105 cgroup = cgroup->parent;
4114 memcg = mem_cgroup_from_cont(cgroup); 4106 memcg = mem_cgroup_from_cont(cgroup);
4115 if (!memcg->use_hierarchy) 4107 if (!memcg->use_hierarchy)
4116 break; 4108 break;
4117 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 4109 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
4118 min_limit = min(min_limit, tmp); 4110 min_limit = min(min_limit, tmp);
4119 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4111 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4120 min_memsw_limit = min(min_memsw_limit, tmp); 4112 min_memsw_limit = min(min_memsw_limit, tmp);
4121 } 4113 }
4122 out: 4114 out:
4123 *mem_limit = min_limit; 4115 *mem_limit = min_limit;
4124 *memsw_limit = min_memsw_limit; 4116 *memsw_limit = min_memsw_limit;
4125 return; 4117 return;
4126 } 4118 }
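memcg_get_hierarchical_limit() above walks up the parents and keeps the minimum limit seen for as long as use_hierarchy stays set, so the reported hierarchical limit is the tightest one on the path toward the root. A small sketch of that walk over a toy parent-linked structure (all names and fields here are invented for the example):

/* Sketch only (not kernel code): take the minimum limit along the
 * parent chain, stopping where use_hierarchy is not set.
 */
#include <stdio.h>

struct node {
	struct node *parent;
	int use_hierarchy;
	unsigned long long limit;
};

static unsigned long long hierarchical_limit(struct node *n)
{
	unsigned long long min_limit = n->limit;

	if (!n->use_hierarchy)
		return min_limit;

	for (n = n->parent; n; n = n->parent) {
		if (!n->use_hierarchy)
			break;
		if (n->limit < min_limit)
			min_limit = n->limit;
	}
	return min_limit;
}

int main(void)
{
	struct node root  = { NULL,  1, 1ULL << 40 };	/* 1 TiB */
	struct node mid   = { &root, 1, 1ULL << 30 };	/* 1 GiB */
	struct node child = { &mid,  1, 1ULL << 33 };	/* 8 GiB */

	printf("effective limit: %llu bytes\n", hierarchical_limit(&child));
	return 0;
}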
4127 4119
4128 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 4120 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4129 { 4121 {
4130 struct mem_cgroup *mem; 4122 struct mem_cgroup *mem;
4131 int type, name; 4123 int type, name;
4132 4124
4133 mem = mem_cgroup_from_cont(cont); 4125 mem = mem_cgroup_from_cont(cont);
4134 type = MEMFILE_TYPE(event); 4126 type = MEMFILE_TYPE(event);
4135 name = MEMFILE_ATTR(event); 4127 name = MEMFILE_ATTR(event);
4136 switch (name) { 4128 switch (name) {
4137 case RES_MAX_USAGE: 4129 case RES_MAX_USAGE:
4138 if (type == _MEM) 4130 if (type == _MEM)
4139 res_counter_reset_max(&mem->res); 4131 res_counter_reset_max(&mem->res);
4140 else 4132 else
4141 res_counter_reset_max(&mem->memsw); 4133 res_counter_reset_max(&mem->memsw);
4142 break; 4134 break;
4143 case RES_FAILCNT: 4135 case RES_FAILCNT:
4144 if (type == _MEM) 4136 if (type == _MEM)
4145 res_counter_reset_failcnt(&mem->res); 4137 res_counter_reset_failcnt(&mem->res);
4146 else 4138 else
4147 res_counter_reset_failcnt(&mem->memsw); 4139 res_counter_reset_failcnt(&mem->memsw);
4148 break; 4140 break;
4149 } 4141 }
4150 4142
4151 return 0; 4143 return 0;
4152 } 4144 }
4153 4145
4154 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, 4146 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
4155 struct cftype *cft) 4147 struct cftype *cft)
4156 { 4148 {
4157 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 4149 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
4158 } 4150 }
4159 4151
4160 #ifdef CONFIG_MMU 4152 #ifdef CONFIG_MMU
4161 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 4153 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4162 struct cftype *cft, u64 val) 4154 struct cftype *cft, u64 val)
4163 { 4155 {
4164 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4156 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4165 4157
4166 if (val >= (1 << NR_MOVE_TYPE)) 4158 if (val >= (1 << NR_MOVE_TYPE))
4167 return -EINVAL; 4159 return -EINVAL;
4168 /* 4160 /*
4169 * We check this value several times, both in can_attach() and 4161 * We check this value several times, both in can_attach() and
4170 * attach(), so we need the cgroup lock to prevent this value from being 4162 * attach(), so we need the cgroup lock to prevent this value from being
4171 * inconsistent. 4163 * inconsistent.
4172 */ 4164 */
4173 cgroup_lock(); 4165 cgroup_lock();
4174 mem->move_charge_at_immigrate = val; 4166 mem->move_charge_at_immigrate = val;
4175 cgroup_unlock(); 4167 cgroup_unlock();
4176 4168
4177 return 0; 4169 return 0;
4178 } 4170 }
4179 #else 4171 #else
4180 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 4172 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4181 struct cftype *cft, u64 val) 4173 struct cftype *cft, u64 val)
4182 { 4174 {
4183 return -ENOSYS; 4175 return -ENOSYS;
4184 } 4176 }
4185 #endif 4177 #endif
4186 4178
4187 4179
4188 /* For read statistics */ 4180 /* For read statistics */
4189 enum { 4181 enum {
4190 MCS_CACHE, 4182 MCS_CACHE,
4191 MCS_RSS, 4183 MCS_RSS,
4192 MCS_FILE_MAPPED, 4184 MCS_FILE_MAPPED,
4193 MCS_PGPGIN, 4185 MCS_PGPGIN,
4194 MCS_PGPGOUT, 4186 MCS_PGPGOUT,
4195 MCS_SWAP, 4187 MCS_SWAP,
4196 MCS_PGFAULT, 4188 MCS_PGFAULT,
4197 MCS_PGMAJFAULT, 4189 MCS_PGMAJFAULT,
4198 MCS_INACTIVE_ANON, 4190 MCS_INACTIVE_ANON,
4199 MCS_ACTIVE_ANON, 4191 MCS_ACTIVE_ANON,
4200 MCS_INACTIVE_FILE, 4192 MCS_INACTIVE_FILE,
4201 MCS_ACTIVE_FILE, 4193 MCS_ACTIVE_FILE,
4202 MCS_UNEVICTABLE, 4194 MCS_UNEVICTABLE,
4203 NR_MCS_STAT, 4195 NR_MCS_STAT,
4204 }; 4196 };
4205 4197
4206 struct mcs_total_stat { 4198 struct mcs_total_stat {
4207 s64 stat[NR_MCS_STAT]; 4199 s64 stat[NR_MCS_STAT];
4208 }; 4200 };
4209 4201
4210 struct { 4202 struct {
4211 char *local_name; 4203 char *local_name;
4212 char *total_name; 4204 char *total_name;
4213 } memcg_stat_strings[NR_MCS_STAT] = { 4205 } memcg_stat_strings[NR_MCS_STAT] = {
4214 {"cache", "total_cache"}, 4206 {"cache", "total_cache"},
4215 {"rss", "total_rss"}, 4207 {"rss", "total_rss"},
4216 {"mapped_file", "total_mapped_file"}, 4208 {"mapped_file", "total_mapped_file"},
4217 {"pgpgin", "total_pgpgin"}, 4209 {"pgpgin", "total_pgpgin"},
4218 {"pgpgout", "total_pgpgout"}, 4210 {"pgpgout", "total_pgpgout"},
4219 {"swap", "total_swap"}, 4211 {"swap", "total_swap"},
4220 {"pgfault", "total_pgfault"}, 4212 {"pgfault", "total_pgfault"},
4221 {"pgmajfault", "total_pgmajfault"}, 4213 {"pgmajfault", "total_pgmajfault"},
4222 {"inactive_anon", "total_inactive_anon"}, 4214 {"inactive_anon", "total_inactive_anon"},
4223 {"active_anon", "total_active_anon"}, 4215 {"active_anon", "total_active_anon"},
4224 {"inactive_file", "total_inactive_file"}, 4216 {"inactive_file", "total_inactive_file"},
4225 {"active_file", "total_active_file"}, 4217 {"active_file", "total_active_file"},
4226 {"unevictable", "total_unevictable"} 4218 {"unevictable", "total_unevictable"}
4227 }; 4219 };
4228 4220
4229 4221
4230 static void 4222 static void
4231 mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 4223 mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
4232 { 4224 {
4233 s64 val; 4225 s64 val;
4234 4226
4235 /* per cpu stat */ 4227 /* per cpu stat */
4236 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 4228 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
4237 s->stat[MCS_CACHE] += val * PAGE_SIZE; 4229 s->stat[MCS_CACHE] += val * PAGE_SIZE;
4238 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 4230 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
4239 s->stat[MCS_RSS] += val * PAGE_SIZE; 4231 s->stat[MCS_RSS] += val * PAGE_SIZE;
4240 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); 4232 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
4241 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 4233 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
4242 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); 4234 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN);
4243 s->stat[MCS_PGPGIN] += val; 4235 s->stat[MCS_PGPGIN] += val;
4244 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); 4236 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT);
4245 s->stat[MCS_PGPGOUT] += val; 4237 s->stat[MCS_PGPGOUT] += val;
4246 if (do_swap_account) { 4238 if (do_swap_account) {
4247 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 4239 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
4248 s->stat[MCS_SWAP] += val * PAGE_SIZE; 4240 s->stat[MCS_SWAP] += val * PAGE_SIZE;
4249 } 4241 }
4250 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT); 4242 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT);
4251 s->stat[MCS_PGFAULT] += val; 4243 s->stat[MCS_PGFAULT] += val;
4252 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT); 4244 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT);
4253 s->stat[MCS_PGMAJFAULT] += val; 4245 s->stat[MCS_PGMAJFAULT] += val;
4254 4246
4255 /* per zone stat */ 4247 /* per zone stat */
4256 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON)); 4248 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON));
4257 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 4249 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
4258 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON)); 4250 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON));
4259 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 4251 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
4260 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE)); 4252 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE));
4261 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 4253 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
4262 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE)); 4254 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE));
4263 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 4255 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
4264 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE)); 4256 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE));
4265 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 4257 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
4266 } 4258 }
4267 4259
4268 static void 4260 static void
4269 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 4261 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
4270 { 4262 {
4271 struct mem_cgroup *iter; 4263 struct mem_cgroup *iter;
4272 4264
4273 for_each_mem_cgroup_tree(iter, mem) 4265 for_each_mem_cgroup_tree(iter, mem)
4274 mem_cgroup_get_local_stat(iter, s); 4266 mem_cgroup_get_local_stat(iter, s);
4275 } 4267 }
4276 4268
4277 #ifdef CONFIG_NUMA 4269 #ifdef CONFIG_NUMA
4278 static int mem_control_numa_stat_show(struct seq_file *m, void *arg) 4270 static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4279 { 4271 {
4280 int nid; 4272 int nid;
4281 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 4273 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4282 unsigned long node_nr; 4274 unsigned long node_nr;
4283 struct cgroup *cont = m->private; 4275 struct cgroup *cont = m->private;
4284 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4276 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4285 4277
4286 total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL); 4278 total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL);
4287 seq_printf(m, "total=%lu", total_nr); 4279 seq_printf(m, "total=%lu", total_nr);
4288 for_each_node_state(nid, N_HIGH_MEMORY) { 4280 for_each_node_state(nid, N_HIGH_MEMORY) {
4289 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL); 4281 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL);
4290 seq_printf(m, " N%d=%lu", nid, node_nr); 4282 seq_printf(m, " N%d=%lu", nid, node_nr);
4291 } 4283 }
4292 seq_putc(m, '\n'); 4284 seq_putc(m, '\n');
4293 4285
4294 file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE); 4286 file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE);
4295 seq_printf(m, "file=%lu", file_nr); 4287 seq_printf(m, "file=%lu", file_nr);
4296 for_each_node_state(nid, N_HIGH_MEMORY) { 4288 for_each_node_state(nid, N_HIGH_MEMORY) {
4297 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4289 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4298 LRU_ALL_FILE); 4290 LRU_ALL_FILE);
4299 seq_printf(m, " N%d=%lu", nid, node_nr); 4291 seq_printf(m, " N%d=%lu", nid, node_nr);
4300 } 4292 }
4301 seq_putc(m, '\n'); 4293 seq_putc(m, '\n');
4302 4294
4303 anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON); 4295 anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON);
4304 seq_printf(m, "anon=%lu", anon_nr); 4296 seq_printf(m, "anon=%lu", anon_nr);
4305 for_each_node_state(nid, N_HIGH_MEMORY) { 4297 for_each_node_state(nid, N_HIGH_MEMORY) {
4306 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4298 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4307 LRU_ALL_ANON); 4299 LRU_ALL_ANON);
4308 seq_printf(m, " N%d=%lu", nid, node_nr); 4300 seq_printf(m, " N%d=%lu", nid, node_nr);
4309 } 4301 }
4310 seq_putc(m, '\n'); 4302 seq_putc(m, '\n');
4311 4303
4312 unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE)); 4304 unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE));
4313 seq_printf(m, "unevictable=%lu", unevictable_nr); 4305 seq_printf(m, "unevictable=%lu", unevictable_nr);
4314 for_each_node_state(nid, N_HIGH_MEMORY) { 4306 for_each_node_state(nid, N_HIGH_MEMORY) {
4315 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4307 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4316 BIT(LRU_UNEVICTABLE)); 4308 BIT(LRU_UNEVICTABLE));
4317 seq_printf(m, " N%d=%lu", nid, node_nr); 4309 seq_printf(m, " N%d=%lu", nid, node_nr);
4318 } 4310 }
4319 seq_putc(m, '\n'); 4311 seq_putc(m, '\n');
4320 return 0; 4312 return 0;
4321 } 4313 }
4322 #endif /* CONFIG_NUMA */ 4314 #endif /* CONFIG_NUMA */
4323 4315
4324 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4316 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4325 struct cgroup_map_cb *cb) 4317 struct cgroup_map_cb *cb)
4326 { 4318 {
4327 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4319 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4328 struct mcs_total_stat mystat; 4320 struct mcs_total_stat mystat;
4329 int i; 4321 int i;
4330 4322
4331 memset(&mystat, 0, sizeof(mystat)); 4323 memset(&mystat, 0, sizeof(mystat));
4332 mem_cgroup_get_local_stat(mem_cont, &mystat); 4324 mem_cgroup_get_local_stat(mem_cont, &mystat);
4333 4325
4334 4326
4335 for (i = 0; i < NR_MCS_STAT; i++) { 4327 for (i = 0; i < NR_MCS_STAT; i++) {
4336 if (i == MCS_SWAP && !do_swap_account) 4328 if (i == MCS_SWAP && !do_swap_account)
4337 continue; 4329 continue;
4338 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 4330 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
4339 } 4331 }
4340 4332
4341 /* Hierarchical information */ 4333 /* Hierarchical information */
4342 { 4334 {
4343 unsigned long long limit, memsw_limit; 4335 unsigned long long limit, memsw_limit;
4344 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 4336 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
4345 cb->fill(cb, "hierarchical_memory_limit", limit); 4337 cb->fill(cb, "hierarchical_memory_limit", limit);
4346 if (do_swap_account) 4338 if (do_swap_account)
4347 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 4339 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
4348 } 4340 }
4349 4341
4350 memset(&mystat, 0, sizeof(mystat)); 4342 memset(&mystat, 0, sizeof(mystat));
4351 mem_cgroup_get_total_stat(mem_cont, &mystat); 4343 mem_cgroup_get_total_stat(mem_cont, &mystat);
4352 for (i = 0; i < NR_MCS_STAT; i++) { 4344 for (i = 0; i < NR_MCS_STAT; i++) {
4353 if (i == MCS_SWAP && !do_swap_account) 4345 if (i == MCS_SWAP && !do_swap_account)
4354 continue; 4346 continue;
4355 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 4347 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
4356 } 4348 }
4357 4349
4358 #ifdef CONFIG_DEBUG_VM 4350 #ifdef CONFIG_DEBUG_VM
4359 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 4351 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
4360 4352
4361 { 4353 {
4362 int nid, zid; 4354 int nid, zid;
4363 struct mem_cgroup_per_zone *mz; 4355 struct mem_cgroup_per_zone *mz;
4364 unsigned long recent_rotated[2] = {0, 0}; 4356 unsigned long recent_rotated[2] = {0, 0};
4365 unsigned long recent_scanned[2] = {0, 0}; 4357 unsigned long recent_scanned[2] = {0, 0};
4366 4358
4367 for_each_online_node(nid) 4359 for_each_online_node(nid)
4368 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4360 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4369 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 4361 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
4370 4362
4371 recent_rotated[0] += 4363 recent_rotated[0] +=
4372 mz->reclaim_stat.recent_rotated[0]; 4364 mz->reclaim_stat.recent_rotated[0];
4373 recent_rotated[1] += 4365 recent_rotated[1] +=
4374 mz->reclaim_stat.recent_rotated[1]; 4366 mz->reclaim_stat.recent_rotated[1];
4375 recent_scanned[0] += 4367 recent_scanned[0] +=
4376 mz->reclaim_stat.recent_scanned[0]; 4368 mz->reclaim_stat.recent_scanned[0];
4377 recent_scanned[1] += 4369 recent_scanned[1] +=
4378 mz->reclaim_stat.recent_scanned[1]; 4370 mz->reclaim_stat.recent_scanned[1];
4379 } 4371 }
4380 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 4372 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
4381 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 4373 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
4382 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 4374 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
4383 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 4375 cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
4384 } 4376 }
4385 #endif 4377 #endif
4386 4378
4387 return 0; 4379 return 0;
4388 } 4380 }
4389 4381
4390 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 4382 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
4391 { 4383 {
4392 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4384 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4393 4385
4394 return mem_cgroup_swappiness(memcg); 4386 return mem_cgroup_swappiness(memcg);
4395 } 4387 }
4396 4388
4397 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 4389 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
4398 u64 val) 4390 u64 val)
4399 { 4391 {
4400 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4392 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4401 struct mem_cgroup *parent; 4393 struct mem_cgroup *parent;
4402 4394
4403 if (val > 100) 4395 if (val > 100)
4404 return -EINVAL; 4396 return -EINVAL;
4405 4397
4406 if (cgrp->parent == NULL) 4398 if (cgrp->parent == NULL)
4407 return -EINVAL; 4399 return -EINVAL;
4408 4400
4409 parent = mem_cgroup_from_cont(cgrp->parent); 4401 parent = mem_cgroup_from_cont(cgrp->parent);
4410 4402
4411 cgroup_lock(); 4403 cgroup_lock();
4412 4404
4413 /* If under hierarchy, only empty-root can set this value */ 4405 /* If under hierarchy, only empty-root can set this value */
4414 if ((parent->use_hierarchy) || 4406 if ((parent->use_hierarchy) ||
4415 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 4407 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4416 cgroup_unlock(); 4408 cgroup_unlock();
4417 return -EINVAL; 4409 return -EINVAL;
4418 } 4410 }
4419 4411
4420 memcg->swappiness = val; 4412 memcg->swappiness = val;
4421 4413
4422 cgroup_unlock(); 4414 cgroup_unlock();
4423 4415
4424 return 0; 4416 return 0;
4425 } 4417 }
4426 4418
4427 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4419 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4428 { 4420 {
4429 struct mem_cgroup_threshold_ary *t; 4421 struct mem_cgroup_threshold_ary *t;
4430 u64 usage; 4422 u64 usage;
4431 int i; 4423 int i;
4432 4424
4433 rcu_read_lock(); 4425 rcu_read_lock();
4434 if (!swap) 4426 if (!swap)
4435 t = rcu_dereference(memcg->thresholds.primary); 4427 t = rcu_dereference(memcg->thresholds.primary);
4436 else 4428 else
4437 t = rcu_dereference(memcg->memsw_thresholds.primary); 4429 t = rcu_dereference(memcg->memsw_thresholds.primary);
4438 4430
4439 if (!t) 4431 if (!t)
4440 goto unlock; 4432 goto unlock;
4441 4433
4442 usage = mem_cgroup_usage(memcg, swap); 4434 usage = mem_cgroup_usage(memcg, swap);
4443 4435
4444 /* 4436 /*
4445 * current_threshold points to the threshold just below usage. 4437 * current_threshold points to the threshold just below usage.
4446 * If that is not true, a threshold was crossed after the last 4438 * If that is not true, a threshold was crossed after the last
4447 * call of __mem_cgroup_threshold(). 4439 * call of __mem_cgroup_threshold().
4448 */ 4440 */
4449 i = t->current_threshold; 4441 i = t->current_threshold;
4450 4442
4451 /* 4443 /*
4452 * Iterate backward over array of thresholds starting from 4444 * Iterate backward over array of thresholds starting from
4453 * current_threshold and check if a threshold is crossed. 4445 * current_threshold and check if a threshold is crossed.
4454 * If none of thresholds below usage is crossed, we read 4446 * If none of thresholds below usage is crossed, we read
4455 * only one element of the array here. 4447 * only one element of the array here.
4456 */ 4448 */
4457 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 4449 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4458 eventfd_signal(t->entries[i].eventfd, 1); 4450 eventfd_signal(t->entries[i].eventfd, 1);
4459 4451
4460 /* i = current_threshold + 1 */ 4452 /* i = current_threshold + 1 */
4461 i++; 4453 i++;
4462 4454
4463 /* 4455 /*
4464 * Iterate forward over array of thresholds starting from 4456 * Iterate forward over array of thresholds starting from
4465 * current_threshold+1 and check if a threshold is crossed. 4457 * current_threshold+1 and check if a threshold is crossed.
4466 * If none of thresholds above usage is crossed, we read 4458 * If none of thresholds above usage is crossed, we read
4467 * only one element of the array here. 4459 * only one element of the array here.
4468 */ 4460 */
4469 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4461 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4470 eventfd_signal(t->entries[i].eventfd, 1); 4462 eventfd_signal(t->entries[i].eventfd, 1);
4471 4463
4472 /* Update current_threshold */ 4464 /* Update current_threshold */
4473 t->current_threshold = i - 1; 4465 t->current_threshold = i - 1;
4474 unlock: 4466 unlock:
4475 rcu_read_unlock(); 4467 rcu_read_unlock();
4476 } 4468 }
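__mem_cgroup_threshold() above depends on the thresholds living in a sorted array, with current_threshold remembering the largest entry at or below the previously seen usage; crossings are found by scanning backward and then forward from that index. A stand-alone sketch of the same scan, with printf() standing in for eventfd_signal():

/* Sketch only (not kernel code) of the sorted-threshold scan: signal
 * every entry between the previous position and the current usage,
 * then remember the new position.
 */
#include <stdio.h>

struct entry { unsigned long long threshold; };

static struct entry entries[] = {
	{ 100 }, { 200 }, { 300 }, { 400 },
};
static int nr_entries = 4;
static int current_threshold = 1;	/* last usage was between 200 and 300 */

static void check_thresholds(unsigned long long usage)
{
	int i = current_threshold;

	/* usage dropped: signal the entries we fell below */
	for (; i >= 0 && entries[i].threshold > usage; i--)
		printf("signal (fell below %llu)\n", entries[i].threshold);

	/* usage grew: signal the entries we climbed above */
	for (i++; i < nr_entries && entries[i].threshold <= usage; i++)
		printf("signal (rose above %llu)\n", entries[i].threshold);

	current_threshold = i - 1;
}

int main(void)
{
	check_thresholds(350);	/* crosses the 300 threshold upward */
	check_thresholds(150);	/* falls back below 300 and 200 */
	return 0;
}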
4477 4469
4478 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4470 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4479 { 4471 {
4480 while (memcg) { 4472 while (memcg) {
4481 __mem_cgroup_threshold(memcg, false); 4473 __mem_cgroup_threshold(memcg, false);
4482 if (do_swap_account) 4474 if (do_swap_account)
4483 __mem_cgroup_threshold(memcg, true); 4475 __mem_cgroup_threshold(memcg, true);
4484 4476
4485 memcg = parent_mem_cgroup(memcg); 4477 memcg = parent_mem_cgroup(memcg);
4486 } 4478 }
4487 } 4479 }
4488 4480
4489 static int compare_thresholds(const void *a, const void *b) 4481 static int compare_thresholds(const void *a, const void *b)
4490 { 4482 {
4491 const struct mem_cgroup_threshold *_a = a; 4483 const struct mem_cgroup_threshold *_a = a;
4492 const struct mem_cgroup_threshold *_b = b; 4484 const struct mem_cgroup_threshold *_b = b;
4493 4485
4494 return _a->threshold - _b->threshold; 4486 return _a->threshold - _b->threshold;
4495 } 4487 }
4496 4488
4497 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) 4489 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
4498 { 4490 {
4499 struct mem_cgroup_eventfd_list *ev; 4491 struct mem_cgroup_eventfd_list *ev;
4500 4492
4501 list_for_each_entry(ev, &mem->oom_notify, list) 4493 list_for_each_entry(ev, &mem->oom_notify, list)
4502 eventfd_signal(ev->eventfd, 1); 4494 eventfd_signal(ev->eventfd, 1);
4503 return 0; 4495 return 0;
4504 } 4496 }
4505 4497
4506 static void mem_cgroup_oom_notify(struct mem_cgroup *mem) 4498 static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
4507 { 4499 {
4508 struct mem_cgroup *iter; 4500 struct mem_cgroup *iter;
4509 4501
4510 for_each_mem_cgroup_tree(iter, mem) 4502 for_each_mem_cgroup_tree(iter, mem)
4511 mem_cgroup_oom_notify_cb(iter); 4503 mem_cgroup_oom_notify_cb(iter);
4512 } 4504 }
4513 4505
4514 static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 4506 static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4515 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4507 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4516 { 4508 {
4517 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4509 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4518 struct mem_cgroup_thresholds *thresholds; 4510 struct mem_cgroup_thresholds *thresholds;
4519 struct mem_cgroup_threshold_ary *new; 4511 struct mem_cgroup_threshold_ary *new;
4520 int type = MEMFILE_TYPE(cft->private); 4512 int type = MEMFILE_TYPE(cft->private);
4521 u64 threshold, usage; 4513 u64 threshold, usage;
4522 int i, size, ret; 4514 int i, size, ret;
4523 4515
4524 ret = res_counter_memparse_write_strategy(args, &threshold); 4516 ret = res_counter_memparse_write_strategy(args, &threshold);
4525 if (ret) 4517 if (ret)
4526 return ret; 4518 return ret;
4527 4519
4528 mutex_lock(&memcg->thresholds_lock); 4520 mutex_lock(&memcg->thresholds_lock);
4529 4521
4530 if (type == _MEM) 4522 if (type == _MEM)
4531 thresholds = &memcg->thresholds; 4523 thresholds = &memcg->thresholds;
4532 else if (type == _MEMSWAP) 4524 else if (type == _MEMSWAP)
4533 thresholds = &memcg->memsw_thresholds; 4525 thresholds = &memcg->memsw_thresholds;
4534 else 4526 else
4535 BUG(); 4527 BUG();
4536 4528
4537 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4529 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4538 4530
4539 /* Check if a threshold crossed before adding a new one */ 4531 /* Check if a threshold crossed before adding a new one */
4540 if (thresholds->primary) 4532 if (thresholds->primary)
4541 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4533 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4542 4534
4543 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 4535 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4544 4536
4545 /* Allocate memory for new array of thresholds */ 4537 /* Allocate memory for new array of thresholds */
4546 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 4538 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
4547 GFP_KERNEL); 4539 GFP_KERNEL);
4548 if (!new) { 4540 if (!new) {
4549 ret = -ENOMEM; 4541 ret = -ENOMEM;
4550 goto unlock; 4542 goto unlock;
4551 } 4543 }
4552 new->size = size; 4544 new->size = size;
4553 4545
4554 /* Copy thresholds (if any) to new array */ 4546 /* Copy thresholds (if any) to new array */
4555 if (thresholds->primary) { 4547 if (thresholds->primary) {
4556 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 4548 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4557 sizeof(struct mem_cgroup_threshold)); 4549 sizeof(struct mem_cgroup_threshold));
4558 } 4550 }
4559 4551
4560 /* Add new threshold */ 4552 /* Add new threshold */
4561 new->entries[size - 1].eventfd = eventfd; 4553 new->entries[size - 1].eventfd = eventfd;
4562 new->entries[size - 1].threshold = threshold; 4554 new->entries[size - 1].threshold = threshold;
4563 4555
4564 /* Sort thresholds. Registering a new threshold isn't time-critical */ 4556 /* Sort thresholds. Registering a new threshold isn't time-critical */
4565 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 4557 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4566 compare_thresholds, NULL); 4558 compare_thresholds, NULL);
4567 4559
4568 /* Find current threshold */ 4560 /* Find current threshold */
4569 new->current_threshold = -1; 4561 new->current_threshold = -1;
4570 for (i = 0; i < size; i++) { 4562 for (i = 0; i < size; i++) {
4571 if (new->entries[i].threshold < usage) { 4563 if (new->entries[i].threshold < usage) {
4572 /* 4564 /*
4573 * new->current_threshold will not be used until 4565 * new->current_threshold will not be used until
4574 * rcu_assign_pointer(), so it's safe to increment 4566 * rcu_assign_pointer(), so it's safe to increment
4575 * it here. 4567 * it here.
4576 */ 4568 */
4577 ++new->current_threshold; 4569 ++new->current_threshold;
4578 } 4570 }
4579 } 4571 }
4580 4572
4581 /* Free old spare buffer and save old primary buffer as spare */ 4573 /* Free old spare buffer and save old primary buffer as spare */
4582 kfree(thresholds->spare); 4574 kfree(thresholds->spare);
4583 thresholds->spare = thresholds->primary; 4575 thresholds->spare = thresholds->primary;
4584 4576
4585 rcu_assign_pointer(thresholds->primary, new); 4577 rcu_assign_pointer(thresholds->primary, new);
4586 4578
4587 /* To be sure that nobody uses thresholds */ 4579 /* To be sure that nobody uses thresholds */
4588 synchronize_rcu(); 4580 synchronize_rcu();
4589 4581
4590 unlock: 4582 unlock:
4591 mutex_unlock(&memcg->thresholds_lock); 4583 mutex_unlock(&memcg->thresholds_lock);
4592 4584
4593 return ret; 4585 return ret;
4594 } 4586 }
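Registering a threshold above follows the classic copy-update pattern: build a new array next to the live one, publish it, wait for readers, and keep the old array as a spare for the next update. A stripped-down user-space sketch of that sequence; the RCU primitives themselves are only indicated in comments:

/* Sketch only (not kernel code): readers always see either the old or
 * the new array, never a half-built one.
 */
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct thresh_array {
	int size;
	unsigned long long entries[];	/* kept sorted in the real code */
};

static struct thresh_array *primary;	/* what readers dereference */
static struct thresh_array *spare;	/* old array kept for reuse */

static int add_threshold(unsigned long long value)
{
	int size = primary ? primary->size + 1 : 1;
	struct thresh_array *new;

	new = malloc(sizeof(*new) + size * sizeof(new->entries[0]));
	if (!new)
		return -1;
	new->size = size;
	if (primary)
		memcpy(new->entries, primary->entries,
		       (size - 1) * sizeof(new->entries[0]));
	new->entries[size - 1] = value;
	/* the kernel sort()s the array here; omitted in this sketch */

	free(spare);			/* kfree(thresholds->spare) */
	spare = primary;		/* old primary becomes the spare */
	primary = new;			/* rcu_assign_pointer(primary, new) */
	/* synchronize_rcu() would go here before the spare is reused */
	return 0;
}

int main(void)
{
	add_threshold(100);
	add_threshold(50);
	printf("%d thresholds installed\n", primary->size);
	return 0;
}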
4595 4587
4596 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, 4588 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4597 struct cftype *cft, struct eventfd_ctx *eventfd) 4589 struct cftype *cft, struct eventfd_ctx *eventfd)
4598 { 4590 {
4599 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4591 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4600 struct mem_cgroup_thresholds *thresholds; 4592 struct mem_cgroup_thresholds *thresholds;
4601 struct mem_cgroup_threshold_ary *new; 4593 struct mem_cgroup_threshold_ary *new;
4602 int type = MEMFILE_TYPE(cft->private); 4594 int type = MEMFILE_TYPE(cft->private);
4603 u64 usage; 4595 u64 usage;
4604 int i, j, size; 4596 int i, j, size;
4605 4597
4606 mutex_lock(&memcg->thresholds_lock); 4598 mutex_lock(&memcg->thresholds_lock);
4607 if (type == _MEM) 4599 if (type == _MEM)
4608 thresholds = &memcg->thresholds; 4600 thresholds = &memcg->thresholds;
4609 else if (type == _MEMSWAP) 4601 else if (type == _MEMSWAP)
4610 thresholds = &memcg->memsw_thresholds; 4602 thresholds = &memcg->memsw_thresholds;
4611 else 4603 else
4612 BUG(); 4604 BUG();
4613 4605
4614 /* 4606 /*
4615 * Something went wrong if we are trying to unregister a threshold 4607 * Something went wrong if we are trying to unregister a threshold
4616 * when we don't have any thresholds 4608 * when we don't have any thresholds
4617 */ 4609 */
4618 BUG_ON(!thresholds); 4610 BUG_ON(!thresholds);
4619 4611
4620 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4612 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4621 4613
4622 /* Check if a threshold crossed before removing */ 4614 /* Check if a threshold crossed before removing */
4623 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4615 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4624 4616
4625 /* Calculate the new number of thresholds */ 4617 /* Calculate the new number of thresholds */
4626 size = 0; 4618 size = 0;
4627 for (i = 0; i < thresholds->primary->size; i++) { 4619 for (i = 0; i < thresholds->primary->size; i++) {
4628 if (thresholds->primary->entries[i].eventfd != eventfd) 4620 if (thresholds->primary->entries[i].eventfd != eventfd)
4629 size++; 4621 size++;
4630 } 4622 }
4631 4623
4632 new = thresholds->spare; 4624 new = thresholds->spare;
4633 4625
4634 /* Set thresholds array to NULL if we don't have thresholds */ 4626 /* Set thresholds array to NULL if we don't have thresholds */
4635 if (!size) { 4627 if (!size) {
4636 kfree(new); 4628 kfree(new);
4637 new = NULL; 4629 new = NULL;
4638 goto swap_buffers; 4630 goto swap_buffers;
4639 } 4631 }
4640 4632
4641 new->size = size; 4633 new->size = size;
4642 4634
4643 /* Copy thresholds and find current threshold */ 4635 /* Copy thresholds and find current threshold */
4644 new->current_threshold = -1; 4636 new->current_threshold = -1;
4645 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4637 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4646 if (thresholds->primary->entries[i].eventfd == eventfd) 4638 if (thresholds->primary->entries[i].eventfd == eventfd)
4647 continue; 4639 continue;
4648 4640
4649 new->entries[j] = thresholds->primary->entries[i]; 4641 new->entries[j] = thresholds->primary->entries[i];
4650 if (new->entries[j].threshold < usage) { 4642 if (new->entries[j].threshold < usage) {
4651 /* 4643 /*
4652 * new->current_threshold will not be used 4644 * new->current_threshold will not be used
4653 * until rcu_assign_pointer(), so it's safe to increment 4645 * until rcu_assign_pointer(), so it's safe to increment
4654 * it here. 4646 * it here.
4655 */ 4647 */
4656 ++new->current_threshold; 4648 ++new->current_threshold;
4657 } 4649 }
4658 j++; 4650 j++;
4659 } 4651 }
4660 4652
4661 swap_buffers: 4653 swap_buffers:
4662 /* Swap primary and spare array */ 4654 /* Swap primary and spare array */
4663 thresholds->spare = thresholds->primary; 4655 thresholds->spare = thresholds->primary;
4664 rcu_assign_pointer(thresholds->primary, new); 4656 rcu_assign_pointer(thresholds->primary, new);
4665 4657
4666 /* To be sure that nobody uses thresholds */ 4658 /* To be sure that nobody uses thresholds */
4667 synchronize_rcu(); 4659 synchronize_rcu();
4668 4660
4669 mutex_unlock(&memcg->thresholds_lock); 4661 mutex_unlock(&memcg->thresholds_lock);
4670 } 4662 }
4671 4663
4672 static int mem_cgroup_oom_register_event(struct cgroup *cgrp, 4664 static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4673 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4665 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4674 { 4666 {
4675 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4667 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4676 struct mem_cgroup_eventfd_list *event; 4668 struct mem_cgroup_eventfd_list *event;
4677 int type = MEMFILE_TYPE(cft->private); 4669 int type = MEMFILE_TYPE(cft->private);
4678 4670
4679 BUG_ON(type != _OOM_TYPE); 4671 BUG_ON(type != _OOM_TYPE);
4680 event = kmalloc(sizeof(*event), GFP_KERNEL); 4672 event = kmalloc(sizeof(*event), GFP_KERNEL);
4681 if (!event) 4673 if (!event)
4682 return -ENOMEM; 4674 return -ENOMEM;
4683 4675
4684 spin_lock(&memcg_oom_lock); 4676 spin_lock(&memcg_oom_lock);
4685 4677
4686 event->eventfd = eventfd; 4678 event->eventfd = eventfd;
4687 list_add(&event->list, &memcg->oom_notify); 4679 list_add(&event->list, &memcg->oom_notify);
4688 4680
4689 /* already in OOM ? */ 4681 /* already in OOM ? */
4690 if (atomic_read(&memcg->under_oom)) 4682 if (atomic_read(&memcg->under_oom))
4691 eventfd_signal(eventfd, 1); 4683 eventfd_signal(eventfd, 1);
4692 spin_unlock(&memcg_oom_lock); 4684 spin_unlock(&memcg_oom_lock);
4693 4685
4694 return 0; 4686 return 0;
4695 } 4687 }
4696 4688
4697 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 4689 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4698 struct cftype *cft, struct eventfd_ctx *eventfd) 4690 struct cftype *cft, struct eventfd_ctx *eventfd)
4699 { 4691 {
4700 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4692 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4701 struct mem_cgroup_eventfd_list *ev, *tmp; 4693 struct mem_cgroup_eventfd_list *ev, *tmp;
4702 int type = MEMFILE_TYPE(cft->private); 4694 int type = MEMFILE_TYPE(cft->private);
4703 4695
4704 BUG_ON(type != _OOM_TYPE); 4696 BUG_ON(type != _OOM_TYPE);
4705 4697
4706 spin_lock(&memcg_oom_lock); 4698 spin_lock(&memcg_oom_lock);
4707 4699
4708 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { 4700 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
4709 if (ev->eventfd == eventfd) { 4701 if (ev->eventfd == eventfd) {
4710 list_del(&ev->list); 4702 list_del(&ev->list);
4711 kfree(ev); 4703 kfree(ev);
4712 } 4704 }
4713 } 4705 }
4714 4706
4715 spin_unlock(&memcg_oom_lock); 4707 spin_unlock(&memcg_oom_lock);
4716 } 4708 }
4717 4709
4718 static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 4710 static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4719 struct cftype *cft, struct cgroup_map_cb *cb) 4711 struct cftype *cft, struct cgroup_map_cb *cb)
4720 { 4712 {
4721 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4713 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4722 4714
4723 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); 4715 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
4724 4716
4725 if (atomic_read(&mem->under_oom)) 4717 if (atomic_read(&mem->under_oom))
4726 cb->fill(cb, "under_oom", 1); 4718 cb->fill(cb, "under_oom", 1);
4727 else 4719 else
4728 cb->fill(cb, "under_oom", 0); 4720 cb->fill(cb, "under_oom", 0);
4729 return 0; 4721 return 0;
4730 } 4722 }
4731 4723
4732 static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 4724 static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4733 struct cftype *cft, u64 val) 4725 struct cftype *cft, u64 val)
4734 { 4726 {
4735 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4727 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4736 struct mem_cgroup *parent; 4728 struct mem_cgroup *parent;
4737 4729
4738 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4730 /* cannot set to root cgroup and only 0 and 1 are allowed */
4739 if (!cgrp->parent || !((val == 0) || (val == 1))) 4731 if (!cgrp->parent || !((val == 0) || (val == 1)))
4740 return -EINVAL; 4732 return -EINVAL;
4741 4733
4742 parent = mem_cgroup_from_cont(cgrp->parent); 4734 parent = mem_cgroup_from_cont(cgrp->parent);
4743 4735
4744 cgroup_lock(); 4736 cgroup_lock();
4745 /* oom-kill-disable is a flag for subhierarchy. */ 4737 /* oom-kill-disable is a flag for subhierarchy. */
4746 if ((parent->use_hierarchy) || 4738 if ((parent->use_hierarchy) ||
4747 (mem->use_hierarchy && !list_empty(&cgrp->children))) { 4739 (mem->use_hierarchy && !list_empty(&cgrp->children))) {
4748 cgroup_unlock(); 4740 cgroup_unlock();
4749 return -EINVAL; 4741 return -EINVAL;
4750 } 4742 }
4751 mem->oom_kill_disable = val; 4743 mem->oom_kill_disable = val;
4752 if (!val) 4744 if (!val)
4753 memcg_oom_recover(mem); 4745 memcg_oom_recover(mem);
4754 cgroup_unlock(); 4746 cgroup_unlock();
4755 return 0; 4747 return 0;
4756 } 4748 }
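
memory.oom_control is also writable: storing 1 sets oom_kill_disable so tasks in the group wait instead of being OOM-killed, and storing 0 re-enables the killer and wakes any waiters via memcg_oom_recover(). A short sketch of toggling the knob from user space; the cgroup path is again an assumption.

/* Toggle oom_kill_disable for an assumed cgroup v1 group "foo". */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

static int set_oom_kill_disable(const char *grp, int disable)
{
	char path[256], val[2] = { disable ? '1' : '0', '\n' };
	snprintf(path, sizeof(path), "%s/memory.oom_control", grp);
	int fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	/* The write handler above rejects anything but 0/1, writes on the
	 * root group, and changes inside an active use_hierarchy subtree. */
	ssize_t n = write(fd, val, sizeof(val));
	close(fd);
	return n == sizeof(val) ? 0 : -1;
}

int main(void)
{
	return set_oom_kill_disable("/sys/fs/cgroup/memory/foo", 1) ? 1 : 0;
}
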
4757 4749
4758 #ifdef CONFIG_NUMA 4750 #ifdef CONFIG_NUMA
4759 static const struct file_operations mem_control_numa_stat_file_operations = { 4751 static const struct file_operations mem_control_numa_stat_file_operations = {
4760 .read = seq_read, 4752 .read = seq_read,
4761 .llseek = seq_lseek, 4753 .llseek = seq_lseek,
4762 .release = single_release, 4754 .release = single_release,
4763 }; 4755 };
4764 4756
4765 static int mem_control_numa_stat_open(struct inode *unused, struct file *file) 4757 static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
4766 { 4758 {
4767 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; 4759 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
4768 4760
4769 file->f_op = &mem_control_numa_stat_file_operations; 4761 file->f_op = &mem_control_numa_stat_file_operations;
4770 return single_open(file, mem_control_numa_stat_show, cont); 4762 return single_open(file, mem_control_numa_stat_show, cont);
4771 } 4763 }
4772 #endif /* CONFIG_NUMA */ 4764 #endif /* CONFIG_NUMA */
4773 4765
4774 static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp, 4766 static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp,
4775 struct cftype *cft, 4767 struct cftype *cft,
4776 struct cgroup_map_cb *cb) 4768 struct cgroup_map_cb *cb)
4777 { 4769 {
4778 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4770 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4779 char string[64]; 4771 char string[64];
4780 int i; 4772 int i;
4781 4773
4782 for (i = 0; i < NR_SCANSTATS; i++) { 4774 for (i = 0; i < NR_SCANSTATS; i++) {
4783 strcpy(string, scanstat_string[i]); 4775 strcpy(string, scanstat_string[i]);
4784 strcat(string, SCANSTAT_WORD_LIMIT); 4776 strcat(string, SCANSTAT_WORD_LIMIT);
4785 cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]); 4777 cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]);
4786 } 4778 }
4787 4779
4788 for (i = 0; i < NR_SCANSTATS; i++) { 4780 for (i = 0; i < NR_SCANSTATS; i++) {
4789 strcpy(string, scanstat_string[i]); 4781 strcpy(string, scanstat_string[i]);
4790 strcat(string, SCANSTAT_WORD_SYSTEM); 4782 strcat(string, SCANSTAT_WORD_SYSTEM);
4791 cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]); 4783 cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]);
4792 } 4784 }
4793 4785
4794 for (i = 0; i < NR_SCANSTATS; i++) { 4786 for (i = 0; i < NR_SCANSTATS; i++) {
4795 strcpy(string, scanstat_string[i]); 4787 strcpy(string, scanstat_string[i]);
4796 strcat(string, SCANSTAT_WORD_LIMIT); 4788 strcat(string, SCANSTAT_WORD_LIMIT);
4797 strcat(string, SCANSTAT_WORD_HIERARCHY); 4789 strcat(string, SCANSTAT_WORD_HIERARCHY);
4798 cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]); 4790 cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]);
4799 } 4791 }
4800 for (i = 0; i < NR_SCANSTATS; i++) { 4792 for (i = 0; i < NR_SCANSTATS; i++) {
4801 strcpy(string, scanstat_string[i]); 4793 strcpy(string, scanstat_string[i]);
4802 strcat(string, SCANSTAT_WORD_SYSTEM); 4794 strcat(string, SCANSTAT_WORD_SYSTEM);
4803 strcat(string, SCANSTAT_WORD_HIERARCHY); 4795 strcat(string, SCANSTAT_WORD_HIERARCHY);
4804 cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]); 4796 cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]);
4805 } 4797 }
4806 return 0; 4798 return 0;
4807 } 4799 }
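
The four loops above only differ in which suffixes they glue onto the base counter names before handing the values to cb->fill(): per-limit vs. system-wide reclaim, each with and without the hierarchy suffix. A standalone sketch of that key construction; the string tables here are illustrative stand-ins, not copied from this file.

/* Sketch of the key naming used by the vmscan_stat read handler.
 * scanstat_string[] and the suffix words are stand-ins; the real
 * tables live earlier in memcontrol.c. */
#include <stdio.h>
#include <string.h>

#define NR_SCANSTATS 3
static const char *scanstat_string[NR_SCANSTATS] = {
	"scanned_pages", "rotated_pages", "freed_pages",   /* illustrative */
};
#define SCANSTAT_WORD_LIMIT     "_by_limit"             /* illustrative */
#define SCANSTAT_WORD_SYSTEM    "_by_system"            /* illustrative */
#define SCANSTAT_WORD_HIERARCHY "_under_hierarchy"      /* illustrative */

int main(void)
{
	char key[64];
	for (int i = 0; i < NR_SCANSTATS; i++) {
		/* local counter, reclaim triggered by the group's own limit */
		strcpy(key, scanstat_string[i]);
		strcat(key, SCANSTAT_WORD_LIMIT);
		puts(key);
		/* hierarchical counter carries the extra suffix */
		strcpy(key, scanstat_string[i]);
		strcat(key, SCANSTAT_WORD_SYSTEM);
		strcat(key, SCANSTAT_WORD_HIERARCHY);
		puts(key);
	}
	return 0;
}
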
4808 4800
4809 static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp, 4801 static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp,
4810 unsigned int event) 4802 unsigned int event)
4811 { 4803 {
4812 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4804 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4813 4805
4814 spin_lock(&mem->scanstat.lock); 4806 spin_lock(&mem->scanstat.lock);
4815 memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats)); 4807 memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats));
4816 memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats)); 4808 memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats));
4817 spin_unlock(&mem->scanstat.lock); 4809 spin_unlock(&mem->scanstat.lock);
4818 return 0; 4810 return 0;
4819 } 4811 }
4820 4812
4821 4813
4822 static struct cftype mem_cgroup_files[] = { 4814 static struct cftype mem_cgroup_files[] = {
4823 { 4815 {
4824 .name = "usage_in_bytes", 4816 .name = "usage_in_bytes",
4825 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4817 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4826 .read_u64 = mem_cgroup_read, 4818 .read_u64 = mem_cgroup_read,
4827 .register_event = mem_cgroup_usage_register_event, 4819 .register_event = mem_cgroup_usage_register_event,
4828 .unregister_event = mem_cgroup_usage_unregister_event, 4820 .unregister_event = mem_cgroup_usage_unregister_event,
4829 }, 4821 },
4830 { 4822 {
4831 .name = "max_usage_in_bytes", 4823 .name = "max_usage_in_bytes",
4832 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4824 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4833 .trigger = mem_cgroup_reset, 4825 .trigger = mem_cgroup_reset,
4834 .read_u64 = mem_cgroup_read, 4826 .read_u64 = mem_cgroup_read,
4835 }, 4827 },
4836 { 4828 {
4837 .name = "limit_in_bytes", 4829 .name = "limit_in_bytes",
4838 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4830 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4839 .write_string = mem_cgroup_write, 4831 .write_string = mem_cgroup_write,
4840 .read_u64 = mem_cgroup_read, 4832 .read_u64 = mem_cgroup_read,
4841 }, 4833 },
4842 { 4834 {
4843 .name = "soft_limit_in_bytes", 4835 .name = "soft_limit_in_bytes",
4844 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4836 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4845 .write_string = mem_cgroup_write, 4837 .write_string = mem_cgroup_write,
4846 .read_u64 = mem_cgroup_read, 4838 .read_u64 = mem_cgroup_read,
4847 }, 4839 },
4848 { 4840 {
4849 .name = "failcnt", 4841 .name = "failcnt",
4850 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4842 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4851 .trigger = mem_cgroup_reset, 4843 .trigger = mem_cgroup_reset,
4852 .read_u64 = mem_cgroup_read, 4844 .read_u64 = mem_cgroup_read,
4853 }, 4845 },
4854 { 4846 {
4855 .name = "stat", 4847 .name = "stat",
4856 .read_map = mem_control_stat_show, 4848 .read_map = mem_control_stat_show,
4857 }, 4849 },
4858 { 4850 {
4859 .name = "force_empty", 4851 .name = "force_empty",
4860 .trigger = mem_cgroup_force_empty_write, 4852 .trigger = mem_cgroup_force_empty_write,
4861 }, 4853 },
4862 { 4854 {
4863 .name = "use_hierarchy", 4855 .name = "use_hierarchy",
4864 .write_u64 = mem_cgroup_hierarchy_write, 4856 .write_u64 = mem_cgroup_hierarchy_write,
4865 .read_u64 = mem_cgroup_hierarchy_read, 4857 .read_u64 = mem_cgroup_hierarchy_read,
4866 }, 4858 },
4867 { 4859 {
4868 .name = "swappiness", 4860 .name = "swappiness",
4869 .read_u64 = mem_cgroup_swappiness_read, 4861 .read_u64 = mem_cgroup_swappiness_read,
4870 .write_u64 = mem_cgroup_swappiness_write, 4862 .write_u64 = mem_cgroup_swappiness_write,
4871 }, 4863 },
4872 { 4864 {
4873 .name = "move_charge_at_immigrate", 4865 .name = "move_charge_at_immigrate",
4874 .read_u64 = mem_cgroup_move_charge_read, 4866 .read_u64 = mem_cgroup_move_charge_read,
4875 .write_u64 = mem_cgroup_move_charge_write, 4867 .write_u64 = mem_cgroup_move_charge_write,
4876 }, 4868 },
4877 { 4869 {
4878 .name = "oom_control", 4870 .name = "oom_control",
4879 .read_map = mem_cgroup_oom_control_read, 4871 .read_map = mem_cgroup_oom_control_read,
4880 .write_u64 = mem_cgroup_oom_control_write, 4872 .write_u64 = mem_cgroup_oom_control_write,
4881 .register_event = mem_cgroup_oom_register_event, 4873 .register_event = mem_cgroup_oom_register_event,
4882 .unregister_event = mem_cgroup_oom_unregister_event, 4874 .unregister_event = mem_cgroup_oom_unregister_event,
4883 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4875 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4884 }, 4876 },
4885 #ifdef CONFIG_NUMA 4877 #ifdef CONFIG_NUMA
4886 { 4878 {
4887 .name = "numa_stat", 4879 .name = "numa_stat",
4888 .open = mem_control_numa_stat_open, 4880 .open = mem_control_numa_stat_open,
4889 .mode = S_IRUGO, 4881 .mode = S_IRUGO,
4890 }, 4882 },
4891 #endif 4883 #endif
4892 { 4884 {
4893 .name = "vmscan_stat", 4885 .name = "vmscan_stat",
4894 .read_map = mem_cgroup_vmscan_stat_read, 4886 .read_map = mem_cgroup_vmscan_stat_read,
4895 .trigger = mem_cgroup_reset_vmscan_stat, 4887 .trigger = mem_cgroup_reset_vmscan_stat,
4896 }, 4888 },
4897 }; 4889 };
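
Each entry's .private field packs two small integers, the counter family (_MEM, _MEMSWAP, _OOM_TYPE) and the attribute (RES_USAGE, RES_LIMIT, ...), so a single read/write/trigger handler can serve many control files. A minimal sketch of that shift-and-mask packing, written in the spirit of MEMFILE_PRIVATE()/MEMFILE_TYPE() with made-up constants rather than the macros defined earlier in this file.

/* Sketch of packing a (type, attribute) pair into one int.
 * All constants here are illustrative. */
#include <assert.h>

#define FILE_PRIVATE(type, attr)  (((type) << 16) | (attr))
#define FILE_TYPE(priv)           (((priv) >> 16) & 0xffff)
#define FILE_ATTR(priv)           ((priv) & 0xffff)

enum { T_MEM = 0, T_MEMSWAP = 1, T_OOM = 2 };           /* illustrative */
enum { A_USAGE = 0, A_LIMIT = 1, A_MAX_USAGE = 2 };     /* illustrative */

int main(void)
{
	int priv = FILE_PRIVATE(T_MEMSWAP, A_LIMIT);
	assert(FILE_TYPE(priv) == T_MEMSWAP);
	assert(FILE_ATTR(priv) == A_LIMIT);
	return 0;
}
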
4898 4890
4899 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4891 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4900 static struct cftype memsw_cgroup_files[] = { 4892 static struct cftype memsw_cgroup_files[] = {
4901 { 4893 {
4902 .name = "memsw.usage_in_bytes", 4894 .name = "memsw.usage_in_bytes",
4903 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4895 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4904 .read_u64 = mem_cgroup_read, 4896 .read_u64 = mem_cgroup_read,
4905 .register_event = mem_cgroup_usage_register_event, 4897 .register_event = mem_cgroup_usage_register_event,
4906 .unregister_event = mem_cgroup_usage_unregister_event, 4898 .unregister_event = mem_cgroup_usage_unregister_event,
4907 }, 4899 },
4908 { 4900 {
4909 .name = "memsw.max_usage_in_bytes", 4901 .name = "memsw.max_usage_in_bytes",
4910 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 4902 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4911 .trigger = mem_cgroup_reset, 4903 .trigger = mem_cgroup_reset,
4912 .read_u64 = mem_cgroup_read, 4904 .read_u64 = mem_cgroup_read,
4913 }, 4905 },
4914 { 4906 {
4915 .name = "memsw.limit_in_bytes", 4907 .name = "memsw.limit_in_bytes",
4916 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 4908 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4917 .write_string = mem_cgroup_write, 4909 .write_string = mem_cgroup_write,
4918 .read_u64 = mem_cgroup_read, 4910 .read_u64 = mem_cgroup_read,
4919 }, 4911 },
4920 { 4912 {
4921 .name = "memsw.failcnt", 4913 .name = "memsw.failcnt",
4922 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 4914 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4923 .trigger = mem_cgroup_reset, 4915 .trigger = mem_cgroup_reset,
4924 .read_u64 = mem_cgroup_read, 4916 .read_u64 = mem_cgroup_read,
4925 }, 4917 },
4926 }; 4918 };
4927 4919
4928 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4920 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4929 { 4921 {
4930 if (!do_swap_account) 4922 if (!do_swap_account)
4931 return 0; 4923 return 0;
4932 return cgroup_add_files(cont, ss, memsw_cgroup_files, 4924 return cgroup_add_files(cont, ss, memsw_cgroup_files,
4933 ARRAY_SIZE(memsw_cgroup_files)); 4925 ARRAY_SIZE(memsw_cgroup_files));
4934 }; 4926 };
4935 #else 4927 #else
4936 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4928 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4937 { 4929 {
4938 return 0; 4930 return 0;
4939 } 4931 }
4940 #endif 4932 #endif
4941 4933
4942 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4934 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4943 { 4935 {
4944 struct mem_cgroup_per_node *pn; 4936 struct mem_cgroup_per_node *pn;
4945 struct mem_cgroup_per_zone *mz; 4937 struct mem_cgroup_per_zone *mz;
4946 enum lru_list l; 4938 enum lru_list l;
4947 int zone, tmp = node; 4939 int zone, tmp = node;
4948 /* 4940 /*
4949 * This routine is called against possible nodes. 4941 * This routine is called against possible nodes.
4950 * But it's BUG to call kmalloc() against offline node. 4942 * But it's BUG to call kmalloc() against offline node.
4951 * 4943 *
4952 * TODO: this routine can waste much memory for nodes which will 4944 * TODO: this routine can waste much memory for nodes which will
4953 * never be onlined. It's better to use memory hotplug callback 4945 * never be onlined. It's better to use memory hotplug callback
4954 * function. 4946 * function.
4955 */ 4947 */
4956 if (!node_state(node, N_NORMAL_MEMORY)) 4948 if (!node_state(node, N_NORMAL_MEMORY))
4957 tmp = -1; 4949 tmp = -1;
4958 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4950 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4959 if (!pn) 4951 if (!pn)
4960 return 1; 4952 return 1;
4961 4953
4962 mem->info.nodeinfo[node] = pn; 4954 mem->info.nodeinfo[node] = pn;
4963 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4955 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4964 mz = &pn->zoneinfo[zone]; 4956 mz = &pn->zoneinfo[zone];
4965 for_each_lru(l) 4957 for_each_lru(l)
4966 INIT_LIST_HEAD(&mz->lists[l]); 4958 INIT_LIST_HEAD(&mz->lists[l]);
4967 mz->usage_in_excess = 0; 4959 mz->usage_in_excess = 0;
4968 mz->on_tree = false; 4960 mz->on_tree = false;
4969 mz->mem = mem; 4961 mz->mem = mem;
4970 } 4962 }
4971 return 0; 4963 return 0;
4972 } 4964 }
4973 4965
4974 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4966 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4975 { 4967 {
4976 kfree(mem->info.nodeinfo[node]); 4968 kfree(mem->info.nodeinfo[node]);
4977 } 4969 }
4978 4970
4979 static struct mem_cgroup *mem_cgroup_alloc(void) 4971 static struct mem_cgroup *mem_cgroup_alloc(void)
4980 { 4972 {
4981 struct mem_cgroup *mem; 4973 struct mem_cgroup *mem;
4982 int size = sizeof(struct mem_cgroup); 4974 int size = sizeof(struct mem_cgroup);
4983 4975
4984 /* Can be very big if MAX_NUMNODES is very big */ 4976 /* Can be very big if MAX_NUMNODES is very big */
4985 if (size < PAGE_SIZE) 4977 if (size < PAGE_SIZE)
4986 mem = kzalloc(size, GFP_KERNEL); 4978 mem = kzalloc(size, GFP_KERNEL);
4987 else 4979 else
4988 mem = vzalloc(size); 4980 mem = vzalloc(size);
4989 4981
4990 if (!mem) 4982 if (!mem)
4991 return NULL; 4983 return NULL;
4992 4984
4993 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4985 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4994 if (!mem->stat) 4986 if (!mem->stat)
4995 goto out_free; 4987 goto out_free;
4996 spin_lock_init(&mem->pcp_counter_lock); 4988 spin_lock_init(&mem->pcp_counter_lock);
4997 return mem; 4989 return mem;
4998 4990
4999 out_free: 4991 out_free:
5000 if (size < PAGE_SIZE) 4992 if (size < PAGE_SIZE)
5001 kfree(mem); 4993 kfree(mem);
5002 else 4994 else
5003 vfree(mem); 4995 vfree(mem);
5004 return NULL; 4996 return NULL;
5005 } 4997 }
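
mem_cgroup_alloc() picks kzalloc() for objects smaller than a page and vzalloc() otherwise, and every free path repeats the same size test so the matching deallocator is used. A user-space analogue of that idiom, with calloc and mmap standing in for the two kernel allocators; it is a sketch of the pattern, not of the kernel API.

/* "small -> heap, big -> page mappings", with the free side repeating
 * the size test exactly as __mem_cgroup_free() does below. */
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>

static void *alloc_obj(size_t size)
{
	if (size < (size_t)sysconf(_SC_PAGESIZE))
		return calloc(1, size);                     /* small: heap */
	void *p = mmap(NULL, size, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); /* big: mappings */
	return p == MAP_FAILED ? NULL : p;
}

static void free_obj(void *p, size_t size)
{
	/* must mirror the allocation-side decision */
	if (size < (size_t)sysconf(_SC_PAGESIZE))
		free(p);
	else
		munmap(p, size);
}

int main(void)
{
	void *small = alloc_obj(128), *big = alloc_obj(1 << 20);
	free_obj(small, 128);
	free_obj(big, 1 << 20);
	return 0;
}
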
5006 4998
5007 /* 4999 /*
5008 * At destroying mem_cgroup, references from swap_cgroup can remain. 5000 * At destroying mem_cgroup, references from swap_cgroup can remain.
5009 * (scanning all at force_empty is too costly...) 5001 * (scanning all at force_empty is too costly...)
5010 * 5002 *
5011 * Instead of clearing all references at force_empty, we remember 5003 * Instead of clearing all references at force_empty, we remember
5012 * the number of reference from swap_cgroup and free mem_cgroup when 5004 * the number of reference from swap_cgroup and free mem_cgroup when
5013 * it goes down to 0. 5005 * it goes down to 0.
5014 * 5006 *
5015 * Removal of cgroup itself succeeds regardless of refs from swap. 5007 * Removal of cgroup itself succeeds regardless of refs from swap.
5016 */ 5008 */
5017 5009
5018 static void __mem_cgroup_free(struct mem_cgroup *mem) 5010 static void __mem_cgroup_free(struct mem_cgroup *mem)
5019 { 5011 {
5020 int node; 5012 int node;
5021 5013
5022 mem_cgroup_remove_from_trees(mem); 5014 mem_cgroup_remove_from_trees(mem);
5023 free_css_id(&mem_cgroup_subsys, &mem->css); 5015 free_css_id(&mem_cgroup_subsys, &mem->css);
5024 5016
5025 for_each_node_state(node, N_POSSIBLE) 5017 for_each_node_state(node, N_POSSIBLE)
5026 free_mem_cgroup_per_zone_info(mem, node); 5018 free_mem_cgroup_per_zone_info(mem, node);
5027 5019
5028 free_percpu(mem->stat); 5020 free_percpu(mem->stat);
5029 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 5021 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
5030 kfree(mem); 5022 kfree(mem);
5031 else 5023 else
5032 vfree(mem); 5024 vfree(mem);
5033 } 5025 }
5034 5026
5035 static void mem_cgroup_get(struct mem_cgroup *mem) 5027 static void mem_cgroup_get(struct mem_cgroup *mem)
5036 { 5028 {
5037 atomic_inc(&mem->refcnt); 5029 atomic_inc(&mem->refcnt);
5038 } 5030 }
5039 5031
5040 static void __mem_cgroup_put(struct mem_cgroup *mem, int count) 5032 static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
5041 { 5033 {
5042 if (atomic_sub_and_test(count, &mem->refcnt)) { 5034 if (atomic_sub_and_test(count, &mem->refcnt)) {
5043 struct mem_cgroup *parent = parent_mem_cgroup(mem); 5035 struct mem_cgroup *parent = parent_mem_cgroup(mem);
5044 __mem_cgroup_free(mem); 5036 __mem_cgroup_free(mem);
5045 if (parent) 5037 if (parent)
5046 mem_cgroup_put(parent); 5038 mem_cgroup_put(parent);
5047 } 5039 }
5048 } 5040 }
5049 5041
5050 static void mem_cgroup_put(struct mem_cgroup *mem) 5042 static void mem_cgroup_put(struct mem_cgroup *mem)
5051 { 5043 {
5052 __mem_cgroup_put(mem, 1); 5044 __mem_cgroup_put(mem, 1);
5053 } 5045 }
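
The get/put pair is a plain reference count with one twist: a child holds a reference on its parent (taken in mem_cgroup_create() below), so dropping the last reference on a child may cascade up the hierarchy. A small user-space sketch of that pattern with C11 atomics; the structure and function names are invented for the example.

/* Refcount whose final put also releases the parent, mirroring
 * __mem_cgroup_put()/mem_cgroup_put().  Types are invented. */
#include <stdatomic.h>
#include <stdlib.h>

struct node {
	struct node *parent;
	atomic_int refcnt;
};

static void node_get(struct node *n)
{
	atomic_fetch_add(&n->refcnt, 1);
}

static void node_put(struct node *n)
{
	while (n) {
		if (atomic_fetch_sub(&n->refcnt, 1) != 1)
			return;                 /* others still hold references */
		struct node *parent = n->parent;
		free(n);                        /* last reference: free ...      */
		n = parent;                     /* ... then drop the parent ref  */
	}
}

int main(void)
{
	struct node *root = calloc(1, sizeof(*root));
	struct node *child = calloc(1, sizeof(*child));
	atomic_init(&root->refcnt, 1);
	atomic_init(&child->refcnt, 1);
	child->parent = root;
	node_get(root);                         /* child's reference on its parent */
	node_put(child);                        /* frees child, drops its ref on root */
	node_put(root);                         /* owner's ref gone: frees root */
	return 0;
}
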
5054 5046
5055 /* 5047 /*
5056 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 5048 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
5057 */ 5049 */
5058 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) 5050 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
5059 { 5051 {
5060 if (!mem->res.parent) 5052 if (!mem->res.parent)
5061 return NULL; 5053 return NULL;
5062 return mem_cgroup_from_res_counter(mem->res.parent, res); 5054 return mem_cgroup_from_res_counter(mem->res.parent, res);
5063 } 5055 }
5064 5056
5065 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5057 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
5066 static void __init enable_swap_cgroup(void) 5058 static void __init enable_swap_cgroup(void)
5067 { 5059 {
5068 if (!mem_cgroup_disabled() && really_do_swap_account) 5060 if (!mem_cgroup_disabled() && really_do_swap_account)
5069 do_swap_account = 1; 5061 do_swap_account = 1;
5070 } 5062 }
5071 #else 5063 #else
5072 static void __init enable_swap_cgroup(void) 5064 static void __init enable_swap_cgroup(void)
5073 { 5065 {
5074 } 5066 }
5075 #endif 5067 #endif
5076 5068
5077 static int mem_cgroup_soft_limit_tree_init(void) 5069 static int mem_cgroup_soft_limit_tree_init(void)
5078 { 5070 {
5079 struct mem_cgroup_tree_per_node *rtpn; 5071 struct mem_cgroup_tree_per_node *rtpn;
5080 struct mem_cgroup_tree_per_zone *rtpz; 5072 struct mem_cgroup_tree_per_zone *rtpz;
5081 int tmp, node, zone; 5073 int tmp, node, zone;
5082 5074
5083 for_each_node_state(node, N_POSSIBLE) { 5075 for_each_node_state(node, N_POSSIBLE) {
5084 tmp = node; 5076 tmp = node;
5085 if (!node_state(node, N_NORMAL_MEMORY)) 5077 if (!node_state(node, N_NORMAL_MEMORY))
5086 tmp = -1; 5078 tmp = -1;
5087 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 5079 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
5088 if (!rtpn) 5080 if (!rtpn)
5089 return 1; 5081 return 1;
5090 5082
5091 soft_limit_tree.rb_tree_per_node[node] = rtpn; 5083 soft_limit_tree.rb_tree_per_node[node] = rtpn;
5092 5084
5093 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 5085 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5094 rtpz = &rtpn->rb_tree_per_zone[zone]; 5086 rtpz = &rtpn->rb_tree_per_zone[zone];
5095 rtpz->rb_root = RB_ROOT; 5087 rtpz->rb_root = RB_ROOT;
5096 spin_lock_init(&rtpz->lock); 5088 spin_lock_init(&rtpz->lock);
5097 } 5089 }
5098 } 5090 }
5099 return 0; 5091 return 0;
5100 } 5092 }
5101 5093
5102 static struct cgroup_subsys_state * __ref 5094 static struct cgroup_subsys_state * __ref
5103 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 5095 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
5104 { 5096 {
5105 struct mem_cgroup *mem, *parent; 5097 struct mem_cgroup *mem, *parent;
5106 long error = -ENOMEM; 5098 long error = -ENOMEM;
5107 int node; 5099 int node;
5108 5100
5109 mem = mem_cgroup_alloc(); 5101 mem = mem_cgroup_alloc();
5110 if (!mem) 5102 if (!mem)
5111 return ERR_PTR(error); 5103 return ERR_PTR(error);
5112 5104
5113 for_each_node_state(node, N_POSSIBLE) 5105 for_each_node_state(node, N_POSSIBLE)
5114 if (alloc_mem_cgroup_per_zone_info(mem, node)) 5106 if (alloc_mem_cgroup_per_zone_info(mem, node))
5115 goto free_out; 5107 goto free_out;
5116 5108
5117 /* root ? */ 5109 /* root ? */
5118 if (cont->parent == NULL) { 5110 if (cont->parent == NULL) {
5119 int cpu; 5111 int cpu;
5120 enable_swap_cgroup(); 5112 enable_swap_cgroup();
5121 parent = NULL; 5113 parent = NULL;
5122 root_mem_cgroup = mem; 5114 root_mem_cgroup = mem;
5123 if (mem_cgroup_soft_limit_tree_init()) 5115 if (mem_cgroup_soft_limit_tree_init())
5124 goto free_out; 5116 goto free_out;
5125 for_each_possible_cpu(cpu) { 5117 for_each_possible_cpu(cpu) {
5126 struct memcg_stock_pcp *stock = 5118 struct memcg_stock_pcp *stock =
5127 &per_cpu(memcg_stock, cpu); 5119 &per_cpu(memcg_stock, cpu);
5128 INIT_WORK(&stock->work, drain_local_stock); 5120 INIT_WORK(&stock->work, drain_local_stock);
5129 } 5121 }
5130 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 5122 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
5131 } else { 5123 } else {
5132 parent = mem_cgroup_from_cont(cont->parent); 5124 parent = mem_cgroup_from_cont(cont->parent);
5133 mem->use_hierarchy = parent->use_hierarchy; 5125 mem->use_hierarchy = parent->use_hierarchy;
5134 mem->oom_kill_disable = parent->oom_kill_disable; 5126 mem->oom_kill_disable = parent->oom_kill_disable;
5135 } 5127 }
5136 5128
5137 if (parent && parent->use_hierarchy) { 5129 if (parent && parent->use_hierarchy) {
5138 res_counter_init(&mem->res, &parent->res); 5130 res_counter_init(&mem->res, &parent->res);
5139 res_counter_init(&mem->memsw, &parent->memsw); 5131 res_counter_init(&mem->memsw, &parent->memsw);
5140 /* 5132 /*
5141 * We increment refcnt of the parent to ensure that we can 5133 * We increment refcnt of the parent to ensure that we can
5142 * safely access it on res_counter_charge/uncharge. 5134 * safely access it on res_counter_charge/uncharge.
5143 * This refcnt will be decremented when freeing this 5135 * This refcnt will be decremented when freeing this
5144 * mem_cgroup(see mem_cgroup_put). 5136 * mem_cgroup(see mem_cgroup_put).
5145 */ 5137 */
5146 mem_cgroup_get(parent); 5138 mem_cgroup_get(parent);
5147 } else { 5139 } else {
5148 res_counter_init(&mem->res, NULL); 5140 res_counter_init(&mem->res, NULL);
5149 res_counter_init(&mem->memsw, NULL); 5141 res_counter_init(&mem->memsw, NULL);
5150 } 5142 }
5151 mem->last_scanned_child = 0; 5143 mem->last_scanned_child = 0;
5152 mem->last_scanned_node = MAX_NUMNODES; 5144 mem->last_scanned_node = MAX_NUMNODES;
5153 INIT_LIST_HEAD(&mem->oom_notify); 5145 INIT_LIST_HEAD(&mem->oom_notify);
5154 5146
5155 if (parent) 5147 if (parent)
5156 mem->swappiness = mem_cgroup_swappiness(parent); 5148 mem->swappiness = mem_cgroup_swappiness(parent);
5157 atomic_set(&mem->refcnt, 1); 5149 atomic_set(&mem->refcnt, 1);
5158 mem->move_charge_at_immigrate = 0; 5150 mem->move_charge_at_immigrate = 0;
5159 mutex_init(&mem->thresholds_lock); 5151 mutex_init(&mem->thresholds_lock);
5160 spin_lock_init(&mem->scanstat.lock); 5152 spin_lock_init(&mem->scanstat.lock);
5161 return &mem->css; 5153 return &mem->css;
5162 free_out: 5154 free_out:
5163 __mem_cgroup_free(mem); 5155 __mem_cgroup_free(mem);
5164 root_mem_cgroup = NULL; 5156 root_mem_cgroup = NULL;
5165 return ERR_PTR(error); 5157 return ERR_PTR(error);
5166 } 5158 }
5167 5159
5168 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 5160 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
5169 struct cgroup *cont) 5161 struct cgroup *cont)
5170 { 5162 {
5171 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 5163 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
5172 5164
5173 return mem_cgroup_force_empty(mem, false); 5165 return mem_cgroup_force_empty(mem, false);
5174 } 5166 }
5175 5167
5176 static void mem_cgroup_destroy(struct cgroup_subsys *ss, 5168 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
5177 struct cgroup *cont) 5169 struct cgroup *cont)
5178 { 5170 {
5179 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 5171 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
5180 5172
5181 mem_cgroup_put(mem); 5173 mem_cgroup_put(mem);
5182 } 5174 }
5183 5175
5184 static int mem_cgroup_populate(struct cgroup_subsys *ss, 5176 static int mem_cgroup_populate(struct cgroup_subsys *ss,
5185 struct cgroup *cont) 5177 struct cgroup *cont)
5186 { 5178 {
5187 int ret; 5179 int ret;
5188 5180
5189 ret = cgroup_add_files(cont, ss, mem_cgroup_files, 5181 ret = cgroup_add_files(cont, ss, mem_cgroup_files,
5190 ARRAY_SIZE(mem_cgroup_files)); 5182 ARRAY_SIZE(mem_cgroup_files));
5191 5183
5192 if (!ret) 5184 if (!ret)
5193 ret = register_memsw_files(cont, ss); 5185 ret = register_memsw_files(cont, ss);
5194 return ret; 5186 return ret;
5195 } 5187 }
5196 5188
5197 #ifdef CONFIG_MMU 5189 #ifdef CONFIG_MMU
5198 /* Handlers for move charge at task migration. */ 5190 /* Handlers for move charge at task migration. */
5199 #define PRECHARGE_COUNT_AT_ONCE 256 5191 #define PRECHARGE_COUNT_AT_ONCE 256
5200 static int mem_cgroup_do_precharge(unsigned long count) 5192 static int mem_cgroup_do_precharge(unsigned long count)
5201 { 5193 {
5202 int ret = 0; 5194 int ret = 0;
5203 int batch_count = PRECHARGE_COUNT_AT_ONCE; 5195 int batch_count = PRECHARGE_COUNT_AT_ONCE;
5204 struct mem_cgroup *mem = mc.to; 5196 struct mem_cgroup *mem = mc.to;
5205 5197
5206 if (mem_cgroup_is_root(mem)) { 5198 if (mem_cgroup_is_root(mem)) {
5207 mc.precharge += count; 5199 mc.precharge += count;
5208 /* we don't need css_get for root */ 5200 /* we don't need css_get for root */
5209 return ret; 5201 return ret;
5210 } 5202 }
5211 /* try to charge at once */ 5203 /* try to charge at once */
5212 if (count > 1) { 5204 if (count > 1) {
5213 struct res_counter *dummy; 5205 struct res_counter *dummy;
5214 /* 5206 /*
5215 * "mem" cannot be under rmdir() because we've already checked 5207 * "mem" cannot be under rmdir() because we've already checked
5216 * by cgroup_lock_live_cgroup() that it is not removed and we 5208 * by cgroup_lock_live_cgroup() that it is not removed and we
5217 * are still under the same cgroup_mutex. So we can postpone 5209 * are still under the same cgroup_mutex. So we can postpone
5218 * css_get(). 5210 * css_get().
5219 */ 5211 */
5220 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) 5212 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
5221 goto one_by_one; 5213 goto one_by_one;
5222 if (do_swap_account && res_counter_charge(&mem->memsw, 5214 if (do_swap_account && res_counter_charge(&mem->memsw,
5223 PAGE_SIZE * count, &dummy)) { 5215 PAGE_SIZE * count, &dummy)) {
5224 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 5216 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
5225 goto one_by_one; 5217 goto one_by_one;
5226 } 5218 }
5227 mc.precharge += count; 5219 mc.precharge += count;
5228 return ret; 5220 return ret;
5229 } 5221 }
5230 one_by_one: 5222 one_by_one:
5231 /* fall back to one by one charge */ 5223 /* fall back to one by one charge */
5232 while (count--) { 5224 while (count--) {
5233 if (signal_pending(current)) { 5225 if (signal_pending(current)) {
5234 ret = -EINTR; 5226 ret = -EINTR;
5235 break; 5227 break;
5236 } 5228 }
5237 if (!batch_count--) { 5229 if (!batch_count--) {
5238 batch_count = PRECHARGE_COUNT_AT_ONCE; 5230 batch_count = PRECHARGE_COUNT_AT_ONCE;
5239 cond_resched(); 5231 cond_resched();
5240 } 5232 }
5241 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); 5233 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false);
5242 if (ret || !mem) 5234 if (ret || !mem)
5243 /* mem_cgroup_clear_mc() will do uncharge later */ 5235 /* mem_cgroup_clear_mc() will do uncharge later */
5244 return -ENOMEM; 5236 return -ENOMEM;
5245 mc.precharge++; 5237 mc.precharge++;
5246 } 5238 }
5247 return ret; 5239 return ret;
5248 } 5240 }
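
The precharge logic first tries to reserve the whole batch against the res_counter in one call and, if the group is too close to its limit for that, falls back to charging page by page through __mem_cgroup_try_charge() so reclaim can make room, yielding the CPU every PRECHARGE_COUNT_AT_ONCE iterations. A user-space sketch of that "bulk first, then one-by-one with a periodic yield" shape against an invented budget counter; it is the pattern only, not the kernel interfaces.

/* Bulk-then-one-by-one reservation, cf. mem_cgroup_do_precharge().
 * "budget" and reserve() are invented stand-ins. */
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

#define BATCH_YIELD 256                         /* cf. PRECHARGE_COUNT_AT_ONCE */

static long budget = 1000;                      /* stand-in for the res_counter */

static bool reserve(long n)
{
	if (budget < n)
		return false;
	budget -= n;
	return true;
}

static long precharge(long count)
{
	if (reserve(count))
		return count;                   /* cheap path: one bulk charge */

	long done = 0, batch = BATCH_YIELD;     /* fall back to one-by-one */
	while (count--) {
		if (!batch--) {
			batch = BATCH_YIELD;
			sched_yield();          /* cf. cond_resched() */
		}
		if (!reserve(1))
			break;                  /* the kernel returns -ENOMEM here */
		done++;
	}
	return done;
}

int main(void)
{
	printf("got %ld of 1500\n", precharge(1500));
	return 0;
}
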
5249 5241
5250 /** 5242 /**
5251 * is_target_pte_for_mc - check a pte whether it is valid for move charge 5243 * is_target_pte_for_mc - check a pte whether it is valid for move charge
5252 * @vma: the vma the pte to be checked belongs 5244 * @vma: the vma the pte to be checked belongs
5253 * @addr: the address corresponding to the pte to be checked 5245 * @addr: the address corresponding to the pte to be checked
5254 * @ptent: the pte to be checked 5246 * @ptent: the pte to be checked
5255 * @target: the pointer the target page or swap ent will be stored(can be NULL) 5247 * @target: the pointer the target page or swap ent will be stored(can be NULL)
5256 * 5248 *
5257 * Returns 5249 * Returns
5258 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 5250 * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
5259 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 5251 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
5260 * move charge. if @target is not NULL, the page is stored in target->page 5252 * move charge. if @target is not NULL, the page is stored in target->page
5261 * with extra refcnt got(Callers should handle it). 5253 * with extra refcnt got(Callers should handle it).
5262 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 5254 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
5263 * target for charge migration. if @target is not NULL, the entry is stored 5255 * target for charge migration. if @target is not NULL, the entry is stored
5264 * in target->ent. 5256 * in target->ent.
5265 * 5257 *
5266 * Called with pte lock held. 5258 * Called with pte lock held.
5267 */ 5259 */
5268 union mc_target { 5260 union mc_target {
5269 struct page *page; 5261 struct page *page;
5270 swp_entry_t ent; 5262 swp_entry_t ent;
5271 }; 5263 };
5272 5264
5273 enum mc_target_type { 5265 enum mc_target_type {
5274 MC_TARGET_NONE, /* not used */ 5266 MC_TARGET_NONE, /* not used */
5275 MC_TARGET_PAGE, 5267 MC_TARGET_PAGE,
5276 MC_TARGET_SWAP, 5268 MC_TARGET_SWAP,
5277 }; 5269 };
5278 5270
5279 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5271 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5280 unsigned long addr, pte_t ptent) 5272 unsigned long addr, pte_t ptent)
5281 { 5273 {
5282 struct page *page = vm_normal_page(vma, addr, ptent); 5274 struct page *page = vm_normal_page(vma, addr, ptent);
5283 5275
5284 if (!page || !page_mapped(page)) 5276 if (!page || !page_mapped(page))
5285 return NULL; 5277 return NULL;
5286 if (PageAnon(page)) { 5278 if (PageAnon(page)) {
5287 /* we don't move shared anon */ 5279 /* we don't move shared anon */
5288 if (!move_anon() || page_mapcount(page) > 2) 5280 if (!move_anon() || page_mapcount(page) > 2)
5289 return NULL; 5281 return NULL;
5290 } else if (!move_file()) 5282 } else if (!move_file())
5291 /* we ignore mapcount for file pages */ 5283 /* we ignore mapcount for file pages */
5292 return NULL; 5284 return NULL;
5293 if (!get_page_unless_zero(page)) 5285 if (!get_page_unless_zero(page))
5294 return NULL; 5286 return NULL;
5295 5287
5296 return page; 5288 return page;
5297 } 5289 }
5298 5290
5299 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5291 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5300 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5292 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5301 { 5293 {
5302 int usage_count; 5294 int usage_count;
5303 struct page *page = NULL; 5295 struct page *page = NULL;
5304 swp_entry_t ent = pte_to_swp_entry(ptent); 5296 swp_entry_t ent = pte_to_swp_entry(ptent);
5305 5297
5306 if (!move_anon() || non_swap_entry(ent)) 5298 if (!move_anon() || non_swap_entry(ent))
5307 return NULL; 5299 return NULL;
5308 usage_count = mem_cgroup_count_swap_user(ent, &page); 5300 usage_count = mem_cgroup_count_swap_user(ent, &page);
5309 if (usage_count > 1) { /* we don't move shared anon */ 5301 if (usage_count > 1) { /* we don't move shared anon */
5310 if (page) 5302 if (page)
5311 put_page(page); 5303 put_page(page);
5312 return NULL; 5304 return NULL;
5313 } 5305 }
5314 if (do_swap_account) 5306 if (do_swap_account)
5315 entry->val = ent.val; 5307 entry->val = ent.val;
5316 5308
5317 return page; 5309 return page;
5318 } 5310 }
5319 5311
5320 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5312 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5321 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5313 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5322 { 5314 {
5323 struct page *page = NULL; 5315 struct page *page = NULL;
5324 struct inode *inode; 5316 struct inode *inode;
5325 struct address_space *mapping; 5317 struct address_space *mapping;
5326 pgoff_t pgoff; 5318 pgoff_t pgoff;
5327 5319
5328 if (!vma->vm_file) /* anonymous vma */ 5320 if (!vma->vm_file) /* anonymous vma */
5329 return NULL; 5321 return NULL;
5330 if (!move_file()) 5322 if (!move_file())
5331 return NULL; 5323 return NULL;
5332 5324
5333 inode = vma->vm_file->f_path.dentry->d_inode; 5325 inode = vma->vm_file->f_path.dentry->d_inode;
5334 mapping = vma->vm_file->f_mapping; 5326 mapping = vma->vm_file->f_mapping;
5335 if (pte_none(ptent)) 5327 if (pte_none(ptent))
5336 pgoff = linear_page_index(vma, addr); 5328 pgoff = linear_page_index(vma, addr);
5337 else /* pte_file(ptent) is true */ 5329 else /* pte_file(ptent) is true */
5338 pgoff = pte_to_pgoff(ptent); 5330 pgoff = pte_to_pgoff(ptent);
5339 5331
5340 /* page is moved even if it's not RSS of this task(page-faulted). */ 5332 /* page is moved even if it's not RSS of this task(page-faulted). */
5341 if (!mapping_cap_swap_backed(mapping)) { /* normal file */ 5333 if (!mapping_cap_swap_backed(mapping)) { /* normal file */
5342 page = find_get_page(mapping, pgoff); 5334 page = find_get_page(mapping, pgoff);
5343 } else { /* shmem/tmpfs file. we should take account of swap too. */ 5335 } else { /* shmem/tmpfs file. we should take account of swap too. */
5344 swp_entry_t ent; 5336 swp_entry_t ent;
5345 mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); 5337 mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
5346 if (do_swap_account) 5338 if (do_swap_account)
5347 entry->val = ent.val; 5339 entry->val = ent.val;
5348 } 5340 }
5349 5341
5350 return page; 5342 return page;
5351 } 5343 }
5352 5344
5353 static int is_target_pte_for_mc(struct vm_area_struct *vma, 5345 static int is_target_pte_for_mc(struct vm_area_struct *vma,
5354 unsigned long addr, pte_t ptent, union mc_target *target) 5346 unsigned long addr, pte_t ptent, union mc_target *target)
5355 { 5347 {
5356 struct page *page = NULL; 5348 struct page *page = NULL;
5357 struct page_cgroup *pc; 5349 struct page_cgroup *pc;
5358 int ret = 0; 5350 int ret = 0;
5359 swp_entry_t ent = { .val = 0 }; 5351 swp_entry_t ent = { .val = 0 };
5360 5352
5361 if (pte_present(ptent)) 5353 if (pte_present(ptent))
5362 page = mc_handle_present_pte(vma, addr, ptent); 5354 page = mc_handle_present_pte(vma, addr, ptent);
5363 else if (is_swap_pte(ptent)) 5355 else if (is_swap_pte(ptent))
5364 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 5356 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
5365 else if (pte_none(ptent) || pte_file(ptent)) 5357 else if (pte_none(ptent) || pte_file(ptent))
5366 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5358 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5367 5359
5368 if (!page && !ent.val) 5360 if (!page && !ent.val)
5369 return 0; 5361 return 0;
5370 if (page) { 5362 if (page) {
5371 pc = lookup_page_cgroup(page); 5363 pc = lookup_page_cgroup(page);
5372 /* 5364 /*
5373 * Do only loose check w/o page_cgroup lock. 5365 * Do only loose check w/o page_cgroup lock.
5374 * mem_cgroup_move_account() checks the pc is valid or not under 5366 * mem_cgroup_move_account() checks the pc is valid or not under
5375 * the lock. 5367 * the lock.
5376 */ 5368 */
5377 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5369 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5378 ret = MC_TARGET_PAGE; 5370 ret = MC_TARGET_PAGE;
5379 if (target) 5371 if (target)
5380 target->page = page; 5372 target->page = page;
5381 } 5373 }
5382 if (!ret || !target) 5374 if (!ret || !target)
5383 put_page(page); 5375 put_page(page);
5384 } 5376 }
5385 /* There is a swap entry and a page doesn't exist or isn't charged */ 5377 /* There is a swap entry and a page doesn't exist or isn't charged */
5386 if (ent.val && !ret && 5378 if (ent.val && !ret &&
5387 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { 5379 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
5388 ret = MC_TARGET_SWAP; 5380 ret = MC_TARGET_SWAP;
5389 if (target) 5381 if (target)
5390 target->ent = ent; 5382 target->ent = ent;
5391 } 5383 }
5392 return ret; 5384 return ret;
5393 } 5385 }
5394 5386
5395 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5387 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5396 unsigned long addr, unsigned long end, 5388 unsigned long addr, unsigned long end,
5397 struct mm_walk *walk) 5389 struct mm_walk *walk)
5398 { 5390 {
5399 struct vm_area_struct *vma = walk->private; 5391 struct vm_area_struct *vma = walk->private;
5400 pte_t *pte; 5392 pte_t *pte;
5401 spinlock_t *ptl; 5393 spinlock_t *ptl;
5402 5394
5403 split_huge_page_pmd(walk->mm, pmd); 5395 split_huge_page_pmd(walk->mm, pmd);
5404 5396
5405 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5397 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5406 for (; addr != end; pte++, addr += PAGE_SIZE) 5398 for (; addr != end; pte++, addr += PAGE_SIZE)
5407 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 5399 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
5408 mc.precharge++; /* increment precharge temporarily */ 5400 mc.precharge++; /* increment precharge temporarily */
5409 pte_unmap_unlock(pte - 1, ptl); 5401 pte_unmap_unlock(pte - 1, ptl);
5410 cond_resched(); 5402 cond_resched();
5411 5403
5412 return 0; 5404 return 0;
5413 } 5405 }
5414 5406
5415 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5407 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5416 { 5408 {
5417 unsigned long precharge; 5409 unsigned long precharge;
5418 struct vm_area_struct *vma; 5410 struct vm_area_struct *vma;
5419 5411
5420 down_read(&mm->mmap_sem); 5412 down_read(&mm->mmap_sem);
5421 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5413 for (vma = mm->mmap; vma; vma = vma->vm_next) {
5422 struct mm_walk mem_cgroup_count_precharge_walk = { 5414 struct mm_walk mem_cgroup_count_precharge_walk = {
5423 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5415 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5424 .mm = mm, 5416 .mm = mm,
5425 .private = vma, 5417 .private = vma,
5426 }; 5418 };
5427 if (is_vm_hugetlb_page(vma)) 5419 if (is_vm_hugetlb_page(vma))
5428 continue; 5420 continue;
5429 walk_page_range(vma->vm_start, vma->vm_end, 5421 walk_page_range(vma->vm_start, vma->vm_end,
5430 &mem_cgroup_count_precharge_walk); 5422 &mem_cgroup_count_precharge_walk);
5431 } 5423 }
5432 up_read(&mm->mmap_sem); 5424 up_read(&mm->mmap_sem);
5433 5425
5434 precharge = mc.precharge; 5426 precharge = mc.precharge;
5435 mc.precharge = 0; 5427 mc.precharge = 0;
5436 5428
5437 return precharge; 5429 return precharge;
5438 } 5430 }
5439 5431
5440 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5432 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5441 { 5433 {
5442 unsigned long precharge = mem_cgroup_count_precharge(mm); 5434 unsigned long precharge = mem_cgroup_count_precharge(mm);
5443 5435
5444 VM_BUG_ON(mc.moving_task); 5436 VM_BUG_ON(mc.moving_task);
5445 mc.moving_task = current; 5437 mc.moving_task = current;
5446 return mem_cgroup_do_precharge(precharge); 5438 return mem_cgroup_do_precharge(precharge);
5447 } 5439 }
5448 5440
5449 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 5441 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
5450 static void __mem_cgroup_clear_mc(void) 5442 static void __mem_cgroup_clear_mc(void)
5451 { 5443 {
5452 struct mem_cgroup *from = mc.from; 5444 struct mem_cgroup *from = mc.from;
5453 struct mem_cgroup *to = mc.to; 5445 struct mem_cgroup *to = mc.to;
5454 5446
5455 /* we must uncharge all the leftover precharges from mc.to */ 5447 /* we must uncharge all the leftover precharges from mc.to */
5456 if (mc.precharge) { 5448 if (mc.precharge) {
5457 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 5449 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
5458 mc.precharge = 0; 5450 mc.precharge = 0;
5459 } 5451 }
5460 /* 5452 /*
5461 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 5453 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
5462 * we must uncharge here. 5454 * we must uncharge here.
5463 */ 5455 */
5464 if (mc.moved_charge) { 5456 if (mc.moved_charge) {
5465 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 5457 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
5466 mc.moved_charge = 0; 5458 mc.moved_charge = 0;
5467 } 5459 }
5468 /* we must fixup refcnts and charges */ 5460 /* we must fixup refcnts and charges */
5469 if (mc.moved_swap) { 5461 if (mc.moved_swap) {
5470 /* uncharge swap account from the old cgroup */ 5462 /* uncharge swap account from the old cgroup */
5471 if (!mem_cgroup_is_root(mc.from)) 5463 if (!mem_cgroup_is_root(mc.from))
5472 res_counter_uncharge(&mc.from->memsw, 5464 res_counter_uncharge(&mc.from->memsw,
5473 PAGE_SIZE * mc.moved_swap); 5465 PAGE_SIZE * mc.moved_swap);
5474 __mem_cgroup_put(mc.from, mc.moved_swap); 5466 __mem_cgroup_put(mc.from, mc.moved_swap);
5475 5467
5476 if (!mem_cgroup_is_root(mc.to)) { 5468 if (!mem_cgroup_is_root(mc.to)) {
5477 /* 5469 /*
5478 * we charged both to->res and to->memsw, so we should 5470 * we charged both to->res and to->memsw, so we should
5479 * uncharge to->res. 5471 * uncharge to->res.
5480 */ 5472 */
5481 res_counter_uncharge(&mc.to->res, 5473 res_counter_uncharge(&mc.to->res,
5482 PAGE_SIZE * mc.moved_swap); 5474 PAGE_SIZE * mc.moved_swap);
5483 } 5475 }
5484 /* we've already done mem_cgroup_get(mc.to) */ 5476 /* we've already done mem_cgroup_get(mc.to) */
5485 mc.moved_swap = 0; 5477 mc.moved_swap = 0;
5486 } 5478 }
5487 memcg_oom_recover(from); 5479 memcg_oom_recover(from);
5488 memcg_oom_recover(to); 5480 memcg_oom_recover(to);
5489 wake_up_all(&mc.waitq); 5481 wake_up_all(&mc.waitq);
5490 } 5482 }
5491 5483
5492 static void mem_cgroup_clear_mc(void) 5484 static void mem_cgroup_clear_mc(void)
5493 { 5485 {
5494 struct mem_cgroup *from = mc.from; 5486 struct mem_cgroup *from = mc.from;
5495 5487
5496 /* 5488 /*
5497 * we must clear moving_task before waking up waiters at the end of 5489 * we must clear moving_task before waking up waiters at the end of
5498 * task migration. 5490 * task migration.
5499 */ 5491 */
5500 mc.moving_task = NULL; 5492 mc.moving_task = NULL;
5501 __mem_cgroup_clear_mc(); 5493 __mem_cgroup_clear_mc();
5502 spin_lock(&mc.lock); 5494 spin_lock(&mc.lock);
5503 mc.from = NULL; 5495 mc.from = NULL;
5504 mc.to = NULL; 5496 mc.to = NULL;
5505 spin_unlock(&mc.lock); 5497 spin_unlock(&mc.lock);
5506 mem_cgroup_end_move(from); 5498 mem_cgroup_end_move(from);
5507 } 5499 }
5508 5500
5509 static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5501 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5510 struct cgroup *cgroup, 5502 struct cgroup *cgroup,
5511 struct task_struct *p) 5503 struct task_struct *p)
5512 { 5504 {
5513 int ret = 0; 5505 int ret = 0;
5514 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); 5506 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
5515 5507
5516 if (mem->move_charge_at_immigrate) { 5508 if (mem->move_charge_at_immigrate) {
5517 struct mm_struct *mm; 5509 struct mm_struct *mm;
5518 struct mem_cgroup *from = mem_cgroup_from_task(p); 5510 struct mem_cgroup *from = mem_cgroup_from_task(p);
5519 5511
5520 VM_BUG_ON(from == mem); 5512 VM_BUG_ON(from == mem);
5521 5513
5522 mm = get_task_mm(p); 5514 mm = get_task_mm(p);
5523 if (!mm) 5515 if (!mm)
5524 return 0; 5516 return 0;
5525 /* We move charges only when we move a owner of the mm */ 5517 /* We move charges only when we move a owner of the mm */
5526 if (mm->owner == p) { 5518 if (mm->owner == p) {
5527 VM_BUG_ON(mc.from); 5519 VM_BUG_ON(mc.from);
5528 VM_BUG_ON(mc.to); 5520 VM_BUG_ON(mc.to);
5529 VM_BUG_ON(mc.precharge); 5521 VM_BUG_ON(mc.precharge);
5530 VM_BUG_ON(mc.moved_charge); 5522 VM_BUG_ON(mc.moved_charge);
5531 VM_BUG_ON(mc.moved_swap); 5523 VM_BUG_ON(mc.moved_swap);
5532 mem_cgroup_start_move(from); 5524 mem_cgroup_start_move(from);
5533 spin_lock(&mc.lock); 5525 spin_lock(&mc.lock);
5534 mc.from = from; 5526 mc.from = from;
5535 mc.to = mem; 5527 mc.to = mem;
5536 spin_unlock(&mc.lock); 5528 spin_unlock(&mc.lock);
5537 /* We set mc.moving_task later */ 5529 /* We set mc.moving_task later */
5538 5530
5539 ret = mem_cgroup_precharge_mc(mm); 5531 ret = mem_cgroup_precharge_mc(mm);
5540 if (ret) 5532 if (ret)
5541 mem_cgroup_clear_mc(); 5533 mem_cgroup_clear_mc();
5542 } 5534 }
5543 mmput(mm); 5535 mmput(mm);
5544 } 5536 }
5545 return ret; 5537 return ret;
5546 } 5538 }
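
Charge moving only engages when the destination group has memory.move_charge_at_immigrate set and the task being attached owns the mm. From user space the knob is a bitmask written to that file (bit 0 for anonymous pages, bit 1 for file pages, per the memcg documentation of this era, which is what move_anon()/move_file() test). A short sketch, with the cgroup path again assumed.

/* Enable moving of both anon (bit 0) and file (bit 1) charges when a
 * task is attached to the assumed cgroup v1 group "foo". */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

int main(void)
{
	const char *path =
		"/sys/fs/cgroup/memory/foo/memory.move_charge_at_immigrate";
	int fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "3\n", 2) != 2) {         /* 3 = anon + file charges */
		perror("write");
		return 1;
	}
	close(fd);
	return 0;
}
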
5547 5539
5548 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5540 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
5549 struct cgroup *cgroup, 5541 struct cgroup *cgroup,
5550 struct task_struct *p) 5542 struct task_struct *p)
5551 { 5543 {
5552 mem_cgroup_clear_mc(); 5544 mem_cgroup_clear_mc();
5553 } 5545 }
5554 5546
5555 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 5547 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5556 unsigned long addr, unsigned long end, 5548 unsigned long addr, unsigned long end,
5557 struct mm_walk *walk) 5549 struct mm_walk *walk)
5558 { 5550 {
5559 int ret = 0; 5551 int ret = 0;
5560 struct vm_area_struct *vma = walk->private; 5552 struct vm_area_struct *vma = walk->private;
5561 pte_t *pte; 5553 pte_t *pte;
5562 spinlock_t *ptl; 5554 spinlock_t *ptl;
5563 5555
5564 split_huge_page_pmd(walk->mm, pmd); 5556 split_huge_page_pmd(walk->mm, pmd);
5565 retry: 5557 retry:
5566 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5558 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5567 for (; addr != end; addr += PAGE_SIZE) { 5559 for (; addr != end; addr += PAGE_SIZE) {
5568 pte_t ptent = *(pte++); 5560 pte_t ptent = *(pte++);
5569 union mc_target target; 5561 union mc_target target;
5570 int type; 5562 int type;
5571 struct page *page; 5563 struct page *page;
5572 struct page_cgroup *pc; 5564 struct page_cgroup *pc;
5573 swp_entry_t ent; 5565 swp_entry_t ent;
5574 5566
5575 if (!mc.precharge) 5567 if (!mc.precharge)
5576 break; 5568 break;
5577 5569
5578 type = is_target_pte_for_mc(vma, addr, ptent, &target); 5570 type = is_target_pte_for_mc(vma, addr, ptent, &target);
5579 switch (type) { 5571 switch (type) {
5580 case MC_TARGET_PAGE: 5572 case MC_TARGET_PAGE:
5581 page = target.page; 5573 page = target.page;
5582 if (isolate_lru_page(page)) 5574 if (isolate_lru_page(page))
5583 goto put; 5575 goto put;
5584 pc = lookup_page_cgroup(page); 5576 pc = lookup_page_cgroup(page);
5585 if (!mem_cgroup_move_account(page, 1, pc, 5577 if (!mem_cgroup_move_account(page, 1, pc,
5586 mc.from, mc.to, false)) { 5578 mc.from, mc.to, false)) {
5587 mc.precharge--; 5579 mc.precharge--;
5588 /* we uncharge from mc.from later. */ 5580 /* we uncharge from mc.from later. */
5589 mc.moved_charge++; 5581 mc.moved_charge++;
5590 } 5582 }
5591 putback_lru_page(page); 5583 putback_lru_page(page);
5592 put: /* is_target_pte_for_mc() gets the page */ 5584 put: /* is_target_pte_for_mc() gets the page */
5593 put_page(page); 5585 put_page(page);
5594 break; 5586 break;
5595 case MC_TARGET_SWAP: 5587 case MC_TARGET_SWAP:
5596 ent = target.ent; 5588 ent = target.ent;
5597 if (!mem_cgroup_move_swap_account(ent, 5589 if (!mem_cgroup_move_swap_account(ent,
5598 mc.from, mc.to, false)) { 5590 mc.from, mc.to, false)) {
5599 mc.precharge--; 5591 mc.precharge--;
5600 /* we fixup refcnts and charges later. */ 5592 /* we fixup refcnts and charges later. */
5601 mc.moved_swap++; 5593 mc.moved_swap++;
5602 } 5594 }
5603 break; 5595 break;
5604 default: 5596 default:
5605 break; 5597 break;
5606 } 5598 }
5607 } 5599 }
5608 pte_unmap_unlock(pte - 1, ptl); 5600 pte_unmap_unlock(pte - 1, ptl);
5609 cond_resched(); 5601 cond_resched();
5610 5602
5611 if (addr != end) { 5603 if (addr != end) {
5612 /* 5604 /*
5613 * We have consumed all precharges we got in can_attach(). 5605 * We have consumed all precharges we got in can_attach().
5614 * We try charge one by one, but don't do any additional 5606 * We try charge one by one, but don't do any additional
5615 * charges to mc.to if we have failed in charge once in attach() 5607 * charges to mc.to if we have failed in charge once in attach()
5616 * phase. 5608 * phase.
5617 */ 5609 */
5618 ret = mem_cgroup_do_precharge(1); 5610 ret = mem_cgroup_do_precharge(1);
5619 if (!ret) 5611 if (!ret)
5620 goto retry; 5612 goto retry;
5621 } 5613 }
5622 5614
5623 return ret; 5615 return ret;
5624 } 5616 }
5625 5617
5626 static void mem_cgroup_move_charge(struct mm_struct *mm) 5618 static void mem_cgroup_move_charge(struct mm_struct *mm)
5627 { 5619 {
5628 struct vm_area_struct *vma; 5620 struct vm_area_struct *vma;
5629 5621
5630 lru_add_drain_all(); 5622 lru_add_drain_all();
5631 retry: 5623 retry:
5632 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 5624 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
5633 /* 5625 /*
5634 * Someone who are holding the mmap_sem might be waiting in 5626 * Someone who are holding the mmap_sem might be waiting in
5635 * waitq. So we cancel all extra charges, wake up all waiters, 5627 * waitq. So we cancel all extra charges, wake up all waiters,
5636 * and retry. Because we cancel precharges, we might not be able 5628 * and retry. Because we cancel precharges, we might not be able
5637 * to move enough charges, but moving charge is a best-effort 5629 * to move enough charges, but moving charge is a best-effort
5638 * feature anyway, so it wouldn't be a big problem. 5630 * feature anyway, so it wouldn't be a big problem.
5639 */ 5631 */
5640 __mem_cgroup_clear_mc(); 5632 __mem_cgroup_clear_mc();
5641 cond_resched(); 5633 cond_resched();
5642 goto retry; 5634 goto retry;
5643 } 5635 }
5644 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5636 for (vma = mm->mmap; vma; vma = vma->vm_next) {
5645 int ret; 5637 int ret;
5646 struct mm_walk mem_cgroup_move_charge_walk = { 5638 struct mm_walk mem_cgroup_move_charge_walk = {
5647 .pmd_entry = mem_cgroup_move_charge_pte_range, 5639 .pmd_entry = mem_cgroup_move_charge_pte_range,
5648 .mm = mm, 5640 .mm = mm,
5649 .private = vma, 5641 .private = vma,
5650 }; 5642 };
5651 if (is_vm_hugetlb_page(vma)) 5643 if (is_vm_hugetlb_page(vma))
5652 continue; 5644 continue;
5653 ret = walk_page_range(vma->vm_start, vma->vm_end, 5645 ret = walk_page_range(vma->vm_start, vma->vm_end,
5654 &mem_cgroup_move_charge_walk); 5646 &mem_cgroup_move_charge_walk);
5655 if (ret) 5647 if (ret)
5656 /* 5648 /*
5657 * means we have consumed all precharges and failed in 5649 * means we have consumed all precharges and failed in
5658 * doing additional charge. Just abandon here. 5650 * doing additional charge. Just abandon here.
5659 */ 5651 */
5660 break; 5652 break;
5661 } 5653 }
5662 up_read(&mm->mmap_sem); 5654 up_read(&mm->mmap_sem);
5663 } 5655 }
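
mem_cgroup_move_charge() deliberately uses down_read_trylock(): the task currently holding mmap_sem may itself be sleeping in mc.waitq, so on contention the mover cancels its extra charges (which wakes all waiters), reschedules, and retries rather than blocking and deadlocking. A user-space sketch of that trylock/undo/retry shape with a pthread rwlock; the helper names are invented.

/* "Trylock, on failure undo and wake waiters, then retry", cf. the
 * retry loop in mem_cgroup_move_charge().  Helpers are invented. */
#include <pthread.h>
#include <sched.h>

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;

static void cancel_and_wake_waiters(void)
{
	/* stand-in for __mem_cgroup_clear_mc(): give back reservations a
	 * blocked lock holder might be waiting on, and wake it */
}

static void do_move(void)
{
	/* stand-in for the page-table walk that moves the charges */
}

static void move_charges(void)
{
retry:
	if (pthread_rwlock_tryrdlock(&map_lock) != 0) {
		cancel_and_wake_waiters();
		sched_yield();                  /* cf. cond_resched() */
		goto retry;
	}
	do_move();
	pthread_rwlock_unlock(&map_lock);
}

int main(void)
{
	move_charges();
	return 0;
}
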
5664 5656
5665 static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5657 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5666 struct cgroup *cont, 5658 struct cgroup *cont,
5667 struct cgroup *old_cont, 5659 struct cgroup *old_cont,
5668 struct task_struct *p) 5660 struct task_struct *p)
5669 { 5661 {
5670 struct mm_struct *mm = get_task_mm(p); 5662 struct mm_struct *mm = get_task_mm(p);
5671 5663
5672 if (mm) { 5664 if (mm) {
5673 if (mc.to) 5665 if (mc.to)
5674 mem_cgroup_move_charge(mm); 5666 mem_cgroup_move_charge(mm);
5675 put_swap_token(mm); 5667 put_swap_token(mm);
5676 mmput(mm); 5668 mmput(mm);
5677 } 5669 }
5678 if (mc.to) 5670 if (mc.to)
5679 mem_cgroup_clear_mc(); 5671 mem_cgroup_clear_mc();
5680 } 5672 }
5681 #else /* !CONFIG_MMU */ 5673 #else /* !CONFIG_MMU */
5682 static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5674 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5683 struct cgroup *cgroup, 5675 struct cgroup *cgroup,
5684 struct task_struct *p) 5676 struct task_struct *p)
5685 { 5677 {
5686 return 0; 5678 return 0;
5687 } 5679 }
5688 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5680 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
5689 struct cgroup *cgroup, 5681 struct cgroup *cgroup,
5690 struct task_struct *p) 5682 struct task_struct *p)
5691 { 5683 {
5692 } 5684 }
5693 static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5685 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5694 struct cgroup *cont, 5686 struct cgroup *cont,
5695 struct cgroup *old_cont, 5687 struct cgroup *old_cont,
5696 struct task_struct *p) 5688 struct task_struct *p)
5697 { 5689 {
5698 } 5690 }
5699 #endif 5691 #endif
5700 5692
5701 struct cgroup_subsys mem_cgroup_subsys = { 5693 struct cgroup_subsys mem_cgroup_subsys = {
5702 .name = "memory", 5694 .name = "memory",
5703 .subsys_id = mem_cgroup_subsys_id, 5695 .subsys_id = mem_cgroup_subsys_id,
5704 .create = mem_cgroup_create, 5696 .create = mem_cgroup_create,
5705 .pre_destroy = mem_cgroup_pre_destroy, 5697 .pre_destroy = mem_cgroup_pre_destroy,
5706 .destroy = mem_cgroup_destroy, 5698 .destroy = mem_cgroup_destroy,
5707 .populate = mem_cgroup_populate, 5699 .populate = mem_cgroup_populate,
5708 .can_attach = mem_cgroup_can_attach, 5700 .can_attach = mem_cgroup_can_attach,
5709 .cancel_attach = mem_cgroup_cancel_attach, 5701 .cancel_attach = mem_cgroup_cancel_attach,
5710 .attach = mem_cgroup_move_task, 5702 .attach = mem_cgroup_move_task,
5711 .early_init = 0, 5703 .early_init = 0,
5712 .use_id = 1, 5704 .use_id = 1,
5713 }; 5705 };
5714 5706
5715 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5707 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
5716 static int __init enable_swap_account(char *s) 5708 static int __init enable_swap_account(char *s)
5717 { 5709 {
5718 /* consider enabled if no parameter or 1 is given */ 5710 /* consider enabled if no parameter or 1 is given */
5719 if (!strcmp(s, "1")) 5711 if (!strcmp(s, "1"))
5720 really_do_swap_account = 1; 5712 really_do_swap_account = 1;
5721 else if (!strcmp(s, "0")) 5713 else if (!strcmp(s, "0"))
5722 really_do_swap_account = 0; 5714 really_do_swap_account = 0;
5723 return 1; 5715 return 1;
5724 } 5716 }
5725 __setup("swapaccount=", enable_swap_account); 5717 __setup("swapaccount=", enable_swap_account);
5726 5718
5727 #endif 5719 #endif