Commit 39cc98f1f8aa949afeea89f424c7494b0785d7da

Authored by Michal Hocko
Committed by Linus Torvalds
1 parent d149e3b25d

memcg: remove pointless next_mz nullification in mem_cgroup_soft_limit_reclaim()

next_mz is set to NULL when __mem_cgroup_largest_soft_limit_node selects
the same mz.  This doesn't make much sense, because the variable is
reassigned at the top of the next loop iteration anyway.

The compiler will probably optimize the dead store away, but it makes the
code slightly confusing to read.
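
For orientation, a minimal sketch of the pattern being removed (paraphrased
from the description above; this is not the verbatim hunk, and the exact
shape of the surrounding loop in mem_cgroup_soft_limit_reclaim() is assumed):

	do {
		/* pick the next-largest soft-limit offender; may be NULL */
		next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
		if (next_mz == mz) {
			css_put(&next_mz->mem->css);
			next_mz = NULL;	/* dead store removed by this patch:
					 * next_mz is overwritten at the top of
					 * the next pass through this loop */
		} else	/* next_mz == NULL or another memcg */
			break;
	} while (1);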

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 2 additions and 3 deletions

1 /* memcontrol.c - Memory Controller 1 /* memcontrol.c - Memory Controller
2 * 2 *
3 * Copyright IBM Corporation, 2007 3 * Copyright IBM Corporation, 2007
4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5 * 5 *
6 * Copyright 2007 OpenVZ SWsoft Inc 6 * Copyright 2007 OpenVZ SWsoft Inc
7 * Author: Pavel Emelianov <xemul@openvz.org> 7 * Author: Pavel Emelianov <xemul@openvz.org>
8 * 8 *
9 * Memory thresholds 9 * Memory thresholds
10 * Copyright (C) 2009 Nokia Corporation 10 * Copyright (C) 2009 Nokia Corporation
11 * Author: Kirill A. Shutemov 11 * Author: Kirill A. Shutemov
12 * 12 *
13 * This program is free software; you can redistribute it and/or modify 13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by 14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or 15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version. 16 * (at your option) any later version.
17 * 17 *
18 * This program is distributed in the hope that it will be useful, 18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of 19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details. 21 * GNU General Public License for more details.
22 */ 22 */
23 23
24 #include <linux/res_counter.h> 24 #include <linux/res_counter.h>
25 #include <linux/memcontrol.h> 25 #include <linux/memcontrol.h>
26 #include <linux/cgroup.h> 26 #include <linux/cgroup.h>
27 #include <linux/mm.h> 27 #include <linux/mm.h>
28 #include <linux/hugetlb.h> 28 #include <linux/hugetlb.h>
29 #include <linux/pagemap.h> 29 #include <linux/pagemap.h>
30 #include <linux/smp.h> 30 #include <linux/smp.h>
31 #include <linux/page-flags.h> 31 #include <linux/page-flags.h>
32 #include <linux/backing-dev.h> 32 #include <linux/backing-dev.h>
33 #include <linux/bit_spinlock.h> 33 #include <linux/bit_spinlock.h>
34 #include <linux/rcupdate.h> 34 #include <linux/rcupdate.h>
35 #include <linux/limits.h> 35 #include <linux/limits.h>
36 #include <linux/mutex.h> 36 #include <linux/mutex.h>
37 #include <linux/rbtree.h> 37 #include <linux/rbtree.h>
38 #include <linux/slab.h> 38 #include <linux/slab.h>
39 #include <linux/swap.h> 39 #include <linux/swap.h>
40 #include <linux/swapops.h> 40 #include <linux/swapops.h>
41 #include <linux/spinlock.h> 41 #include <linux/spinlock.h>
42 #include <linux/eventfd.h> 42 #include <linux/eventfd.h>
43 #include <linux/sort.h> 43 #include <linux/sort.h>
44 #include <linux/fs.h> 44 #include <linux/fs.h>
45 #include <linux/seq_file.h> 45 #include <linux/seq_file.h>
46 #include <linux/vmalloc.h> 46 #include <linux/vmalloc.h>
47 #include <linux/mm_inline.h> 47 #include <linux/mm_inline.h>
48 #include <linux/page_cgroup.h> 48 #include <linux/page_cgroup.h>
49 #include <linux/cpu.h> 49 #include <linux/cpu.h>
50 #include <linux/oom.h> 50 #include <linux/oom.h>
51 #include "internal.h" 51 #include "internal.h"
52 52
53 #include <asm/uaccess.h> 53 #include <asm/uaccess.h>
54 54
55 #include <trace/events/vmscan.h> 55 #include <trace/events/vmscan.h>
56 56
57 struct cgroup_subsys mem_cgroup_subsys __read_mostly; 57 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
58 #define MEM_CGROUP_RECLAIM_RETRIES 5 58 #define MEM_CGROUP_RECLAIM_RETRIES 5
59 struct mem_cgroup *root_mem_cgroup __read_mostly; 59 struct mem_cgroup *root_mem_cgroup __read_mostly;
60 60
61 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 61 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
62 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 62 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
63 int do_swap_account __read_mostly; 63 int do_swap_account __read_mostly;
64 64
65 /* for remember boot option*/ 65 /* for remember boot option*/
66 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED 66 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
67 static int really_do_swap_account __initdata = 1; 67 static int really_do_swap_account __initdata = 1;
68 #else 68 #else
69 static int really_do_swap_account __initdata = 0; 69 static int really_do_swap_account __initdata = 0;
70 #endif 70 #endif
71 71
72 #else 72 #else
73 #define do_swap_account (0) 73 #define do_swap_account (0)
74 #endif 74 #endif
75 75
76 76
77 /* 77 /*
78 * Statistics for memory cgroup. 78 * Statistics for memory cgroup.
79 */ 79 */
80 enum mem_cgroup_stat_index { 80 enum mem_cgroup_stat_index {
81 /* 81 /*
82 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 82 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
83 */ 83 */
84 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 84 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
85 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 85 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
86 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 86 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
87 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 87 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
88 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ 88 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
89 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ 89 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
90 MEM_CGROUP_STAT_NSTATS, 90 MEM_CGROUP_STAT_NSTATS,
91 }; 91 };
92 92
93 enum mem_cgroup_events_index { 93 enum mem_cgroup_events_index {
94 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ 94 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
95 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ 95 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
96 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ 96 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
97 MEM_CGROUP_EVENTS_NSTATS, 97 MEM_CGROUP_EVENTS_NSTATS,
98 }; 98 };
99 /* 99 /*
100 * Per memcg event counter is incremented at every pagein/pageout. With THP, 100 * Per memcg event counter is incremented at every pagein/pageout. With THP,
101 * it will be incremated by the number of pages. This counter is used for 101 * it will be incremated by the number of pages. This counter is used for
102 * for trigger some periodic events. This is straightforward and better 102 * for trigger some periodic events. This is straightforward and better
103 * than using jiffies etc. to handle periodic memcg event. 103 * than using jiffies etc. to handle periodic memcg event.
104 */ 104 */
105 enum mem_cgroup_events_target { 105 enum mem_cgroup_events_target {
106 MEM_CGROUP_TARGET_THRESH, 106 MEM_CGROUP_TARGET_THRESH,
107 MEM_CGROUP_TARGET_SOFTLIMIT, 107 MEM_CGROUP_TARGET_SOFTLIMIT,
108 MEM_CGROUP_NTARGETS, 108 MEM_CGROUP_NTARGETS,
109 }; 109 };
110 #define THRESHOLDS_EVENTS_TARGET (128) 110 #define THRESHOLDS_EVENTS_TARGET (128)
111 #define SOFTLIMIT_EVENTS_TARGET (1024) 111 #define SOFTLIMIT_EVENTS_TARGET (1024)
112 112
113 struct mem_cgroup_stat_cpu { 113 struct mem_cgroup_stat_cpu {
114 long count[MEM_CGROUP_STAT_NSTATS]; 114 long count[MEM_CGROUP_STAT_NSTATS];
115 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 115 unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
116 unsigned long targets[MEM_CGROUP_NTARGETS]; 116 unsigned long targets[MEM_CGROUP_NTARGETS];
117 }; 117 };
118 118
119 /* 119 /*
120 * per-zone information in memory controller. 120 * per-zone information in memory controller.
121 */ 121 */
122 struct mem_cgroup_per_zone { 122 struct mem_cgroup_per_zone {
123 /* 123 /*
124 * spin_lock to protect the per cgroup LRU 124 * spin_lock to protect the per cgroup LRU
125 */ 125 */
126 struct list_head lists[NR_LRU_LISTS]; 126 struct list_head lists[NR_LRU_LISTS];
127 unsigned long count[NR_LRU_LISTS]; 127 unsigned long count[NR_LRU_LISTS];
128 128
129 struct zone_reclaim_stat reclaim_stat; 129 struct zone_reclaim_stat reclaim_stat;
130 struct rb_node tree_node; /* RB tree node */ 130 struct rb_node tree_node; /* RB tree node */
131 unsigned long long usage_in_excess;/* Set to the value by which */ 131 unsigned long long usage_in_excess;/* Set to the value by which */
132 /* the soft limit is exceeded*/ 132 /* the soft limit is exceeded*/
133 bool on_tree; 133 bool on_tree;
134 struct mem_cgroup *mem; /* Back pointer, we cannot */ 134 struct mem_cgroup *mem; /* Back pointer, we cannot */
135 /* use container_of */ 135 /* use container_of */
136 }; 136 };
137 /* Macro for accessing counter */ 137 /* Macro for accessing counter */
138 #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 138 #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
139 139
140 struct mem_cgroup_per_node { 140 struct mem_cgroup_per_node {
141 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 141 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
142 }; 142 };
143 143
144 struct mem_cgroup_lru_info { 144 struct mem_cgroup_lru_info {
145 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; 145 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
146 }; 146 };
147 147
148 /* 148 /*
149 * Cgroups above their limits are maintained in a RB-Tree, independent of 149 * Cgroups above their limits are maintained in a RB-Tree, independent of
150 * their hierarchy representation 150 * their hierarchy representation
151 */ 151 */
152 152
153 struct mem_cgroup_tree_per_zone { 153 struct mem_cgroup_tree_per_zone {
154 struct rb_root rb_root; 154 struct rb_root rb_root;
155 spinlock_t lock; 155 spinlock_t lock;
156 }; 156 };
157 157
158 struct mem_cgroup_tree_per_node { 158 struct mem_cgroup_tree_per_node {
159 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 159 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
160 }; 160 };
161 161
162 struct mem_cgroup_tree { 162 struct mem_cgroup_tree {
163 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 163 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
164 }; 164 };
165 165
166 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 166 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
167 167
168 struct mem_cgroup_threshold { 168 struct mem_cgroup_threshold {
169 struct eventfd_ctx *eventfd; 169 struct eventfd_ctx *eventfd;
170 u64 threshold; 170 u64 threshold;
171 }; 171 };
172 172
173 /* For threshold */ 173 /* For threshold */
174 struct mem_cgroup_threshold_ary { 174 struct mem_cgroup_threshold_ary {
175 /* An array index points to threshold just below usage. */ 175 /* An array index points to threshold just below usage. */
176 int current_threshold; 176 int current_threshold;
177 /* Size of entries[] */ 177 /* Size of entries[] */
178 unsigned int size; 178 unsigned int size;
179 /* Array of thresholds */ 179 /* Array of thresholds */
180 struct mem_cgroup_threshold entries[0]; 180 struct mem_cgroup_threshold entries[0];
181 }; 181 };
182 182
183 struct mem_cgroup_thresholds { 183 struct mem_cgroup_thresholds {
184 /* Primary thresholds array */ 184 /* Primary thresholds array */
185 struct mem_cgroup_threshold_ary *primary; 185 struct mem_cgroup_threshold_ary *primary;
186 /* 186 /*
187 * Spare threshold array. 187 * Spare threshold array.
188 * This is needed to make mem_cgroup_unregister_event() "never fail". 188 * This is needed to make mem_cgroup_unregister_event() "never fail".
189 * It must be able to store at least primary->size - 1 entries. 189 * It must be able to store at least primary->size - 1 entries.
190 */ 190 */
191 struct mem_cgroup_threshold_ary *spare; 191 struct mem_cgroup_threshold_ary *spare;
192 }; 192 };
193 193
194 /* for OOM */ 194 /* for OOM */
195 struct mem_cgroup_eventfd_list { 195 struct mem_cgroup_eventfd_list {
196 struct list_head list; 196 struct list_head list;
197 struct eventfd_ctx *eventfd; 197 struct eventfd_ctx *eventfd;
198 }; 198 };
199 199
200 static void mem_cgroup_threshold(struct mem_cgroup *mem); 200 static void mem_cgroup_threshold(struct mem_cgroup *mem);
201 static void mem_cgroup_oom_notify(struct mem_cgroup *mem); 201 static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
202 202
203 /* 203 /*
204 * The memory controller data structure. The memory controller controls both 204 * The memory controller data structure. The memory controller controls both
205 * page cache and RSS per cgroup. We would eventually like to provide 205 * page cache and RSS per cgroup. We would eventually like to provide
206 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 206 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
207 * to help the administrator determine what knobs to tune. 207 * to help the administrator determine what knobs to tune.
208 * 208 *
209 * TODO: Add a water mark for the memory controller. Reclaim will begin when 209 * TODO: Add a water mark for the memory controller. Reclaim will begin when
210 * we hit the water mark. May be even add a low water mark, such that 210 * we hit the water mark. May be even add a low water mark, such that
211 * no reclaim occurs from a cgroup at it's low water mark, this is 211 * no reclaim occurs from a cgroup at it's low water mark, this is
212 * a feature that will be implemented much later in the future. 212 * a feature that will be implemented much later in the future.
213 */ 213 */
214 struct mem_cgroup { 214 struct mem_cgroup {
215 struct cgroup_subsys_state css; 215 struct cgroup_subsys_state css;
216 /* 216 /*
217 * the counter to account for memory usage 217 * the counter to account for memory usage
218 */ 218 */
219 struct res_counter res; 219 struct res_counter res;
220 /* 220 /*
221 * the counter to account for mem+swap usage. 221 * the counter to account for mem+swap usage.
222 */ 222 */
223 struct res_counter memsw; 223 struct res_counter memsw;
224 /* 224 /*
225 * Per cgroup active and inactive list, similar to the 225 * Per cgroup active and inactive list, similar to the
226 * per zone LRU lists. 226 * per zone LRU lists.
227 */ 227 */
228 struct mem_cgroup_lru_info info; 228 struct mem_cgroup_lru_info info;
229 /* 229 /*
230 * While reclaiming in a hierarchy, we cache the last child we 230 * While reclaiming in a hierarchy, we cache the last child we
231 * reclaimed from. 231 * reclaimed from.
232 */ 232 */
233 int last_scanned_child; 233 int last_scanned_child;
234 /* 234 /*
235 * Should the accounting and control be hierarchical, per subtree? 235 * Should the accounting and control be hierarchical, per subtree?
236 */ 236 */
237 bool use_hierarchy; 237 bool use_hierarchy;
238 atomic_t oom_lock; 238 atomic_t oom_lock;
239 atomic_t refcnt; 239 atomic_t refcnt;
240 240
241 unsigned int swappiness; 241 unsigned int swappiness;
242 /* OOM-Killer disable */ 242 /* OOM-Killer disable */
243 int oom_kill_disable; 243 int oom_kill_disable;
244 244
245 /* set when res.limit == memsw.limit */ 245 /* set when res.limit == memsw.limit */
246 bool memsw_is_minimum; 246 bool memsw_is_minimum;
247 247
248 /* protect arrays of thresholds */ 248 /* protect arrays of thresholds */
249 struct mutex thresholds_lock; 249 struct mutex thresholds_lock;
250 250
251 /* thresholds for memory usage. RCU-protected */ 251 /* thresholds for memory usage. RCU-protected */
252 struct mem_cgroup_thresholds thresholds; 252 struct mem_cgroup_thresholds thresholds;
253 253
254 /* thresholds for mem+swap usage. RCU-protected */ 254 /* thresholds for mem+swap usage. RCU-protected */
255 struct mem_cgroup_thresholds memsw_thresholds; 255 struct mem_cgroup_thresholds memsw_thresholds;
256 256
257 /* For oom notifier event fd */ 257 /* For oom notifier event fd */
258 struct list_head oom_notify; 258 struct list_head oom_notify;
259 259
260 /* 260 /*
261 * Should we move charges of a task when a task is moved into this 261 * Should we move charges of a task when a task is moved into this
262 * mem_cgroup ? And what type of charges should we move ? 262 * mem_cgroup ? And what type of charges should we move ?
263 */ 263 */
264 unsigned long move_charge_at_immigrate; 264 unsigned long move_charge_at_immigrate;
265 /* 265 /*
266 * percpu counter. 266 * percpu counter.
267 */ 267 */
268 struct mem_cgroup_stat_cpu *stat; 268 struct mem_cgroup_stat_cpu *stat;
269 /* 269 /*
270 * used when a cpu is offlined or other synchronizations 270 * used when a cpu is offlined or other synchronizations
271 * See mem_cgroup_read_stat(). 271 * See mem_cgroup_read_stat().
272 */ 272 */
273 struct mem_cgroup_stat_cpu nocpu_base; 273 struct mem_cgroup_stat_cpu nocpu_base;
274 spinlock_t pcp_counter_lock; 274 spinlock_t pcp_counter_lock;
275 }; 275 };
276 276
277 /* Stuffs for move charges at task migration. */ 277 /* Stuffs for move charges at task migration. */
278 /* 278 /*
279 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a 279 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
280 * left-shifted bitmap of these types. 280 * left-shifted bitmap of these types.
281 */ 281 */
282 enum move_type { 282 enum move_type {
283 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 283 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
284 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ 284 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */
285 NR_MOVE_TYPE, 285 NR_MOVE_TYPE,
286 }; 286 };
287 287
288 /* "mc" and its members are protected by cgroup_mutex */ 288 /* "mc" and its members are protected by cgroup_mutex */
289 static struct move_charge_struct { 289 static struct move_charge_struct {
290 spinlock_t lock; /* for from, to */ 290 spinlock_t lock; /* for from, to */
291 struct mem_cgroup *from; 291 struct mem_cgroup *from;
292 struct mem_cgroup *to; 292 struct mem_cgroup *to;
293 unsigned long precharge; 293 unsigned long precharge;
294 unsigned long moved_charge; 294 unsigned long moved_charge;
295 unsigned long moved_swap; 295 unsigned long moved_swap;
296 struct task_struct *moving_task; /* a task moving charges */ 296 struct task_struct *moving_task; /* a task moving charges */
297 wait_queue_head_t waitq; /* a waitq for other context */ 297 wait_queue_head_t waitq; /* a waitq for other context */
298 } mc = { 298 } mc = {
299 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 299 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
300 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 300 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
301 }; 301 };
302 302
303 static bool move_anon(void) 303 static bool move_anon(void)
304 { 304 {
305 return test_bit(MOVE_CHARGE_TYPE_ANON, 305 return test_bit(MOVE_CHARGE_TYPE_ANON,
306 &mc.to->move_charge_at_immigrate); 306 &mc.to->move_charge_at_immigrate);
307 } 307 }
308 308
309 static bool move_file(void) 309 static bool move_file(void)
310 { 310 {
311 return test_bit(MOVE_CHARGE_TYPE_FILE, 311 return test_bit(MOVE_CHARGE_TYPE_FILE,
312 &mc.to->move_charge_at_immigrate); 312 &mc.to->move_charge_at_immigrate);
313 } 313 }
314 314
315 /* 315 /*
316 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 316 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
317 * limit reclaim to prevent infinite loops, if they ever occur. 317 * limit reclaim to prevent infinite loops, if they ever occur.
318 */ 318 */
319 #define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) 319 #define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)
320 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) 320 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
321 321
322 enum charge_type { 322 enum charge_type {
323 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 323 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
324 MEM_CGROUP_CHARGE_TYPE_MAPPED, 324 MEM_CGROUP_CHARGE_TYPE_MAPPED,
325 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ 325 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
326 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 326 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
327 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 327 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
328 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 328 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
329 NR_CHARGE_TYPE, 329 NR_CHARGE_TYPE,
330 }; 330 };
331 331
332 /* for encoding cft->private value on file */ 332 /* for encoding cft->private value on file */
333 #define _MEM (0) 333 #define _MEM (0)
334 #define _MEMSWAP (1) 334 #define _MEMSWAP (1)
335 #define _OOM_TYPE (2) 335 #define _OOM_TYPE (2)
336 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 336 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
337 #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 337 #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
338 #define MEMFILE_ATTR(val) ((val) & 0xffff) 338 #define MEMFILE_ATTR(val) ((val) & 0xffff)
339 /* Used for OOM nofiier */ 339 /* Used for OOM nofiier */
340 #define OOM_CONTROL (0) 340 #define OOM_CONTROL (0)
341 341
342 /* 342 /*
343 * Reclaim flags for mem_cgroup_hierarchical_reclaim 343 * Reclaim flags for mem_cgroup_hierarchical_reclaim
344 */ 344 */
345 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 345 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
346 #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 346 #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
347 #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 347 #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
348 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 348 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
349 #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 349 #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
350 #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) 350 #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
351 351
352 static void mem_cgroup_get(struct mem_cgroup *mem); 352 static void mem_cgroup_get(struct mem_cgroup *mem);
353 static void mem_cgroup_put(struct mem_cgroup *mem); 353 static void mem_cgroup_put(struct mem_cgroup *mem);
354 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 354 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
355 static void drain_all_stock_async(void); 355 static void drain_all_stock_async(void);
356 356
357 static struct mem_cgroup_per_zone * 357 static struct mem_cgroup_per_zone *
358 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 358 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
359 { 359 {
360 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 360 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
361 } 361 }
362 362
363 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) 363 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
364 { 364 {
365 return &mem->css; 365 return &mem->css;
366 } 366 }
367 367
368 static struct mem_cgroup_per_zone * 368 static struct mem_cgroup_per_zone *
369 page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page) 369 page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
370 { 370 {
371 int nid = page_to_nid(page); 371 int nid = page_to_nid(page);
372 int zid = page_zonenum(page); 372 int zid = page_zonenum(page);
373 373
374 return mem_cgroup_zoneinfo(mem, nid, zid); 374 return mem_cgroup_zoneinfo(mem, nid, zid);
375 } 375 }
376 376
377 static struct mem_cgroup_tree_per_zone * 377 static struct mem_cgroup_tree_per_zone *
378 soft_limit_tree_node_zone(int nid, int zid) 378 soft_limit_tree_node_zone(int nid, int zid)
379 { 379 {
380 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 380 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
381 } 381 }
382 382
383 static struct mem_cgroup_tree_per_zone * 383 static struct mem_cgroup_tree_per_zone *
384 soft_limit_tree_from_page(struct page *page) 384 soft_limit_tree_from_page(struct page *page)
385 { 385 {
386 int nid = page_to_nid(page); 386 int nid = page_to_nid(page);
387 int zid = page_zonenum(page); 387 int zid = page_zonenum(page);
388 388
389 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 389 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
390 } 390 }
391 391
392 static void 392 static void
393 __mem_cgroup_insert_exceeded(struct mem_cgroup *mem, 393 __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
394 struct mem_cgroup_per_zone *mz, 394 struct mem_cgroup_per_zone *mz,
395 struct mem_cgroup_tree_per_zone *mctz, 395 struct mem_cgroup_tree_per_zone *mctz,
396 unsigned long long new_usage_in_excess) 396 unsigned long long new_usage_in_excess)
397 { 397 {
398 struct rb_node **p = &mctz->rb_root.rb_node; 398 struct rb_node **p = &mctz->rb_root.rb_node;
399 struct rb_node *parent = NULL; 399 struct rb_node *parent = NULL;
400 struct mem_cgroup_per_zone *mz_node; 400 struct mem_cgroup_per_zone *mz_node;
401 401
402 if (mz->on_tree) 402 if (mz->on_tree)
403 return; 403 return;
404 404
405 mz->usage_in_excess = new_usage_in_excess; 405 mz->usage_in_excess = new_usage_in_excess;
406 if (!mz->usage_in_excess) 406 if (!mz->usage_in_excess)
407 return; 407 return;
408 while (*p) { 408 while (*p) {
409 parent = *p; 409 parent = *p;
410 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 410 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
411 tree_node); 411 tree_node);
412 if (mz->usage_in_excess < mz_node->usage_in_excess) 412 if (mz->usage_in_excess < mz_node->usage_in_excess)
413 p = &(*p)->rb_left; 413 p = &(*p)->rb_left;
414 /* 414 /*
415 * We can't avoid mem cgroups that are over their soft 415 * We can't avoid mem cgroups that are over their soft
416 * limit by the same amount 416 * limit by the same amount
417 */ 417 */
418 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 418 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
419 p = &(*p)->rb_right; 419 p = &(*p)->rb_right;
420 } 420 }
421 rb_link_node(&mz->tree_node, parent, p); 421 rb_link_node(&mz->tree_node, parent, p);
422 rb_insert_color(&mz->tree_node, &mctz->rb_root); 422 rb_insert_color(&mz->tree_node, &mctz->rb_root);
423 mz->on_tree = true; 423 mz->on_tree = true;
424 } 424 }
425 425
426 static void 426 static void
427 __mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 427 __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
428 struct mem_cgroup_per_zone *mz, 428 struct mem_cgroup_per_zone *mz,
429 struct mem_cgroup_tree_per_zone *mctz) 429 struct mem_cgroup_tree_per_zone *mctz)
430 { 430 {
431 if (!mz->on_tree) 431 if (!mz->on_tree)
432 return; 432 return;
433 rb_erase(&mz->tree_node, &mctz->rb_root); 433 rb_erase(&mz->tree_node, &mctz->rb_root);
434 mz->on_tree = false; 434 mz->on_tree = false;
435 } 435 }
436 436
437 static void 437 static void
438 mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 438 mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
439 struct mem_cgroup_per_zone *mz, 439 struct mem_cgroup_per_zone *mz,
440 struct mem_cgroup_tree_per_zone *mctz) 440 struct mem_cgroup_tree_per_zone *mctz)
441 { 441 {
442 spin_lock(&mctz->lock); 442 spin_lock(&mctz->lock);
443 __mem_cgroup_remove_exceeded(mem, mz, mctz); 443 __mem_cgroup_remove_exceeded(mem, mz, mctz);
444 spin_unlock(&mctz->lock); 444 spin_unlock(&mctz->lock);
445 } 445 }
446 446
447 447
448 static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) 448 static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
449 { 449 {
450 unsigned long long excess; 450 unsigned long long excess;
451 struct mem_cgroup_per_zone *mz; 451 struct mem_cgroup_per_zone *mz;
452 struct mem_cgroup_tree_per_zone *mctz; 452 struct mem_cgroup_tree_per_zone *mctz;
453 int nid = page_to_nid(page); 453 int nid = page_to_nid(page);
454 int zid = page_zonenum(page); 454 int zid = page_zonenum(page);
455 mctz = soft_limit_tree_from_page(page); 455 mctz = soft_limit_tree_from_page(page);
456 456
457 /* 457 /*
458 * Necessary to update all ancestors when hierarchy is used. 458 * Necessary to update all ancestors when hierarchy is used.
459 * because their event counter is not touched. 459 * because their event counter is not touched.
460 */ 460 */
461 for (; mem; mem = parent_mem_cgroup(mem)) { 461 for (; mem; mem = parent_mem_cgroup(mem)) {
462 mz = mem_cgroup_zoneinfo(mem, nid, zid); 462 mz = mem_cgroup_zoneinfo(mem, nid, zid);
463 excess = res_counter_soft_limit_excess(&mem->res); 463 excess = res_counter_soft_limit_excess(&mem->res);
464 /* 464 /*
465 * We have to update the tree if mz is on RB-tree or 465 * We have to update the tree if mz is on RB-tree or
466 * mem is over its softlimit. 466 * mem is over its softlimit.
467 */ 467 */
468 if (excess || mz->on_tree) { 468 if (excess || mz->on_tree) {
469 spin_lock(&mctz->lock); 469 spin_lock(&mctz->lock);
470 /* if on-tree, remove it */ 470 /* if on-tree, remove it */
471 if (mz->on_tree) 471 if (mz->on_tree)
472 __mem_cgroup_remove_exceeded(mem, mz, mctz); 472 __mem_cgroup_remove_exceeded(mem, mz, mctz);
473 /* 473 /*
474 * Insert again. mz->usage_in_excess will be updated. 474 * Insert again. mz->usage_in_excess will be updated.
475 * If excess is 0, no tree ops. 475 * If excess is 0, no tree ops.
476 */ 476 */
477 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); 477 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
478 spin_unlock(&mctz->lock); 478 spin_unlock(&mctz->lock);
479 } 479 }
480 } 480 }
481 } 481 }
482 482
483 static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) 483 static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
484 { 484 {
485 int node, zone; 485 int node, zone;
486 struct mem_cgroup_per_zone *mz; 486 struct mem_cgroup_per_zone *mz;
487 struct mem_cgroup_tree_per_zone *mctz; 487 struct mem_cgroup_tree_per_zone *mctz;
488 488
489 for_each_node_state(node, N_POSSIBLE) { 489 for_each_node_state(node, N_POSSIBLE) {
490 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 490 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
491 mz = mem_cgroup_zoneinfo(mem, node, zone); 491 mz = mem_cgroup_zoneinfo(mem, node, zone);
492 mctz = soft_limit_tree_node_zone(node, zone); 492 mctz = soft_limit_tree_node_zone(node, zone);
493 mem_cgroup_remove_exceeded(mem, mz, mctz); 493 mem_cgroup_remove_exceeded(mem, mz, mctz);
494 } 494 }
495 } 495 }
496 } 496 }
497 497
498 static struct mem_cgroup_per_zone * 498 static struct mem_cgroup_per_zone *
499 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 499 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
500 { 500 {
501 struct rb_node *rightmost = NULL; 501 struct rb_node *rightmost = NULL;
502 struct mem_cgroup_per_zone *mz; 502 struct mem_cgroup_per_zone *mz;
503 503
504 retry: 504 retry:
505 mz = NULL; 505 mz = NULL;
506 rightmost = rb_last(&mctz->rb_root); 506 rightmost = rb_last(&mctz->rb_root);
507 if (!rightmost) 507 if (!rightmost)
508 goto done; /* Nothing to reclaim from */ 508 goto done; /* Nothing to reclaim from */
509 509
510 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 510 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
511 /* 511 /*
512 * Remove the node now but someone else can add it back, 512 * Remove the node now but someone else can add it back,
513 * we will to add it back at the end of reclaim to its correct 513 * we will to add it back at the end of reclaim to its correct
514 * position in the tree. 514 * position in the tree.
515 */ 515 */
516 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 516 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
517 if (!res_counter_soft_limit_excess(&mz->mem->res) || 517 if (!res_counter_soft_limit_excess(&mz->mem->res) ||
518 !css_tryget(&mz->mem->css)) 518 !css_tryget(&mz->mem->css))
519 goto retry; 519 goto retry;
520 done: 520 done:
521 return mz; 521 return mz;
522 } 522 }
523 523
524 static struct mem_cgroup_per_zone * 524 static struct mem_cgroup_per_zone *
525 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 525 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
526 { 526 {
527 struct mem_cgroup_per_zone *mz; 527 struct mem_cgroup_per_zone *mz;
528 528
529 spin_lock(&mctz->lock); 529 spin_lock(&mctz->lock);
530 mz = __mem_cgroup_largest_soft_limit_node(mctz); 530 mz = __mem_cgroup_largest_soft_limit_node(mctz);
531 spin_unlock(&mctz->lock); 531 spin_unlock(&mctz->lock);
532 return mz; 532 return mz;
533 } 533 }
534 534
535 /* 535 /*
536 * Implementation Note: reading percpu statistics for memcg. 536 * Implementation Note: reading percpu statistics for memcg.
537 * 537 *
538 * Both of vmstat[] and percpu_counter has threshold and do periodic 538 * Both of vmstat[] and percpu_counter has threshold and do periodic
539 * synchronization to implement "quick" read. There are trade-off between 539 * synchronization to implement "quick" read. There are trade-off between
540 * reading cost and precision of value. Then, we may have a chance to implement 540 * reading cost and precision of value. Then, we may have a chance to implement
541 * a periodic synchronizion of counter in memcg's counter. 541 * a periodic synchronizion of counter in memcg's counter.
542 * 542 *
543 * But this _read() function is used for user interface now. The user accounts 543 * But this _read() function is used for user interface now. The user accounts
544 * memory usage by memory cgroup and he _always_ requires exact value because 544 * memory usage by memory cgroup and he _always_ requires exact value because
545 * he accounts memory. Even if we provide quick-and-fuzzy read, we always 545 * he accounts memory. Even if we provide quick-and-fuzzy read, we always
546 * have to visit all online cpus and make sum. So, for now, unnecessary 546 * have to visit all online cpus and make sum. So, for now, unnecessary
547 * synchronization is not implemented. (just implemented for cpu hotplug) 547 * synchronization is not implemented. (just implemented for cpu hotplug)
548 * 548 *
549 * If there are kernel internal actions which can make use of some not-exact 549 * If there are kernel internal actions which can make use of some not-exact
550 * value, and reading all cpu value can be performance bottleneck in some 550 * value, and reading all cpu value can be performance bottleneck in some
551 * common workload, threashold and synchonization as vmstat[] should be 551 * common workload, threashold and synchonization as vmstat[] should be
552 * implemented. 552 * implemented.
553 */ 553 */
554 static long mem_cgroup_read_stat(struct mem_cgroup *mem, 554 static long mem_cgroup_read_stat(struct mem_cgroup *mem,
555 enum mem_cgroup_stat_index idx) 555 enum mem_cgroup_stat_index idx)
556 { 556 {
557 long val = 0; 557 long val = 0;
558 int cpu; 558 int cpu;
559 559
560 get_online_cpus(); 560 get_online_cpus();
561 for_each_online_cpu(cpu) 561 for_each_online_cpu(cpu)
562 val += per_cpu(mem->stat->count[idx], cpu); 562 val += per_cpu(mem->stat->count[idx], cpu);
563 #ifdef CONFIG_HOTPLUG_CPU 563 #ifdef CONFIG_HOTPLUG_CPU
564 spin_lock(&mem->pcp_counter_lock); 564 spin_lock(&mem->pcp_counter_lock);
565 val += mem->nocpu_base.count[idx]; 565 val += mem->nocpu_base.count[idx];
566 spin_unlock(&mem->pcp_counter_lock); 566 spin_unlock(&mem->pcp_counter_lock);
567 #endif 567 #endif
568 put_online_cpus(); 568 put_online_cpus();
569 return val; 569 return val;
570 } 570 }
571 571
572 static long mem_cgroup_local_usage(struct mem_cgroup *mem) 572 static long mem_cgroup_local_usage(struct mem_cgroup *mem)
573 { 573 {
574 long ret; 574 long ret;
575 575
576 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 576 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
577 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 577 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
578 return ret; 578 return ret;
579 } 579 }
580 580
581 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, 581 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
582 bool charge) 582 bool charge)
583 { 583 {
584 int val = (charge) ? 1 : -1; 584 int val = (charge) ? 1 : -1;
585 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 585 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
586 } 586 }
587 587
588 static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, 588 static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
589 enum mem_cgroup_events_index idx) 589 enum mem_cgroup_events_index idx)
590 { 590 {
591 unsigned long val = 0; 591 unsigned long val = 0;
592 int cpu; 592 int cpu;
593 593
594 for_each_online_cpu(cpu) 594 for_each_online_cpu(cpu)
595 val += per_cpu(mem->stat->events[idx], cpu); 595 val += per_cpu(mem->stat->events[idx], cpu);
596 #ifdef CONFIG_HOTPLUG_CPU 596 #ifdef CONFIG_HOTPLUG_CPU
597 spin_lock(&mem->pcp_counter_lock); 597 spin_lock(&mem->pcp_counter_lock);
598 val += mem->nocpu_base.events[idx]; 598 val += mem->nocpu_base.events[idx];
599 spin_unlock(&mem->pcp_counter_lock); 599 spin_unlock(&mem->pcp_counter_lock);
600 #endif 600 #endif
601 return val; 601 return val;
602 } 602 }
603 603
604 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 604 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
605 bool file, int nr_pages) 605 bool file, int nr_pages)
606 { 606 {
607 preempt_disable(); 607 preempt_disable();
608 608
609 if (file) 609 if (file)
610 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); 610 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
611 else 611 else
612 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); 612 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
613 613
614 /* pagein of a big page is an event. So, ignore page size */ 614 /* pagein of a big page is an event. So, ignore page size */
615 if (nr_pages > 0) 615 if (nr_pages > 0)
616 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); 616 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
617 else { 617 else {
618 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); 618 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
619 nr_pages = -nr_pages; /* for event */ 619 nr_pages = -nr_pages; /* for event */
620 } 620 }
621 621
622 __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); 622 __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
623 623
624 preempt_enable(); 624 preempt_enable();
625 } 625 }
626 626
627 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 627 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
628 enum lru_list idx) 628 enum lru_list idx)
629 { 629 {
630 int nid, zid; 630 int nid, zid;
631 struct mem_cgroup_per_zone *mz; 631 struct mem_cgroup_per_zone *mz;
632 u64 total = 0; 632 u64 total = 0;
633 633
634 for_each_online_node(nid) 634 for_each_online_node(nid)
635 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 635 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
636 mz = mem_cgroup_zoneinfo(mem, nid, zid); 636 mz = mem_cgroup_zoneinfo(mem, nid, zid);
637 total += MEM_CGROUP_ZSTAT(mz, idx); 637 total += MEM_CGROUP_ZSTAT(mz, idx);
638 } 638 }
639 return total; 639 return total;
640 } 640 }
641 641
642 static bool __memcg_event_check(struct mem_cgroup *mem, int target) 642 static bool __memcg_event_check(struct mem_cgroup *mem, int target)
643 { 643 {
644 unsigned long val, next; 644 unsigned long val, next;
645 645
646 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); 646 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
647 next = this_cpu_read(mem->stat->targets[target]); 647 next = this_cpu_read(mem->stat->targets[target]);
648 /* from time_after() in jiffies.h */ 648 /* from time_after() in jiffies.h */
649 return ((long)next - (long)val < 0); 649 return ((long)next - (long)val < 0);
650 } 650 }
651 651
652 static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) 652 static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
653 { 653 {
654 unsigned long val, next; 654 unsigned long val, next;
655 655
656 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); 656 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
657 657
658 switch (target) { 658 switch (target) {
659 case MEM_CGROUP_TARGET_THRESH: 659 case MEM_CGROUP_TARGET_THRESH:
660 next = val + THRESHOLDS_EVENTS_TARGET; 660 next = val + THRESHOLDS_EVENTS_TARGET;
661 break; 661 break;
662 case MEM_CGROUP_TARGET_SOFTLIMIT: 662 case MEM_CGROUP_TARGET_SOFTLIMIT:
663 next = val + SOFTLIMIT_EVENTS_TARGET; 663 next = val + SOFTLIMIT_EVENTS_TARGET;
664 break; 664 break;
665 default: 665 default:
666 return; 666 return;
667 } 667 }
668 668
669 this_cpu_write(mem->stat->targets[target], next); 669 this_cpu_write(mem->stat->targets[target], next);
670 } 670 }
671 671
672 /* 672 /*
673 * Check events in order. 673 * Check events in order.
674 * 674 *
675 */ 675 */
676 static void memcg_check_events(struct mem_cgroup *mem, struct page *page) 676 static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
677 { 677 {
678 /* threshold event is triggered in finer grain than soft limit */ 678 /* threshold event is triggered in finer grain than soft limit */
679 if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { 679 if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) {
680 mem_cgroup_threshold(mem); 680 mem_cgroup_threshold(mem);
681 __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); 681 __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
682 if (unlikely(__memcg_event_check(mem, 682 if (unlikely(__memcg_event_check(mem,
683 MEM_CGROUP_TARGET_SOFTLIMIT))){ 683 MEM_CGROUP_TARGET_SOFTLIMIT))){
684 mem_cgroup_update_tree(mem, page); 684 mem_cgroup_update_tree(mem, page);
685 __mem_cgroup_target_update(mem, 685 __mem_cgroup_target_update(mem,
686 MEM_CGROUP_TARGET_SOFTLIMIT); 686 MEM_CGROUP_TARGET_SOFTLIMIT);
687 } 687 }
688 } 688 }
689 } 689 }
690 690
691 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 691 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
692 { 692 {
693 return container_of(cgroup_subsys_state(cont, 693 return container_of(cgroup_subsys_state(cont,
694 mem_cgroup_subsys_id), struct mem_cgroup, 694 mem_cgroup_subsys_id), struct mem_cgroup,
695 css); 695 css);
696 } 696 }
697 697
698 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 698 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
699 { 699 {
700 /* 700 /*
701 * mm_update_next_owner() may clear mm->owner to NULL 701 * mm_update_next_owner() may clear mm->owner to NULL
702 * if it races with swapoff, page migration, etc. 702 * if it races with swapoff, page migration, etc.
703 * So this can be called with p == NULL. 703 * So this can be called with p == NULL.
704 */ 704 */
705 if (unlikely(!p)) 705 if (unlikely(!p))
706 return NULL; 706 return NULL;
707 707
708 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 708 return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
709 struct mem_cgroup, css); 709 struct mem_cgroup, css);
710 } 710 }
711 711
712 static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 712 static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
713 { 713 {
714 struct mem_cgroup *mem = NULL; 714 struct mem_cgroup *mem = NULL;
715 715
716 if (!mm) 716 if (!mm)
717 return NULL; 717 return NULL;
718 /* 718 /*
719 * Because we have no locks, mm->owner's may be being moved to other 719 * Because we have no locks, mm->owner's may be being moved to other
720 * cgroup. We use css_tryget() here even if this looks 720 * cgroup. We use css_tryget() here even if this looks
721 * pessimistic (rather than adding locks here). 721 * pessimistic (rather than adding locks here).
722 */ 722 */
723 rcu_read_lock(); 723 rcu_read_lock();
724 do { 724 do {
725 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 725 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
726 if (unlikely(!mem)) 726 if (unlikely(!mem))
727 break; 727 break;
728 } while (!css_tryget(&mem->css)); 728 } while (!css_tryget(&mem->css));
729 rcu_read_unlock(); 729 rcu_read_unlock();
730 return mem; 730 return mem;
731 } 731 }
732 732
733 /* The caller has to guarantee "mem" exists before calling this */ 733 /* The caller has to guarantee "mem" exists before calling this */
734 static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) 734 static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
735 { 735 {
736 struct cgroup_subsys_state *css; 736 struct cgroup_subsys_state *css;
737 int found; 737 int found;
738 738
739 if (!mem) /* ROOT cgroup has the smallest ID */ 739 if (!mem) /* ROOT cgroup has the smallest ID */
740 return root_mem_cgroup; /*css_put/get against root is ignored*/ 740 return root_mem_cgroup; /*css_put/get against root is ignored*/
741 if (!mem->use_hierarchy) { 741 if (!mem->use_hierarchy) {
742 if (css_tryget(&mem->css)) 742 if (css_tryget(&mem->css))
743 return mem; 743 return mem;
744 return NULL; 744 return NULL;
745 } 745 }
746 rcu_read_lock(); 746 rcu_read_lock();
747 /* 747 /*
748 * searching a memory cgroup which has the smallest ID under given 748 * searching a memory cgroup which has the smallest ID under given
749 * ROOT cgroup. (ID >= 1) 749 * ROOT cgroup. (ID >= 1)
750 */ 750 */
751 css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found); 751 css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
752 if (css && css_tryget(css)) 752 if (css && css_tryget(css))
753 mem = container_of(css, struct mem_cgroup, css); 753 mem = container_of(css, struct mem_cgroup, css);
754 else 754 else
755 mem = NULL; 755 mem = NULL;
756 rcu_read_unlock(); 756 rcu_read_unlock();
757 return mem; 757 return mem;
758 } 758 }
759 759
760 static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, 760 static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
761 struct mem_cgroup *root, 761 struct mem_cgroup *root,
762 bool cond) 762 bool cond)
763 { 763 {
764 int nextid = css_id(&iter->css) + 1; 764 int nextid = css_id(&iter->css) + 1;
765 int found; 765 int found;
766 int hierarchy_used; 766 int hierarchy_used;
767 struct cgroup_subsys_state *css; 767 struct cgroup_subsys_state *css;
768 768
769 hierarchy_used = iter->use_hierarchy; 769 hierarchy_used = iter->use_hierarchy;
770 770
771 css_put(&iter->css); 771 css_put(&iter->css);
772 /* If no ROOT, walk all, ignore hierarchy */ 772 /* If no ROOT, walk all, ignore hierarchy */
773 if (!cond || (root && !hierarchy_used)) 773 if (!cond || (root && !hierarchy_used))
774 return NULL; 774 return NULL;
775 775
776 if (!root) 776 if (!root)
777 root = root_mem_cgroup; 777 root = root_mem_cgroup;
778 778
779 do { 779 do {
780 iter = NULL; 780 iter = NULL;
781 rcu_read_lock(); 781 rcu_read_lock();
782 782
783 css = css_get_next(&mem_cgroup_subsys, nextid, 783 css = css_get_next(&mem_cgroup_subsys, nextid,
784 &root->css, &found); 784 &root->css, &found);
785 if (css && css_tryget(css)) 785 if (css && css_tryget(css))
786 iter = container_of(css, struct mem_cgroup, css); 786 iter = container_of(css, struct mem_cgroup, css);
787 rcu_read_unlock(); 787 rcu_read_unlock();
788 /* If css is NULL, no more cgroups will be found */ 788 /* If css is NULL, no more cgroups will be found */
789 nextid = found + 1; 789 nextid = found + 1;
790 } while (css && !iter); 790 } while (css && !iter);
791 791
792 return iter; 792 return iter;
793 } 793 }
794 /* 794 /*
795 * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please 795 * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please
796 * be careful that "break" loop is not allowed. We have reference count. 796 * be careful that "break" loop is not allowed. We have reference count.
797 * Instead of that modify "cond" to be false and "continue" to exit the loop. 797 * Instead of that modify "cond" to be false and "continue" to exit the loop.
798 */ 798 */
799 #define for_each_mem_cgroup_tree_cond(iter, root, cond) \ 799 #define for_each_mem_cgroup_tree_cond(iter, root, cond) \
800 for (iter = mem_cgroup_start_loop(root);\ 800 for (iter = mem_cgroup_start_loop(root);\
801 iter != NULL;\ 801 iter != NULL;\
802 iter = mem_cgroup_get_next(iter, root, cond)) 802 iter = mem_cgroup_get_next(iter, root, cond))
803 803
804 #define for_each_mem_cgroup_tree(iter, root) \ 804 #define for_each_mem_cgroup_tree(iter, root) \
805 for_each_mem_cgroup_tree_cond(iter, root, true) 805 for_each_mem_cgroup_tree_cond(iter, root, true)
806 806
807 #define for_each_mem_cgroup_all(iter) \ 807 #define for_each_mem_cgroup_all(iter) \
808 for_each_mem_cgroup_tree_cond(iter, NULL, true) 808 for_each_mem_cgroup_tree_cond(iter, NULL, true)
809 809
810 810
811 static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 811 static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
812 { 812 {
813 return (mem == root_mem_cgroup); 813 return (mem == root_mem_cgroup);
814 } 814 }
815 815
816 /* 816 /*
817 * Following LRU functions are allowed to be used without PCG_LOCK. 817 * Following LRU functions are allowed to be used without PCG_LOCK.
818 * Operations are called by routine of global LRU independently from memcg. 818 * Operations are called by routine of global LRU independently from memcg.
819 * What we have to take care of here is validness of pc->mem_cgroup. 819 * What we have to take care of here is validness of pc->mem_cgroup.
820 * 820 *
821 * Changes to pc->mem_cgroup happens when 821 * Changes to pc->mem_cgroup happens when
822 * 1. charge 822 * 1. charge
823 * 2. moving account 823 * 2. moving account
824 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. 824 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
825 * It is added to LRU before charge. 825 * It is added to LRU before charge.
826 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. 826 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
827 * When moving account, the page is not on LRU. It's isolated. 827 * When moving account, the page is not on LRU. It's isolated.
828 */ 828 */
829 829
830 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 830 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
831 { 831 {
832 struct page_cgroup *pc; 832 struct page_cgroup *pc;
833 struct mem_cgroup_per_zone *mz; 833 struct mem_cgroup_per_zone *mz;
834 834
835 if (mem_cgroup_disabled()) 835 if (mem_cgroup_disabled())
836 return; 836 return;
837 pc = lookup_page_cgroup(page); 837 pc = lookup_page_cgroup(page);
838 /* can happen while we handle swapcache. */ 838 /* can happen while we handle swapcache. */
839 if (!TestClearPageCgroupAcctLRU(pc)) 839 if (!TestClearPageCgroupAcctLRU(pc))
840 return; 840 return;
841 VM_BUG_ON(!pc->mem_cgroup); 841 VM_BUG_ON(!pc->mem_cgroup);
842 /* 842 /*
843 * We don't check PCG_USED bit. It's cleared when the "page" is finally 843 * We don't check PCG_USED bit. It's cleared when the "page" is finally
844 * removed from global LRU. 844 * removed from global LRU.
845 */ 845 */
846 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 846 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
847 /* huge page split is done under lru_lock. so, we have no races. */ 847 /* huge page split is done under lru_lock. so, we have no races. */
848 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); 848 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
849 if (mem_cgroup_is_root(pc->mem_cgroup)) 849 if (mem_cgroup_is_root(pc->mem_cgroup))
850 return; 850 return;
851 VM_BUG_ON(list_empty(&pc->lru)); 851 VM_BUG_ON(list_empty(&pc->lru));
852 list_del_init(&pc->lru); 852 list_del_init(&pc->lru);
853 } 853 }
854 854
855 void mem_cgroup_del_lru(struct page *page) 855 void mem_cgroup_del_lru(struct page *page)
856 { 856 {
857 mem_cgroup_del_lru_list(page, page_lru(page)); 857 mem_cgroup_del_lru_list(page, page_lru(page));
858 } 858 }
859 859
860 /* 860 /*
861 * Writeback is about to end against a page which has been marked for immediate 861 * Writeback is about to end against a page which has been marked for immediate
862 * reclaim. If it still appears to be reclaimable, move it to the tail of the 862 * reclaim. If it still appears to be reclaimable, move it to the tail of the
863 * inactive list. 863 * inactive list.
864 */ 864 */
865 void mem_cgroup_rotate_reclaimable_page(struct page *page) 865 void mem_cgroup_rotate_reclaimable_page(struct page *page)
866 { 866 {
867 struct mem_cgroup_per_zone *mz; 867 struct mem_cgroup_per_zone *mz;
868 struct page_cgroup *pc; 868 struct page_cgroup *pc;
869 enum lru_list lru = page_lru(page); 869 enum lru_list lru = page_lru(page);
870 870
871 if (mem_cgroup_disabled()) 871 if (mem_cgroup_disabled())
872 return; 872 return;
873 873
874 pc = lookup_page_cgroup(page); 874 pc = lookup_page_cgroup(page);
875 /* unused or root page is not rotated. */ 875 /* unused or root page is not rotated. */
876 if (!PageCgroupUsed(pc)) 876 if (!PageCgroupUsed(pc))
877 return; 877 return;
878 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 878 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
879 smp_rmb(); 879 smp_rmb();
880 if (mem_cgroup_is_root(pc->mem_cgroup)) 880 if (mem_cgroup_is_root(pc->mem_cgroup))
881 return; 881 return;
882 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 882 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
883 list_move_tail(&pc->lru, &mz->lists[lru]); 883 list_move_tail(&pc->lru, &mz->lists[lru]);
884 } 884 }
885 885
886 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 886 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
887 { 887 {
888 struct mem_cgroup_per_zone *mz; 888 struct mem_cgroup_per_zone *mz;
889 struct page_cgroup *pc; 889 struct page_cgroup *pc;
890 890
891 if (mem_cgroup_disabled()) 891 if (mem_cgroup_disabled())
892 return; 892 return;
893 893
894 pc = lookup_page_cgroup(page); 894 pc = lookup_page_cgroup(page);
895 /* unused or root page is not rotated. */ 895 /* unused or root page is not rotated. */
896 if (!PageCgroupUsed(pc)) 896 if (!PageCgroupUsed(pc))
897 return; 897 return;
898 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 898 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
899 smp_rmb(); 899 smp_rmb();
900 if (mem_cgroup_is_root(pc->mem_cgroup)) 900 if (mem_cgroup_is_root(pc->mem_cgroup))
901 return; 901 return;
902 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 902 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
903 list_move(&pc->lru, &mz->lists[lru]); 903 list_move(&pc->lru, &mz->lists[lru]);
904 } 904 }
905 905
906 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) 906 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
907 { 907 {
908 struct page_cgroup *pc; 908 struct page_cgroup *pc;
909 struct mem_cgroup_per_zone *mz; 909 struct mem_cgroup_per_zone *mz;
910 910
911 if (mem_cgroup_disabled()) 911 if (mem_cgroup_disabled())
912 return; 912 return;
913 pc = lookup_page_cgroup(page); 913 pc = lookup_page_cgroup(page);
914 VM_BUG_ON(PageCgroupAcctLRU(pc)); 914 VM_BUG_ON(PageCgroupAcctLRU(pc));
915 if (!PageCgroupUsed(pc)) 915 if (!PageCgroupUsed(pc))
916 return; 916 return;
917 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 917 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
918 smp_rmb(); 918 smp_rmb();
919 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 919 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
920 /* huge page split is done under lru_lock. so, we have no races. */ 920 /* huge page split is done under lru_lock. so, we have no races. */
921 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); 921 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
922 SetPageCgroupAcctLRU(pc); 922 SetPageCgroupAcctLRU(pc);
923 if (mem_cgroup_is_root(pc->mem_cgroup)) 923 if (mem_cgroup_is_root(pc->mem_cgroup))
924 return; 924 return;
925 list_add(&pc->lru, &mz->lists[lru]); 925 list_add(&pc->lru, &mz->lists[lru]);
926 } 926 }
927 927
928 /* 928 /*
929 * When handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed 929 * When handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
930 * while it's linked to the lru, because the page may be reused after it's fully 930 * while it's linked to the lru, because the page may be reused after it's fully
931 * uncharged. To handle that, unlink page_cgroup from the LRU when charging it again. 931 * uncharged. To handle that, unlink page_cgroup from the LRU when charging it again.
932 * It's done under lock_page and it is expected that zone->lru_lock is never held. 932 * It's done under lock_page and it is expected that zone->lru_lock is never held.
933 */ 933 */
934 static void mem_cgroup_lru_del_before_commit(struct page *page) 934 static void mem_cgroup_lru_del_before_commit(struct page *page)
935 { 935 {
936 unsigned long flags; 936 unsigned long flags;
937 struct zone *zone = page_zone(page); 937 struct zone *zone = page_zone(page);
938 struct page_cgroup *pc = lookup_page_cgroup(page); 938 struct page_cgroup *pc = lookup_page_cgroup(page);
939 939
940 /* 940 /*
941 * Doing this check without taking ->lru_lock seems wrong, but this 941 * Doing this check without taking ->lru_lock seems wrong, but this
942 * is safe, because if page_cgroup's USED bit is unset, the page 942 * is safe, because if page_cgroup's USED bit is unset, the page
943 * will not be added to any memcg's LRU. If page_cgroup's USED bit is 943 * will not be added to any memcg's LRU. If page_cgroup's USED bit is
944 * set, the commit after this will fail anyway. 944 * set, the commit after this will fail anyway.
945 * All of this charge/uncharge is done under some mutual exclusion. 945 * All of this charge/uncharge is done under some mutual exclusion.
946 * So, we don't need to take care of changes in the USED bit. 946 * So, we don't need to take care of changes in the USED bit.
947 */ 947 */
948 if (likely(!PageLRU(page))) 948 if (likely(!PageLRU(page)))
949 return; 949 return;
950 950
951 spin_lock_irqsave(&zone->lru_lock, flags); 951 spin_lock_irqsave(&zone->lru_lock, flags);
952 /* 952 /*
953 * Forget old LRU when this page_cgroup is *not* used. This Used bit 953 * Forget old LRU when this page_cgroup is *not* used. This Used bit
954 * is guarded by lock_page() because the page is SwapCache. 954 * is guarded by lock_page() because the page is SwapCache.
955 */ 955 */
956 if (!PageCgroupUsed(pc)) 956 if (!PageCgroupUsed(pc))
957 mem_cgroup_del_lru_list(page, page_lru(page)); 957 mem_cgroup_del_lru_list(page, page_lru(page));
958 spin_unlock_irqrestore(&zone->lru_lock, flags); 958 spin_unlock_irqrestore(&zone->lru_lock, flags);
959 } 959 }
960 960
961 static void mem_cgroup_lru_add_after_commit(struct page *page) 961 static void mem_cgroup_lru_add_after_commit(struct page *page)
962 { 962 {
963 unsigned long flags; 963 unsigned long flags;
964 struct zone *zone = page_zone(page); 964 struct zone *zone = page_zone(page);
965 struct page_cgroup *pc = lookup_page_cgroup(page); 965 struct page_cgroup *pc = lookup_page_cgroup(page);
966 966
967 /* take care of the case where the page is added to LRU while we commit it */ 967 /* take care of the case where the page is added to LRU while we commit it */
968 if (likely(!PageLRU(page))) 968 if (likely(!PageLRU(page)))
969 return; 969 return;
970 spin_lock_irqsave(&zone->lru_lock, flags); 970 spin_lock_irqsave(&zone->lru_lock, flags);
971 /* link when the page is linked to LRU but page_cgroup isn't */ 971 /* link when the page is linked to LRU but page_cgroup isn't */
972 if (PageLRU(page) && !PageCgroupAcctLRU(pc)) 972 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
973 mem_cgroup_add_lru_list(page, page_lru(page)); 973 mem_cgroup_add_lru_list(page, page_lru(page));
974 spin_unlock_irqrestore(&zone->lru_lock, flags); 974 spin_unlock_irqrestore(&zone->lru_lock, flags);
975 } 975 }
976 976
977 977
978 void mem_cgroup_move_lists(struct page *page, 978 void mem_cgroup_move_lists(struct page *page,
979 enum lru_list from, enum lru_list to) 979 enum lru_list from, enum lru_list to)
980 { 980 {
981 if (mem_cgroup_disabled()) 981 if (mem_cgroup_disabled())
982 return; 982 return;
983 mem_cgroup_del_lru_list(page, from); 983 mem_cgroup_del_lru_list(page, from);
984 mem_cgroup_add_lru_list(page, to); 984 mem_cgroup_add_lru_list(page, to);
985 } 985 }
986 986
987 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 987 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
988 { 988 {
989 int ret; 989 int ret;
990 struct mem_cgroup *curr = NULL; 990 struct mem_cgroup *curr = NULL;
991 struct task_struct *p; 991 struct task_struct *p;
992 992
993 p = find_lock_task_mm(task); 993 p = find_lock_task_mm(task);
994 if (!p) 994 if (!p)
995 return 0; 995 return 0;
996 curr = try_get_mem_cgroup_from_mm(p->mm); 996 curr = try_get_mem_cgroup_from_mm(p->mm);
997 task_unlock(p); 997 task_unlock(p);
998 if (!curr) 998 if (!curr)
999 return 0; 999 return 0;
1000 /* 1000 /*
1001 * We should check use_hierarchy of "mem", not "curr". Checking 1001 * We should check use_hierarchy of "mem", not "curr". Checking
1002 * use_hierarchy of "curr" here would make this function true if hierarchy is 1002 * use_hierarchy of "curr" here would make this function true if hierarchy is
1003 * enabled in "curr" and "curr" is a child of "mem" in the *cgroup* 1003 * enabled in "curr" and "curr" is a child of "mem" in the *cgroup*
1004 * hierarchy (even if use_hierarchy is disabled in "mem"). 1004 * hierarchy (even if use_hierarchy is disabled in "mem").
1005 */ 1005 */
1006 if (mem->use_hierarchy) 1006 if (mem->use_hierarchy)
1007 ret = css_is_ancestor(&curr->css, &mem->css); 1007 ret = css_is_ancestor(&curr->css, &mem->css);
1008 else 1008 else
1009 ret = (curr == mem); 1009 ret = (curr == mem);
1010 css_put(&curr->css); 1010 css_put(&curr->css);
1011 return ret; 1011 return ret;
1012 } 1012 }
1013 1013
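A membership test like the one above either walks ancestors (when use_hierarchy is set) or compares pointers directly. As a rough user-space illustration, not part of memcontrol.c, with a hypothetical fake_memcg type and an is_ancestor() helper standing in for css_is_ancestor():

/* Hypothetical stand-ins for the hierarchy test used in task_in_mem_cgroup(). */
struct fake_memcg {
	struct fake_memcg *parent;
	int use_hierarchy;
};

/* true if "ancestor" is "child" itself or lies above it */
static int is_ancestor(struct fake_memcg *ancestor, struct fake_memcg *child)
{
	for (; child; child = child->parent)
		if (child == ancestor)
			return 1;
	return 0;
}

static int task_memcg_matches(struct fake_memcg *mem, struct fake_memcg *curr)
{
	/* mirror the use_hierarchy decision: ancestor walk vs. plain equality */
	return mem->use_hierarchy ? is_ancestor(mem, curr) : (curr == mem);
}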
1014 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 1014 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
1015 { 1015 {
1016 unsigned long active; 1016 unsigned long active;
1017 unsigned long inactive; 1017 unsigned long inactive;
1018 unsigned long gb; 1018 unsigned long gb;
1019 unsigned long inactive_ratio; 1019 unsigned long inactive_ratio;
1020 1020
1021 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); 1021 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
1022 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); 1022 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
1023 1023
1024 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1024 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1025 if (gb) 1025 if (gb)
1026 inactive_ratio = int_sqrt(10 * gb); 1026 inactive_ratio = int_sqrt(10 * gb);
1027 else 1027 else
1028 inactive_ratio = 1; 1028 inactive_ratio = 1;
1029 1029
1030 if (present_pages) { 1030 if (present_pages) {
1031 present_pages[0] = inactive; 1031 present_pages[0] = inactive;
1032 present_pages[1] = active; 1032 present_pages[1] = active;
1033 } 1033 }
1034 1034
1035 return inactive_ratio; 1035 return inactive_ratio;
1036 } 1036 }
1037 1037
1038 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) 1038 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
1039 { 1039 {
1040 unsigned long active; 1040 unsigned long active;
1041 unsigned long inactive; 1041 unsigned long inactive;
1042 unsigned long present_pages[2]; 1042 unsigned long present_pages[2];
1043 unsigned long inactive_ratio; 1043 unsigned long inactive_ratio;
1044 1044
1045 inactive_ratio = calc_inactive_ratio(memcg, present_pages); 1045 inactive_ratio = calc_inactive_ratio(memcg, present_pages);
1046 1046
1047 inactive = present_pages[0]; 1047 inactive = present_pages[0];
1048 active = present_pages[1]; 1048 active = present_pages[1];
1049 1049
1050 if (inactive * inactive_ratio < active) 1050 if (inactive * inactive_ratio < active)
1051 return 1; 1051 return 1;
1052 1052
1053 return 0; 1053 return 0;
1054 } 1054 }
1055 1055
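The ratio above grows with the square root of the anon working-set size in gigabytes, so bigger memcgs tolerate proportionally smaller inactive lists. A stand-alone sketch of the same arithmetic, assuming 4KiB pages and using libm's sqrt() in place of the kernel's int_sqrt() (an illustration, not the kernel implementation):

#include <math.h>
#include <stdio.h>

#define SKETCH_PAGE_SHIFT 12	/* assume 4KiB pages for the example */

/* same shape as calc_inactive_ratio(): ratio = sqrt(10 * size-in-GB), min 1 */
static unsigned long inactive_ratio(unsigned long inactive, unsigned long active)
{
	unsigned long gb = (inactive + active) >> (30 - SKETCH_PAGE_SHIFT);

	return gb ? (unsigned long)sqrt(10.0 * gb) : 1;
}

int main(void)
{
	unsigned long inactive = 1UL << 18;	/* 1GiB of inactive anon pages */
	unsigned long active = 3UL << 18;	/* 3GiB of active anon pages */
	unsigned long ratio = inactive_ratio(inactive, active);

	/* inactive_anon_is_low: low when inactive * ratio < active */
	printf("ratio=%lu low=%d\n", ratio, inactive * ratio < active);
	return 0;
}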
1056 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) 1056 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
1057 { 1057 {
1058 unsigned long active; 1058 unsigned long active;
1059 unsigned long inactive; 1059 unsigned long inactive;
1060 1060
1061 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); 1061 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
1062 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); 1062 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
1063 1063
1064 return (active > inactive); 1064 return (active > inactive);
1065 } 1065 }
1066 1066
1067 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, 1067 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
1068 struct zone *zone, 1068 struct zone *zone,
1069 enum lru_list lru) 1069 enum lru_list lru)
1070 { 1070 {
1071 int nid = zone_to_nid(zone); 1071 int nid = zone_to_nid(zone);
1072 int zid = zone_idx(zone); 1072 int zid = zone_idx(zone);
1073 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 1073 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1074 1074
1075 return MEM_CGROUP_ZSTAT(mz, lru); 1075 return MEM_CGROUP_ZSTAT(mz, lru);
1076 } 1076 }
1077 1077
1078 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 1078 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1079 struct zone *zone) 1079 struct zone *zone)
1080 { 1080 {
1081 int nid = zone_to_nid(zone); 1081 int nid = zone_to_nid(zone);
1082 int zid = zone_idx(zone); 1082 int zid = zone_idx(zone);
1083 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 1083 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1084 1084
1085 return &mz->reclaim_stat; 1085 return &mz->reclaim_stat;
1086 } 1086 }
1087 1087
1088 struct zone_reclaim_stat * 1088 struct zone_reclaim_stat *
1089 mem_cgroup_get_reclaim_stat_from_page(struct page *page) 1089 mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1090 { 1090 {
1091 struct page_cgroup *pc; 1091 struct page_cgroup *pc;
1092 struct mem_cgroup_per_zone *mz; 1092 struct mem_cgroup_per_zone *mz;
1093 1093
1094 if (mem_cgroup_disabled()) 1094 if (mem_cgroup_disabled())
1095 return NULL; 1095 return NULL;
1096 1096
1097 pc = lookup_page_cgroup(page); 1097 pc = lookup_page_cgroup(page);
1098 if (!PageCgroupUsed(pc)) 1098 if (!PageCgroupUsed(pc))
1099 return NULL; 1099 return NULL;
1100 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1100 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1101 smp_rmb(); 1101 smp_rmb();
1102 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 1102 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1103 return &mz->reclaim_stat; 1103 return &mz->reclaim_stat;
1104 } 1104 }
1105 1105
1106 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 1106 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1107 struct list_head *dst, 1107 struct list_head *dst,
1108 unsigned long *scanned, int order, 1108 unsigned long *scanned, int order,
1109 int mode, struct zone *z, 1109 int mode, struct zone *z,
1110 struct mem_cgroup *mem_cont, 1110 struct mem_cgroup *mem_cont,
1111 int active, int file) 1111 int active, int file)
1112 { 1112 {
1113 unsigned long nr_taken = 0; 1113 unsigned long nr_taken = 0;
1114 struct page *page; 1114 struct page *page;
1115 unsigned long scan; 1115 unsigned long scan;
1116 LIST_HEAD(pc_list); 1116 LIST_HEAD(pc_list);
1117 struct list_head *src; 1117 struct list_head *src;
1118 struct page_cgroup *pc, *tmp; 1118 struct page_cgroup *pc, *tmp;
1119 int nid = zone_to_nid(z); 1119 int nid = zone_to_nid(z);
1120 int zid = zone_idx(z); 1120 int zid = zone_idx(z);
1121 struct mem_cgroup_per_zone *mz; 1121 struct mem_cgroup_per_zone *mz;
1122 int lru = LRU_FILE * file + active; 1122 int lru = LRU_FILE * file + active;
1123 int ret; 1123 int ret;
1124 1124
1125 BUG_ON(!mem_cont); 1125 BUG_ON(!mem_cont);
1126 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 1126 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
1127 src = &mz->lists[lru]; 1127 src = &mz->lists[lru];
1128 1128
1129 scan = 0; 1129 scan = 0;
1130 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 1130 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
1131 if (scan >= nr_to_scan) 1131 if (scan >= nr_to_scan)
1132 break; 1132 break;
1133 1133
1134 if (unlikely(!PageCgroupUsed(pc))) 1134 if (unlikely(!PageCgroupUsed(pc)))
1135 continue; 1135 continue;
1136 1136
1137 page = lookup_cgroup_page(pc); 1137 page = lookup_cgroup_page(pc);
1138 1138
1139 if (unlikely(!PageLRU(page))) 1139 if (unlikely(!PageLRU(page)))
1140 continue; 1140 continue;
1141 1141
1142 scan++; 1142 scan++;
1143 ret = __isolate_lru_page(page, mode, file); 1143 ret = __isolate_lru_page(page, mode, file);
1144 switch (ret) { 1144 switch (ret) {
1145 case 0: 1145 case 0:
1146 list_move(&page->lru, dst); 1146 list_move(&page->lru, dst);
1147 mem_cgroup_del_lru(page); 1147 mem_cgroup_del_lru(page);
1148 nr_taken += hpage_nr_pages(page); 1148 nr_taken += hpage_nr_pages(page);
1149 break; 1149 break;
1150 case -EBUSY: 1150 case -EBUSY:
1151 /* we don't affect global LRU but rotate in our LRU */ 1151 /* we don't affect global LRU but rotate in our LRU */
1152 mem_cgroup_rotate_lru_list(page, page_lru(page)); 1152 mem_cgroup_rotate_lru_list(page, page_lru(page));
1153 break; 1153 break;
1154 default: 1154 default:
1155 break; 1155 break;
1156 } 1156 }
1157 } 1157 }
1158 1158
1159 *scanned = scan; 1159 *scanned = scan;
1160 1160
1161 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, 1161 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
1162 0, 0, 0, mode); 1162 0, 0, 0, mode);
1163 1163
1164 return nr_taken; 1164 return nr_taken;
1165 } 1165 }
1166 1166
1167 #define mem_cgroup_from_res_counter(counter, member) \ 1167 #define mem_cgroup_from_res_counter(counter, member) \
1168 container_of(counter, struct mem_cgroup, member) 1168 container_of(counter, struct mem_cgroup, member)
1169 1169
1170 /** 1170 /**
1171 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1171 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1172 * @mem: the memory cgroup 1172 * @mem: the memory cgroup
1173 * 1173 *
1174 * Returns the maximum amount of memory @mem can be charged with, in 1174 * Returns the maximum amount of memory @mem can be charged with, in
1175 * pages. 1175 * pages.
1176 */ 1176 */
1177 static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) 1177 static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
1178 { 1178 {
1179 unsigned long long margin; 1179 unsigned long long margin;
1180 1180
1181 margin = res_counter_margin(&mem->res); 1181 margin = res_counter_margin(&mem->res);
1182 if (do_swap_account) 1182 if (do_swap_account)
1183 margin = min(margin, res_counter_margin(&mem->memsw)); 1183 margin = min(margin, res_counter_margin(&mem->memsw));
1184 return margin >> PAGE_SHIFT; 1184 return margin >> PAGE_SHIFT;
1185 } 1185 }
1186 1186
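mem_cgroup_margin() reports the smaller of the two res_counter headrooms, converted from bytes to pages. A minimal user-space sketch, with a hypothetical sketch_counter in place of struct res_counter:

/* Hypothetical stand-in for a res_counter: limit and usage in bytes. */
struct sketch_counter {
	unsigned long long limit;
	unsigned long long usage;
};

#define SKETCH_PAGE_SHIFT 12	/* assume 4KiB pages */

static unsigned long long counter_margin(const struct sketch_counter *c)
{
	return c->limit > c->usage ? c->limit - c->usage : 0;
}

/* chargeable space in pages: bounded by "res" and, if swap is accounted, "memsw" */
static unsigned long sketch_margin(const struct sketch_counter *res,
				   const struct sketch_counter *memsw,
				   int do_swap_account)
{
	unsigned long long margin = counter_margin(res);

	if (do_swap_account && counter_margin(memsw) < margin)
		margin = counter_margin(memsw);
	return (unsigned long)(margin >> SKETCH_PAGE_SHIFT);
}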
1187 static unsigned int get_swappiness(struct mem_cgroup *memcg) 1187 static unsigned int get_swappiness(struct mem_cgroup *memcg)
1188 { 1188 {
1189 struct cgroup *cgrp = memcg->css.cgroup; 1189 struct cgroup *cgrp = memcg->css.cgroup;
1190 1190
1191 /* root ? */ 1191 /* root ? */
1192 if (cgrp->parent == NULL) 1192 if (cgrp->parent == NULL)
1193 return vm_swappiness; 1193 return vm_swappiness;
1194 1194
1195 return memcg->swappiness; 1195 return memcg->swappiness;
1196 } 1196 }
1197 1197
1198 static void mem_cgroup_start_move(struct mem_cgroup *mem) 1198 static void mem_cgroup_start_move(struct mem_cgroup *mem)
1199 { 1199 {
1200 int cpu; 1200 int cpu;
1201 1201
1202 get_online_cpus(); 1202 get_online_cpus();
1203 spin_lock(&mem->pcp_counter_lock); 1203 spin_lock(&mem->pcp_counter_lock);
1204 for_each_online_cpu(cpu) 1204 for_each_online_cpu(cpu)
1205 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; 1205 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1206 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; 1206 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1207 spin_unlock(&mem->pcp_counter_lock); 1207 spin_unlock(&mem->pcp_counter_lock);
1208 put_online_cpus(); 1208 put_online_cpus();
1209 1209
1210 synchronize_rcu(); 1210 synchronize_rcu();
1211 } 1211 }
1212 1212
1213 static void mem_cgroup_end_move(struct mem_cgroup *mem) 1213 static void mem_cgroup_end_move(struct mem_cgroup *mem)
1214 { 1214 {
1215 int cpu; 1215 int cpu;
1216 1216
1217 if (!mem) 1217 if (!mem)
1218 return; 1218 return;
1219 get_online_cpus(); 1219 get_online_cpus();
1220 spin_lock(&mem->pcp_counter_lock); 1220 spin_lock(&mem->pcp_counter_lock);
1221 for_each_online_cpu(cpu) 1221 for_each_online_cpu(cpu)
1222 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; 1222 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1223 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; 1223 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1224 spin_unlock(&mem->pcp_counter_lock); 1224 spin_unlock(&mem->pcp_counter_lock);
1225 put_online_cpus(); 1225 put_online_cpus();
1226 } 1226 }
1227 /* 1227 /*
1228 * Two routines for checking whether "mem" is under move_account() or not. 1228 * Two routines for checking whether "mem" is under move_account() or not.
1229 * 1229 *
1230 * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used 1230 * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used
1231 * for avoiding race in accounting. If true, 1231 * for avoiding race in accounting. If true,
1232 * pc->mem_cgroup may be overwritten. 1232 * pc->mem_cgroup may be overwritten.
1233 * 1233 *
1234 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or 1234 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
1235 * under hierarchy of moving cgroups. This is for 1235 * under hierarchy of moving cgroups. This is for
1236 * waiting at high memory pressure caused by "move". 1236 * waiting at high memory pressure caused by "move".
1237 */ 1237 */
1238 1238
1239 static bool mem_cgroup_stealed(struct mem_cgroup *mem) 1239 static bool mem_cgroup_stealed(struct mem_cgroup *mem)
1240 { 1240 {
1241 VM_BUG_ON(!rcu_read_lock_held()); 1241 VM_BUG_ON(!rcu_read_lock_held());
1242 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; 1242 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1243 } 1243 }
1244 1244
1245 static bool mem_cgroup_under_move(struct mem_cgroup *mem) 1245 static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1246 { 1246 {
1247 struct mem_cgroup *from; 1247 struct mem_cgroup *from;
1248 struct mem_cgroup *to; 1248 struct mem_cgroup *to;
1249 bool ret = false; 1249 bool ret = false;
1250 /* 1250 /*
1251 * Unlike task_move routines, we access mc.to, mc.from not under 1251 * Unlike task_move routines, we access mc.to, mc.from not under
1252 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1252 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1253 */ 1253 */
1254 spin_lock(&mc.lock); 1254 spin_lock(&mc.lock);
1255 from = mc.from; 1255 from = mc.from;
1256 to = mc.to; 1256 to = mc.to;
1257 if (!from) 1257 if (!from)
1258 goto unlock; 1258 goto unlock;
1259 if (from == mem || to == mem 1259 if (from == mem || to == mem
1260 || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) 1260 || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
1261 || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) 1261 || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
1262 ret = true; 1262 ret = true;
1263 unlock: 1263 unlock:
1264 spin_unlock(&mc.lock); 1264 spin_unlock(&mc.lock);
1265 return ret; 1265 return ret;
1266 } 1266 }
1267 1267
1268 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) 1268 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
1269 { 1269 {
1270 if (mc.moving_task && current != mc.moving_task) { 1270 if (mc.moving_task && current != mc.moving_task) {
1271 if (mem_cgroup_under_move(mem)) { 1271 if (mem_cgroup_under_move(mem)) {
1272 DEFINE_WAIT(wait); 1272 DEFINE_WAIT(wait);
1273 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1273 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1274 /* moving charge context might have finished. */ 1274 /* moving charge context might have finished. */
1275 if (mc.moving_task) 1275 if (mc.moving_task)
1276 schedule(); 1276 schedule();
1277 finish_wait(&mc.waitq, &wait); 1277 finish_wait(&mc.waitq, &wait);
1278 return true; 1278 return true;
1279 } 1279 }
1280 } 1280 }
1281 return false; 1281 return false;
1282 } 1282 }
1283 1283
1284 /** 1284 /**
1285 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1285 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1286 * @memcg: The memory cgroup that went over limit 1286 * @memcg: The memory cgroup that went over limit
1287 * @p: Task that is going to be killed 1287 * @p: Task that is going to be killed
1288 * 1288 *
1289 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1289 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1290 * enabled 1290 * enabled
1291 */ 1291 */
1292 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1292 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1293 { 1293 {
1294 struct cgroup *task_cgrp; 1294 struct cgroup *task_cgrp;
1295 struct cgroup *mem_cgrp; 1295 struct cgroup *mem_cgrp;
1296 /* 1296 /*
1297 * Need a buffer in BSS, can't rely on allocations. The code relies 1297 * Need a buffer in BSS, can't rely on allocations. The code relies
1298 * on the assumption that OOM is serialized for memory controller. 1298 * on the assumption that OOM is serialized for memory controller.
1299 * If this assumption is broken, revisit this code. 1299 * If this assumption is broken, revisit this code.
1300 */ 1300 */
1301 static char memcg_name[PATH_MAX]; 1301 static char memcg_name[PATH_MAX];
1302 int ret; 1302 int ret;
1303 1303
1304 if (!memcg || !p) 1304 if (!memcg || !p)
1305 return; 1305 return;
1306 1306
1307 1307
1308 rcu_read_lock(); 1308 rcu_read_lock();
1309 1309
1310 mem_cgrp = memcg->css.cgroup; 1310 mem_cgrp = memcg->css.cgroup;
1311 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); 1311 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1312 1312
1313 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); 1313 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1314 if (ret < 0) { 1314 if (ret < 0) {
1315 /* 1315 /*
1316 * Unfortunately, we are unable to convert to a useful name, 1316 * Unfortunately, we are unable to convert to a useful name,
1317 * but we'll still print out the usage information. 1317 * but we'll still print out the usage information.
1318 */ 1318 */
1319 rcu_read_unlock(); 1319 rcu_read_unlock();
1320 goto done; 1320 goto done;
1321 } 1321 }
1322 rcu_read_unlock(); 1322 rcu_read_unlock();
1323 1323
1324 printk(KERN_INFO "Task in %s killed", memcg_name); 1324 printk(KERN_INFO "Task in %s killed", memcg_name);
1325 1325
1326 rcu_read_lock(); 1326 rcu_read_lock();
1327 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1327 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1328 if (ret < 0) { 1328 if (ret < 0) {
1329 rcu_read_unlock(); 1329 rcu_read_unlock();
1330 goto done; 1330 goto done;
1331 } 1331 }
1332 rcu_read_unlock(); 1332 rcu_read_unlock();
1333 1333
1334 /* 1334 /*
1335 * Continues from above, so we don't need a KERN_ level 1335 * Continues from above, so we don't need a KERN_ level
1336 */ 1336 */
1337 printk(KERN_CONT " as a result of limit of %s\n", memcg_name); 1337 printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1338 done: 1338 done:
1339 1339
1340 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", 1340 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1341 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1341 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1342 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1342 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1343 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1343 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1344 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " 1344 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1345 "failcnt %llu\n", 1345 "failcnt %llu\n",
1346 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1346 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1347 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1347 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1348 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1348 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1349 } 1349 }
1350 1350
1351 /* 1351 /*
1352 * This function returns the number of memcgs under the hierarchy tree. Returns 1352 * This function returns the number of memcgs under the hierarchy tree. Returns
1353 * 1 (self count) if there are no children. 1353 * 1 (self count) if there are no children.
1354 */ 1354 */
1355 static int mem_cgroup_count_children(struct mem_cgroup *mem) 1355 static int mem_cgroup_count_children(struct mem_cgroup *mem)
1356 { 1356 {
1357 int num = 0; 1357 int num = 0;
1358 struct mem_cgroup *iter; 1358 struct mem_cgroup *iter;
1359 1359
1360 for_each_mem_cgroup_tree(iter, mem) 1360 for_each_mem_cgroup_tree(iter, mem)
1361 num++; 1361 num++;
1362 return num; 1362 return num;
1363 } 1363 }
1364 1364
1365 /* 1365 /*
1366 * Return the memory (and swap, if configured) limit for a memcg. 1366 * Return the memory (and swap, if configured) limit for a memcg.
1367 */ 1367 */
1368 u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1368 u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1369 { 1369 {
1370 u64 limit; 1370 u64 limit;
1371 u64 memsw; 1371 u64 memsw;
1372 1372
1373 limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1373 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1374 limit += total_swap_pages << PAGE_SHIFT; 1374 limit += total_swap_pages << PAGE_SHIFT;
1375 1375
1376 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1376 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1377 /* 1377 /*
1378 * If memsw is finite and limits the amount of swap space available 1378 * If memsw is finite and limits the amount of swap space available
1379 * to this memcg, return that limit. 1379 * to this memcg, return that limit.
1380 */ 1380 */
1381 return min(limit, memsw); 1381 return min(limit, memsw);
1382 } 1382 }
1383 1383
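The effective limit above is "memory limit plus all of swap", capped by the memsw limit when that is smaller. A tiny sketch of the same min(), with all three inputs supplied as hypothetical byte values:

/* Effective limit: memory limit + all swap, but never more than the memsw limit. */
static unsigned long long sketch_get_limit(unsigned long long mem_limit,
					   unsigned long long memsw_limit,
					   unsigned long long total_swap_bytes)
{
	unsigned long long limit = mem_limit + total_swap_bytes;

	return limit < memsw_limit ? limit : memsw_limit;
}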
1384 /* 1384 /*
1385 * Visit the first child (need not be the first child as per the ordering 1385 * Visit the first child (need not be the first child as per the ordering
1386 * of the cgroup list, since we track last_scanned_child) of @mem and use 1386 * of the cgroup list, since we track last_scanned_child) of @mem and use
1387 * that to reclaim free pages from. 1387 * that to reclaim free pages from.
1388 */ 1388 */
1389 static struct mem_cgroup * 1389 static struct mem_cgroup *
1390 mem_cgroup_select_victim(struct mem_cgroup *root_mem) 1390 mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1391 { 1391 {
1392 struct mem_cgroup *ret = NULL; 1392 struct mem_cgroup *ret = NULL;
1393 struct cgroup_subsys_state *css; 1393 struct cgroup_subsys_state *css;
1394 int nextid, found; 1394 int nextid, found;
1395 1395
1396 if (!root_mem->use_hierarchy) { 1396 if (!root_mem->use_hierarchy) {
1397 css_get(&root_mem->css); 1397 css_get(&root_mem->css);
1398 ret = root_mem; 1398 ret = root_mem;
1399 } 1399 }
1400 1400
1401 while (!ret) { 1401 while (!ret) {
1402 rcu_read_lock(); 1402 rcu_read_lock();
1403 nextid = root_mem->last_scanned_child + 1; 1403 nextid = root_mem->last_scanned_child + 1;
1404 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, 1404 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
1405 &found); 1405 &found);
1406 if (css && css_tryget(css)) 1406 if (css && css_tryget(css))
1407 ret = container_of(css, struct mem_cgroup, css); 1407 ret = container_of(css, struct mem_cgroup, css);
1408 1408
1409 rcu_read_unlock(); 1409 rcu_read_unlock();
1410 /* Updates scanning parameter */ 1410 /* Updates scanning parameter */
1411 if (!css) { 1411 if (!css) {
1412 /* this means start scan from ID:1 */ 1412 /* this means start scan from ID:1 */
1413 root_mem->last_scanned_child = 0; 1413 root_mem->last_scanned_child = 0;
1414 } else 1414 } else
1415 root_mem->last_scanned_child = found; 1415 root_mem->last_scanned_child = found;
1416 } 1416 }
1417 1417
1418 return ret; 1418 return ret;
1419 } 1419 }
1420 1420
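The victim selection above is a cursor-based round robin over the hierarchy: continue from last_scanned_child and wrap to the start when the walk runs off the end. A simplified sketch over a flat array of children (the array index stands in for the css id walk; none of these names exist in memcontrol.c):

/* Round-robin pick over a hypothetical flat array of child group ids. */
struct sketch_root {
	int nr_children;
	int last_scanned;	/* index of the child we visited last time */
};

static int select_victim(struct sketch_root *root)
{
	int next = root->last_scanned + 1;

	if (next >= root->nr_children)
		next = 0;	/* ran off the end: restart the scan */
	root->last_scanned = next;
	return next;
}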
1421 /* 1421 /*
1422 * Scan the hierarchy if needed to reclaim memory. We remember the last child 1422 * Scan the hierarchy if needed to reclaim memory. We remember the last child
1423 * we reclaimed from, so that we don't end up penalizing one child extensively 1423 * we reclaimed from, so that we don't end up penalizing one child extensively
1424 * based on its position in the children list. 1424 * based on its position in the children list.
1425 * 1425 *
1426 * root_mem is the original ancestor that we've been reclaiming from. 1426 * root_mem is the original ancestor that we've been reclaiming from.
1427 * 1427 *
1428 * We give up and return to the caller when we visit root_mem twice. 1428 * We give up and return to the caller when we visit root_mem twice.
1429 * (other groups can be removed while we're walking....) 1429 * (other groups can be removed while we're walking....)
1430 * 1430 *
1431 * If shrink==true, this returns immediately to avoid freeing too much. 1431 * If shrink==true, this returns immediately to avoid freeing too much.
1432 */ 1432 */
1433 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 1433 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1434 struct zone *zone, 1434 struct zone *zone,
1435 gfp_t gfp_mask, 1435 gfp_t gfp_mask,
1436 unsigned long reclaim_options, 1436 unsigned long reclaim_options,
1437 unsigned long *total_scanned) 1437 unsigned long *total_scanned)
1438 { 1438 {
1439 struct mem_cgroup *victim; 1439 struct mem_cgroup *victim;
1440 int ret, total = 0; 1440 int ret, total = 0;
1441 int loop = 0; 1441 int loop = 0;
1442 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; 1442 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1443 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; 1443 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1444 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; 1444 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1445 unsigned long excess; 1445 unsigned long excess;
1446 unsigned long nr_scanned; 1446 unsigned long nr_scanned;
1447 1447
1448 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; 1448 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
1449 1449
1450 /* If memsw_is_minimum==1, swap-out is of no use. */ 1450 /* If memsw_is_minimum==1, swap-out is of no use. */
1451 if (root_mem->memsw_is_minimum) 1451 if (root_mem->memsw_is_minimum)
1452 noswap = true; 1452 noswap = true;
1453 1453
1454 while (1) { 1454 while (1) {
1455 victim = mem_cgroup_select_victim(root_mem); 1455 victim = mem_cgroup_select_victim(root_mem);
1456 if (victim == root_mem) { 1456 if (victim == root_mem) {
1457 loop++; 1457 loop++;
1458 if (loop >= 1) 1458 if (loop >= 1)
1459 drain_all_stock_async(); 1459 drain_all_stock_async();
1460 if (loop >= 2) { 1460 if (loop >= 2) {
1461 /* 1461 /*
1462 * If we have not been able to reclaim 1462 * If we have not been able to reclaim
1463 * anything, it might be because there are 1463 * anything, it might be because there are
1464 * no reclaimable pages under this hierarchy. 1464 * no reclaimable pages under this hierarchy.
1465 */ 1465 */
1466 if (!check_soft || !total) { 1466 if (!check_soft || !total) {
1467 css_put(&victim->css); 1467 css_put(&victim->css);
1468 break; 1468 break;
1469 } 1469 }
1470 /* 1470 /*
1471 * We want to do more targeted reclaim. 1471 * We want to do more targeted reclaim.
1472 * excess >> 2 is not so excessive that we 1472 * excess >> 2 is not so excessive that we
1473 * reclaim too much, nor so small that we keep 1473 * reclaim too much, nor so small that we keep
1474 * coming back to reclaim from this cgroup. 1474 * coming back to reclaim from this cgroup.
1475 */ 1475 */
1476 if (total >= (excess >> 2) || 1476 if (total >= (excess >> 2) ||
1477 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { 1477 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1478 css_put(&victim->css); 1478 css_put(&victim->css);
1479 break; 1479 break;
1480 } 1480 }
1481 } 1481 }
1482 } 1482 }
1483 if (!mem_cgroup_local_usage(victim)) { 1483 if (!mem_cgroup_local_usage(victim)) {
1484 /* this cgroup's local usage == 0 */ 1484 /* this cgroup's local usage == 0 */
1485 css_put(&victim->css); 1485 css_put(&victim->css);
1486 continue; 1486 continue;
1487 } 1487 }
1488 /* we use swappiness of local cgroup */ 1488 /* we use swappiness of local cgroup */
1489 if (check_soft) { 1489 if (check_soft) {
1490 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1490 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1491 noswap, get_swappiness(victim), zone, 1491 noswap, get_swappiness(victim), zone,
1492 &nr_scanned); 1492 &nr_scanned);
1493 *total_scanned += nr_scanned; 1493 *total_scanned += nr_scanned;
1494 } else 1494 } else
1495 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1495 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1496 noswap, get_swappiness(victim)); 1496 noswap, get_swappiness(victim));
1497 css_put(&victim->css); 1497 css_put(&victim->css);
1498 /* 1498 /*
1499 * When shrinking usage, we can't check whether we should stop here or 1499 * When shrinking usage, we can't check whether we should stop here or
1500 * reclaim more. It depends on the callers. last_scanned_child 1500 * reclaim more. It depends on the callers. last_scanned_child
1501 * will be enough to keep fairness under the tree. 1501 * will be enough to keep fairness under the tree.
1502 */ 1502 */
1503 if (shrink) 1503 if (shrink)
1504 return ret; 1504 return ret;
1505 total += ret; 1505 total += ret;
1506 if (check_soft) { 1506 if (check_soft) {
1507 if (!res_counter_soft_limit_excess(&root_mem->res)) 1507 if (!res_counter_soft_limit_excess(&root_mem->res))
1508 return total; 1508 return total;
1509 } else if (mem_cgroup_margin(root_mem)) 1509 } else if (mem_cgroup_margin(root_mem))
1510 return 1 + total; 1510 return 1 + total;
1511 } 1511 }
1512 return total; 1512 return total;
1513 } 1513 }
1514 1514
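For the soft-limit case, the loop above keeps reclaiming until roughly a quarter of the excess is gone or a maximum number of passes is reached. The termination heuristic in isolation, with a hypothetical reclaim_once() stub standing in for the per-victim shrink call:

#define SKETCH_MAX_RECLAIM_LOOPS 100	/* stands in for MEM_CGROUP_MAX_RECLAIM_LOOPS */

/* Hypothetical per-victim reclaim step: pretend each pass frees 32 pages. */
static unsigned long reclaim_once(void)
{
	return 32;
}

static unsigned long sketch_soft_limit_reclaim(unsigned long excess)
{
	unsigned long total = 0;
	int loop;

	for (loop = 0; loop <= SKETCH_MAX_RECLAIM_LOOPS; loop++) {
		total += reclaim_once();
		/* enough progress: roughly a quarter of the excess reclaimed */
		if (total >= (excess >> 2))
			break;
	}
	return total;
}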
1515 /* 1515 /*
1516 * Check whether the OOM-Killer is already running under our hierarchy. 1516 * Check whether the OOM-Killer is already running under our hierarchy.
1517 * If someone is running it, return false. 1517 * If someone is running it, return false.
1518 */ 1518 */
1519 static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) 1519 static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1520 { 1520 {
1521 int x, lock_count = 0; 1521 int x, lock_count = 0;
1522 struct mem_cgroup *iter; 1522 struct mem_cgroup *iter;
1523 1523
1524 for_each_mem_cgroup_tree(iter, mem) { 1524 for_each_mem_cgroup_tree(iter, mem) {
1525 x = atomic_inc_return(&iter->oom_lock); 1525 x = atomic_inc_return(&iter->oom_lock);
1526 lock_count = max(x, lock_count); 1526 lock_count = max(x, lock_count);
1527 } 1527 }
1528 1528
1529 if (lock_count == 1) 1529 if (lock_count == 1)
1530 return true; 1530 return true;
1531 return false; 1531 return false;
1532 } 1532 }
1533 1533
1534 static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) 1534 static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1535 { 1535 {
1536 struct mem_cgroup *iter; 1536 struct mem_cgroup *iter;
1537 1537
1538 /* 1538 /*
1539 * When a new child is created while the hierarchy is under oom, 1539 * When a new child is created while the hierarchy is under oom,
1540 * mem_cgroup_oom_lock() may not be called. We have to use 1540 * mem_cgroup_oom_lock() may not be called. We have to use
1541 * atomic_add_unless() here. 1541 * atomic_add_unless() here.
1542 */ 1542 */
1543 for_each_mem_cgroup_tree(iter, mem) 1543 for_each_mem_cgroup_tree(iter, mem)
1544 atomic_add_unless(&iter->oom_lock, -1, 0); 1544 atomic_add_unless(&iter->oom_lock, -1, 0);
1545 return 0; 1545 return 0;
1546 } 1546 }
1547 1547
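The OOM lock above is a counter on every group in the subtree; the hierarchy counts as locked only when the largest post-increment value is exactly 1, i.e. nobody else had raised any of the counters. A sketch with plain ints instead of atomics (it shows the bookkeeping, not the concurrency):

/* "Lock" a whole subtree: succeed only if no counter was already raised. */
static int sketch_oom_lock(int *oom_lock, int nr_groups)
{
	int i, max_count = 0;

	for (i = 0; i < nr_groups; i++) {
		oom_lock[i]++;
		if (oom_lock[i] > max_count)
			max_count = oom_lock[i];
	}
	return max_count == 1;	/* true only for the first locker */
}

static void sketch_oom_unlock(int *oom_lock, int nr_groups)
{
	int i;

	for (i = 0; i < nr_groups; i++)
		if (oom_lock[i] > 0)	/* mirrors atomic_add_unless(.., -1, 0) */
			oom_lock[i]--;
}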
1548 1548
1549 static DEFINE_MUTEX(memcg_oom_mutex); 1549 static DEFINE_MUTEX(memcg_oom_mutex);
1550 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1550 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1551 1551
1552 struct oom_wait_info { 1552 struct oom_wait_info {
1553 struct mem_cgroup *mem; 1553 struct mem_cgroup *mem;
1554 wait_queue_t wait; 1554 wait_queue_t wait;
1555 }; 1555 };
1556 1556
1557 static int memcg_oom_wake_function(wait_queue_t *wait, 1557 static int memcg_oom_wake_function(wait_queue_t *wait,
1558 unsigned mode, int sync, void *arg) 1558 unsigned mode, int sync, void *arg)
1559 { 1559 {
1560 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; 1560 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
1561 struct oom_wait_info *oom_wait_info; 1561 struct oom_wait_info *oom_wait_info;
1562 1562
1563 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1563 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1564 1564
1565 if (oom_wait_info->mem == wake_mem) 1565 if (oom_wait_info->mem == wake_mem)
1566 goto wakeup; 1566 goto wakeup;
1567 /* if no hierarchy, no match */ 1567 /* if no hierarchy, no match */
1568 if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) 1568 if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
1569 return 0; 1569 return 0;
1570 /* 1570 /*
1571 * Both of oom_wait_info->mem and wake_mem are stable under us. 1571 * Both of oom_wait_info->mem and wake_mem are stable under us.
1572 * Then we can use css_is_ancestor without taking care of RCU. 1572 * Then we can use css_is_ancestor without taking care of RCU.
1573 */ 1573 */
1574 if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && 1574 if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
1575 !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) 1575 !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
1576 return 0; 1576 return 0;
1577 1577
1578 wakeup: 1578 wakeup:
1579 return autoremove_wake_function(wait, mode, sync, arg); 1579 return autoremove_wake_function(wait, mode, sync, arg);
1580 } 1580 }
1581 1581
1582 static void memcg_wakeup_oom(struct mem_cgroup *mem) 1582 static void memcg_wakeup_oom(struct mem_cgroup *mem)
1583 { 1583 {
1584 /* for filtering, pass "mem" as argument. */ 1584 /* for filtering, pass "mem" as argument. */
1585 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); 1585 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
1586 } 1586 }
1587 1587
1588 static void memcg_oom_recover(struct mem_cgroup *mem) 1588 static void memcg_oom_recover(struct mem_cgroup *mem)
1589 { 1589 {
1590 if (mem && atomic_read(&mem->oom_lock)) 1590 if (mem && atomic_read(&mem->oom_lock))
1591 memcg_wakeup_oom(mem); 1591 memcg_wakeup_oom(mem);
1592 } 1592 }
1593 1593
1594 /* 1594 /*
1595 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 1595 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
1596 */ 1596 */
1597 bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) 1597 bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1598 { 1598 {
1599 struct oom_wait_info owait; 1599 struct oom_wait_info owait;
1600 bool locked, need_to_kill; 1600 bool locked, need_to_kill;
1601 1601
1602 owait.mem = mem; 1602 owait.mem = mem;
1603 owait.wait.flags = 0; 1603 owait.wait.flags = 0;
1604 owait.wait.func = memcg_oom_wake_function; 1604 owait.wait.func = memcg_oom_wake_function;
1605 owait.wait.private = current; 1605 owait.wait.private = current;
1606 INIT_LIST_HEAD(&owait.wait.task_list); 1606 INIT_LIST_HEAD(&owait.wait.task_list);
1607 need_to_kill = true; 1607 need_to_kill = true;
1608 /* At first, try to OOM lock hierarchy under mem.*/ 1608 /* At first, try to OOM lock hierarchy under mem.*/
1609 mutex_lock(&memcg_oom_mutex); 1609 mutex_lock(&memcg_oom_mutex);
1610 locked = mem_cgroup_oom_lock(mem); 1610 locked = mem_cgroup_oom_lock(mem);
1611 /* 1611 /*
1612 * Even if signal_pending(), we can't quit charge() loop without 1612 * Even if signal_pending(), we can't quit charge() loop without
1613 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL 1613 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
1614 * under OOM is always welcomed, use TASK_KILLABLE here. 1614 * under OOM is always welcomed, use TASK_KILLABLE here.
1615 */ 1615 */
1616 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1616 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1617 if (!locked || mem->oom_kill_disable) 1617 if (!locked || mem->oom_kill_disable)
1618 need_to_kill = false; 1618 need_to_kill = false;
1619 if (locked) 1619 if (locked)
1620 mem_cgroup_oom_notify(mem); 1620 mem_cgroup_oom_notify(mem);
1621 mutex_unlock(&memcg_oom_mutex); 1621 mutex_unlock(&memcg_oom_mutex);
1622 1622
1623 if (need_to_kill) { 1623 if (need_to_kill) {
1624 finish_wait(&memcg_oom_waitq, &owait.wait); 1624 finish_wait(&memcg_oom_waitq, &owait.wait);
1625 mem_cgroup_out_of_memory(mem, mask); 1625 mem_cgroup_out_of_memory(mem, mask);
1626 } else { 1626 } else {
1627 schedule(); 1627 schedule();
1628 finish_wait(&memcg_oom_waitq, &owait.wait); 1628 finish_wait(&memcg_oom_waitq, &owait.wait);
1629 } 1629 }
1630 mutex_lock(&memcg_oom_mutex); 1630 mutex_lock(&memcg_oom_mutex);
1631 mem_cgroup_oom_unlock(mem); 1631 mem_cgroup_oom_unlock(mem);
1632 memcg_wakeup_oom(mem); 1632 memcg_wakeup_oom(mem);
1633 mutex_unlock(&memcg_oom_mutex); 1633 mutex_unlock(&memcg_oom_mutex);
1634 1634
1635 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 1635 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1636 return false; 1636 return false;
1637 /* Give chance to dying process */ 1637 /* Give chance to dying process */
1638 schedule_timeout(1); 1638 schedule_timeout(1);
1639 return true; 1639 return true;
1640 } 1640 }
1641 1641
1642 /* 1642 /*
1643 * Currently used to update mapped file statistics, but the routine can be 1643 * Currently used to update mapped file statistics, but the routine can be
1644 * generalized to update other statistics as well. 1644 * generalized to update other statistics as well.
1645 * 1645 *
1646 * Notes: Race condition 1646 * Notes: Race condition
1647 * 1647 *
1648 * We usually use page_cgroup_lock() for accessing page_cgroup members, but 1648 * We usually use page_cgroup_lock() for accessing page_cgroup members, but
1649 * it tends to be costly. Under some conditions, we don't need 1649 * it tends to be costly. Under some conditions, we don't need
1650 * to do so _always_. 1650 * to do so _always_.
1651 * 1651 *
1652 * Considering "charge", lock_page_cgroup() is not required because all 1652 * Considering "charge", lock_page_cgroup() is not required because all
1653 * file-stat operations happen after a page is attached to radix-tree. There 1653 * file-stat operations happen after a page is attached to radix-tree. There
1654 * are no races with "charge". 1654 * are no races with "charge".
1655 * 1655 *
1656 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup 1656 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
1657 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even 1657 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
1658 * if there are races with "uncharge". The statistics themselves are properly 1658 * if there are races with "uncharge". The statistics themselves are properly
1659 * handled by flags. 1659 * handled by flags.
1660 * 1660 *
1661 * Considering "move", this is the only case where we see a race. To keep the 1661 * Considering "move", this is the only case where we see a race. To keep the
1662 * race window small, we check the MEM_CGROUP_ON_MOVE percpu value and detect 1662 * race window small, we check the MEM_CGROUP_ON_MOVE percpu value and detect
1663 * whether there is a possibility of a race. If there is, we take a lock. 1663 * whether there is a possibility of a race. If there is, we take a lock.
1664 */ 1664 */
1665 1665
1666 void mem_cgroup_update_page_stat(struct page *page, 1666 void mem_cgroup_update_page_stat(struct page *page,
1667 enum mem_cgroup_page_stat_item idx, int val) 1667 enum mem_cgroup_page_stat_item idx, int val)
1668 { 1668 {
1669 struct mem_cgroup *mem; 1669 struct mem_cgroup *mem;
1670 struct page_cgroup *pc = lookup_page_cgroup(page); 1670 struct page_cgroup *pc = lookup_page_cgroup(page);
1671 bool need_unlock = false; 1671 bool need_unlock = false;
1672 unsigned long uninitialized_var(flags); 1672 unsigned long uninitialized_var(flags);
1673 1673
1674 if (unlikely(!pc)) 1674 if (unlikely(!pc))
1675 return; 1675 return;
1676 1676
1677 rcu_read_lock(); 1677 rcu_read_lock();
1678 mem = pc->mem_cgroup; 1678 mem = pc->mem_cgroup;
1679 if (unlikely(!mem || !PageCgroupUsed(pc))) 1679 if (unlikely(!mem || !PageCgroupUsed(pc)))
1680 goto out; 1680 goto out;
1681 /* pc->mem_cgroup is unstable ? */ 1681 /* pc->mem_cgroup is unstable ? */
1682 if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { 1682 if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
1683 /* take a lock against to access pc->mem_cgroup */ 1683 /* take a lock against to access pc->mem_cgroup */
1684 move_lock_page_cgroup(pc, &flags); 1684 move_lock_page_cgroup(pc, &flags);
1685 need_unlock = true; 1685 need_unlock = true;
1686 mem = pc->mem_cgroup; 1686 mem = pc->mem_cgroup;
1687 if (!mem || !PageCgroupUsed(pc)) 1687 if (!mem || !PageCgroupUsed(pc))
1688 goto out; 1688 goto out;
1689 } 1689 }
1690 1690
1691 switch (idx) { 1691 switch (idx) {
1692 case MEMCG_NR_FILE_MAPPED: 1692 case MEMCG_NR_FILE_MAPPED:
1693 if (val > 0) 1693 if (val > 0)
1694 SetPageCgroupFileMapped(pc); 1694 SetPageCgroupFileMapped(pc);
1695 else if (!page_mapped(page)) 1695 else if (!page_mapped(page))
1696 ClearPageCgroupFileMapped(pc); 1696 ClearPageCgroupFileMapped(pc);
1697 idx = MEM_CGROUP_STAT_FILE_MAPPED; 1697 idx = MEM_CGROUP_STAT_FILE_MAPPED;
1698 break; 1698 break;
1699 default: 1699 default:
1700 BUG(); 1700 BUG();
1701 } 1701 }
1702 1702
1703 this_cpu_add(mem->stat->count[idx], val); 1703 this_cpu_add(mem->stat->count[idx], val);
1704 1704
1705 out: 1705 out:
1706 if (unlikely(need_unlock)) 1706 if (unlikely(need_unlock))
1707 move_unlock_page_cgroup(pc, &flags); 1707 move_unlock_page_cgroup(pc, &flags);
1708 rcu_read_unlock(); 1708 rcu_read_unlock();
1709 return; 1709 return;
1710 } 1710 }
1711 EXPORT_SYMBOL(mem_cgroup_update_page_stat); 1711 EXPORT_SYMBOL(mem_cgroup_update_page_stat);
1712 1712
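In short: update the counter locklessly unless a charge move might be in flight (or the page is huge), in which case take the move lock first. A rough sketch of that decision, with hypothetical stand-ins for the percpu MEM_CGROUP_ON_MOVE check and the page_cgroup move lock:

#include <stdbool.h>

/* Hypothetical stand-ins: a stat counter plus a "move in progress" flag. */
struct sketch_page_stat {
	long file_mapped;
	bool move_in_progress;	/* plays the role of MEM_CGROUP_ON_MOVE > 0 */
	bool huge_page;
};

static void lock_stat(struct sketch_page_stat *s)   { (void)s; /* take the move lock */ }
static void unlock_stat(struct sketch_page_stat *s) { (void)s; /* drop the move lock */ }

static void sketch_update_page_stat(struct sketch_page_stat *s, int val)
{
	bool need_unlock = false;

	/* slow path only when the owner might change under us */
	if (s->move_in_progress || s->huge_page) {
		lock_stat(s);
		need_unlock = true;
	}
	s->file_mapped += val;
	if (need_unlock)
		unlock_stat(s);
}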
1713 /* 1713 /*
1714 * size of first charge trial. "32" comes from vmscan.c's magic value. 1714 * size of first charge trial. "32" comes from vmscan.c's magic value.
1715 * TODO: it may be necessary to use bigger numbers on big iron. 1715 * TODO: it may be necessary to use bigger numbers on big iron.
1716 */ 1716 */
1717 #define CHARGE_BATCH 32U 1717 #define CHARGE_BATCH 32U
1718 struct memcg_stock_pcp { 1718 struct memcg_stock_pcp {
1719 struct mem_cgroup *cached; /* this never be root cgroup */ 1719 struct mem_cgroup *cached; /* this never be root cgroup */
1720 unsigned int nr_pages; 1720 unsigned int nr_pages;
1721 struct work_struct work; 1721 struct work_struct work;
1722 }; 1722 };
1723 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 1723 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1724 static atomic_t memcg_drain_count; 1724 static atomic_t memcg_drain_count;
1725 1725
1726 /* 1726 /*
1727 * Try to consume stocked charge on this cpu. If successful, one page is consumed 1727 * Try to consume stocked charge on this cpu. If successful, one page is consumed
1728 * from the local stock and true is returned. If the stock is 0 or holds charges from a 1728 * from the local stock and true is returned. If the stock is 0 or holds charges from a
1729 * cgroup which is not the current target, returns false. This stock will be 1729 * cgroup which is not the current target, returns false. This stock will be
1730 * refilled. 1730 * refilled.
1731 */ 1731 */
1732 static bool consume_stock(struct mem_cgroup *mem) 1732 static bool consume_stock(struct mem_cgroup *mem)
1733 { 1733 {
1734 struct memcg_stock_pcp *stock; 1734 struct memcg_stock_pcp *stock;
1735 bool ret = true; 1735 bool ret = true;
1736 1736
1737 stock = &get_cpu_var(memcg_stock); 1737 stock = &get_cpu_var(memcg_stock);
1738 if (mem == stock->cached && stock->nr_pages) 1738 if (mem == stock->cached && stock->nr_pages)
1739 stock->nr_pages--; 1739 stock->nr_pages--;
1740 else /* need to call res_counter_charge */ 1740 else /* need to call res_counter_charge */
1741 ret = false; 1741 ret = false;
1742 put_cpu_var(memcg_stock); 1742 put_cpu_var(memcg_stock);
1743 return ret; 1743 return ret;
1744 } 1744 }
1745 1745
1746 /* 1746 /*
1747 * Return stock cached in the percpu area to the res_counter and reset the cached information. 1747 * Return stock cached in the percpu area to the res_counter and reset the cached information.
1748 */ 1748 */
1749 static void drain_stock(struct memcg_stock_pcp *stock) 1749 static void drain_stock(struct memcg_stock_pcp *stock)
1750 { 1750 {
1751 struct mem_cgroup *old = stock->cached; 1751 struct mem_cgroup *old = stock->cached;
1752 1752
1753 if (stock->nr_pages) { 1753 if (stock->nr_pages) {
1754 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 1754 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
1755 1755
1756 res_counter_uncharge(&old->res, bytes); 1756 res_counter_uncharge(&old->res, bytes);
1757 if (do_swap_account) 1757 if (do_swap_account)
1758 res_counter_uncharge(&old->memsw, bytes); 1758 res_counter_uncharge(&old->memsw, bytes);
1759 stock->nr_pages = 0; 1759 stock->nr_pages = 0;
1760 } 1760 }
1761 stock->cached = NULL; 1761 stock->cached = NULL;
1762 } 1762 }
1763 1763
1764 /* 1764 /*
1765 * This must be called with preemption disabled or must be called by 1765 * This must be called with preemption disabled or must be called by
1766 * a thread which is pinned to the local cpu. 1766 * a thread which is pinned to the local cpu.
1767 */ 1767 */
1768 static void drain_local_stock(struct work_struct *dummy) 1768 static void drain_local_stock(struct work_struct *dummy)
1769 { 1769 {
1770 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 1770 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
1771 drain_stock(stock); 1771 drain_stock(stock);
1772 } 1772 }
1773 1773
1774 /* 1774 /*
1775 * Cache charges (nr_pages), taken from the res_counter, in the local per-cpu area. 1775 * Cache charges (nr_pages), taken from the res_counter, in the local per-cpu area.
1776 * They will be consumed by the consume_stock() function later. 1776 * They will be consumed by the consume_stock() function later.
1777 */ 1777 */
1778 static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) 1778 static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
1779 { 1779 {
1780 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 1780 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
1781 1781
1782 if (stock->cached != mem) { /* reset if necessary */ 1782 if (stock->cached != mem) { /* reset if necessary */
1783 drain_stock(stock); 1783 drain_stock(stock);
1784 stock->cached = mem; 1784 stock->cached = mem;
1785 } 1785 }
1786 stock->nr_pages += nr_pages; 1786 stock->nr_pages += nr_pages;
1787 put_cpu_var(memcg_stock); 1787 put_cpu_var(memcg_stock);
1788 } 1788 }
1789 1789
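The stock machinery caches pre-charged pages per cpu so the hot path can avoid the shared res_counter. A single-threaded user-space sketch of the consume/refill/drain cycle, with one global stock and a plain counter standing in for the res_counter (none of these names are from memcontrol.c):

#include <stdbool.h>

/* Hypothetical single "stock": cached owner and a count of pre-charged pages. */
struct sketch_stock {
	int cached_cgroup;	/* id of the cgroup the stock belongs to */
	unsigned int nr_pages;
};

static unsigned long global_usage;	/* stand-in for the shared res_counter */

static bool sketch_consume_stock(struct sketch_stock *stock, int cgroup)
{
	if (cgroup == stock->cached_cgroup && stock->nr_pages) {
		stock->nr_pages--;	/* fast path: no shared counter touched */
		return true;
	}
	return false;			/* caller must charge the shared counter */
}

static void sketch_drain_stock(struct sketch_stock *stock)
{
	global_usage -= stock->nr_pages;	/* give cached charges back */
	stock->nr_pages = 0;
	stock->cached_cgroup = -1;
}

static void sketch_refill_stock(struct sketch_stock *stock, int cgroup,
				unsigned int nr_pages)
{
	if (stock->cached_cgroup != cgroup)	/* reset if owned by someone else */
		sketch_drain_stock(stock);
	stock->cached_cgroup = cgroup;
	stock->nr_pages += nr_pages;	/* pages were already charged by the caller */
}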
1790 /* 1790 /*
1791 * Tries to drain stocked charges on other cpus. This function is asynchronous 1791 * Tries to drain stocked charges on other cpus. This function is asynchronous
1792 * and just puts a work item per cpu for draining locally on each cpu. Callers can 1792 * and just puts a work item per cpu for draining locally on each cpu. Callers can
1793 * expect some charges to be returned to the res_counter later but cannot wait for 1793 * expect some charges to be returned to the res_counter later but cannot wait for
1794 * it. 1794 * it.
1795 */ 1795 */
1796 static void drain_all_stock_async(void) 1796 static void drain_all_stock_async(void)
1797 { 1797 {
1798 int cpu; 1798 int cpu;
1799 /* This function is for scheduling "drain" in an asynchronous way. 1799 /* This function is for scheduling "drain" in an asynchronous way.
1800 * The result of "drain" is not directly handled by callers. Thus, 1800 * The result of "drain" is not directly handled by callers. Thus,
1801 * if someone is already calling drain, we don't have to call drain again. 1801 * if someone is already calling drain, we don't have to call drain again.
1802 * Anyway, the WORK_STRUCT_PENDING check in queue_work_on() will catch it if 1802 * Anyway, the WORK_STRUCT_PENDING check in queue_work_on() will catch it if
1803 * there is a race. We just do a loose check here. 1803 * there is a race. We just do a loose check here.
1804 */ 1804 */
1805 if (atomic_read(&memcg_drain_count)) 1805 if (atomic_read(&memcg_drain_count))
1806 return; 1806 return;
1807 /* Notify other cpus that system-wide "drain" is running */ 1807 /* Notify other cpus that system-wide "drain" is running */
1808 atomic_inc(&memcg_drain_count); 1808 atomic_inc(&memcg_drain_count);
1809 get_online_cpus(); 1809 get_online_cpus();
1810 for_each_online_cpu(cpu) { 1810 for_each_online_cpu(cpu) {
1811 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 1811 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1812 schedule_work_on(cpu, &stock->work); 1812 schedule_work_on(cpu, &stock->work);
1813 } 1813 }
1814 put_online_cpus(); 1814 put_online_cpus();
1815 atomic_dec(&memcg_drain_count); 1815 atomic_dec(&memcg_drain_count);
1816 /* We don't wait for flush_work */ 1816 /* We don't wait for flush_work */
1817 } 1817 }
1818 1818
1819 /* This is a synchronous drain interface. */ 1819 /* This is a synchronous drain interface. */
1820 static void drain_all_stock_sync(void) 1820 static void drain_all_stock_sync(void)
1821 { 1821 {
1822 /* called when force_empty is called */ 1822 /* called when force_empty is called */
1823 atomic_inc(&memcg_drain_count); 1823 atomic_inc(&memcg_drain_count);
1824 schedule_on_each_cpu(drain_local_stock); 1824 schedule_on_each_cpu(drain_local_stock);
1825 atomic_dec(&memcg_drain_count); 1825 atomic_dec(&memcg_drain_count);
1826 } 1826 }
1827 1827
1828 /* 1828 /*
1829 * This function drains the percpu counter values from a DEAD cpu and 1829 * This function drains the percpu counter values from a DEAD cpu and
1830 * moves them to the local cpu. Note that this function can be preempted. 1830 * moves them to the local cpu. Note that this function can be preempted.
1831 */ 1831 */
1832 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) 1832 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
1833 { 1833 {
1834 int i; 1834 int i;
1835 1835
1836 spin_lock(&mem->pcp_counter_lock); 1836 spin_lock(&mem->pcp_counter_lock);
1837 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { 1837 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
1838 long x = per_cpu(mem->stat->count[i], cpu); 1838 long x = per_cpu(mem->stat->count[i], cpu);
1839 1839
1840 per_cpu(mem->stat->count[i], cpu) = 0; 1840 per_cpu(mem->stat->count[i], cpu) = 0;
1841 mem->nocpu_base.count[i] += x; 1841 mem->nocpu_base.count[i] += x;
1842 } 1842 }
1843 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 1843 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
1844 unsigned long x = per_cpu(mem->stat->events[i], cpu); 1844 unsigned long x = per_cpu(mem->stat->events[i], cpu);
1845 1845
1846 per_cpu(mem->stat->events[i], cpu) = 0; 1846 per_cpu(mem->stat->events[i], cpu) = 0;
1847 mem->nocpu_base.events[i] += x; 1847 mem->nocpu_base.events[i] += x;
1848 } 1848 }
1849 /* need to clear ON_MOVE value, works as a kind of lock. */ 1849 /* need to clear ON_MOVE value, works as a kind of lock. */
1850 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; 1850 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
1851 spin_unlock(&mem->pcp_counter_lock); 1851 spin_unlock(&mem->pcp_counter_lock);
1852 } 1852 }
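/*
 * A compilable userspace model of the fold above: when a "cpu" goes away its
 * private counters are zeroed and accumulated into a shared base under a
 * lock, so no events are lost. NCPU/NSTAT and the array layout are
 * illustrative stand-ins, not the kernel's data structures.
 */
#include <pthread.h>

#define NCPU	4
#define NSTAT	3

static long percpu_count[NCPU][NSTAT];
static long nocpu_base[NSTAT];
static pthread_mutex_t fold_lock = PTHREAD_MUTEX_INITIALIZER;

static void fold_dead_cpu(int cpu)
{
	int i;

	pthread_mutex_lock(&fold_lock);
	for (i = 0; i < NSTAT; i++) {
		nocpu_base[i] += percpu_count[cpu][i];	/* keep the totals */
		percpu_count[cpu][i] = 0;		/* dead cpu starts clean */
	}
	pthread_mutex_unlock(&fold_lock);
}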
1853 1853
1854 static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) 1854 static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
1855 { 1855 {
1856 int idx = MEM_CGROUP_ON_MOVE; 1856 int idx = MEM_CGROUP_ON_MOVE;
1857 1857
1858 spin_lock(&mem->pcp_counter_lock); 1858 spin_lock(&mem->pcp_counter_lock);
1859 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; 1859 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
1860 spin_unlock(&mem->pcp_counter_lock); 1860 spin_unlock(&mem->pcp_counter_lock);
1861 } 1861 }
1862 1862
1863 static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, 1863 static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
1864 unsigned long action, 1864 unsigned long action,
1865 void *hcpu) 1865 void *hcpu)
1866 { 1866 {
1867 int cpu = (unsigned long)hcpu; 1867 int cpu = (unsigned long)hcpu;
1868 struct memcg_stock_pcp *stock; 1868 struct memcg_stock_pcp *stock;
1869 struct mem_cgroup *iter; 1869 struct mem_cgroup *iter;
1870 1870
1871 if ((action == CPU_ONLINE)) { 1871 if ((action == CPU_ONLINE)) {
1872 for_each_mem_cgroup_all(iter) 1872 for_each_mem_cgroup_all(iter)
1873 synchronize_mem_cgroup_on_move(iter, cpu); 1873 synchronize_mem_cgroup_on_move(iter, cpu);
1874 return NOTIFY_OK; 1874 return NOTIFY_OK;
1875 } 1875 }
1876 1876
1877 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 1877 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1878 return NOTIFY_OK; 1878 return NOTIFY_OK;
1879 1879
1880 for_each_mem_cgroup_all(iter) 1880 for_each_mem_cgroup_all(iter)
1881 mem_cgroup_drain_pcp_counter(iter, cpu); 1881 mem_cgroup_drain_pcp_counter(iter, cpu);
1882 1882
1883 stock = &per_cpu(memcg_stock, cpu); 1883 stock = &per_cpu(memcg_stock, cpu);
1884 drain_stock(stock); 1884 drain_stock(stock);
1885 return NOTIFY_OK; 1885 return NOTIFY_OK;
1886 } 1886 }
1887 1887
1888 1888
1889 /* See __mem_cgroup_try_charge() for details */ 1889 /* See __mem_cgroup_try_charge() for details */
1890 enum { 1890 enum {
1891 CHARGE_OK, /* success */ 1891 CHARGE_OK, /* success */
1892 CHARGE_RETRY, /* need to retry but retry is not bad */ 1892 CHARGE_RETRY, /* need to retry but retry is not bad */
1893 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ 1893 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
1894 CHARGE_WOULDBLOCK, /* __GFP_WAIT wasn't set and not enough res. */ 1894 CHARGE_WOULDBLOCK, /* __GFP_WAIT wasn't set and not enough res. */
1895 CHARGE_OOM_DIE, /* the current task is killed because of OOM */ 1895 CHARGE_OOM_DIE, /* the current task is killed because of OOM */
1896 }; 1896 };
1897 1897
1898 static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, 1898 static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1899 unsigned int nr_pages, bool oom_check) 1899 unsigned int nr_pages, bool oom_check)
1900 { 1900 {
1901 unsigned long csize = nr_pages * PAGE_SIZE; 1901 unsigned long csize = nr_pages * PAGE_SIZE;
1902 struct mem_cgroup *mem_over_limit; 1902 struct mem_cgroup *mem_over_limit;
1903 struct res_counter *fail_res; 1903 struct res_counter *fail_res;
1904 unsigned long flags = 0; 1904 unsigned long flags = 0;
1905 int ret; 1905 int ret;
1906 1906
1907 ret = res_counter_charge(&mem->res, csize, &fail_res); 1907 ret = res_counter_charge(&mem->res, csize, &fail_res);
1908 1908
1909 if (likely(!ret)) { 1909 if (likely(!ret)) {
1910 if (!do_swap_account) 1910 if (!do_swap_account)
1911 return CHARGE_OK; 1911 return CHARGE_OK;
1912 ret = res_counter_charge(&mem->memsw, csize, &fail_res); 1912 ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1913 if (likely(!ret)) 1913 if (likely(!ret))
1914 return CHARGE_OK; 1914 return CHARGE_OK;
1915 1915
1916 res_counter_uncharge(&mem->res, csize); 1916 res_counter_uncharge(&mem->res, csize);
1917 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 1917 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
1918 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 1918 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1919 } else 1919 } else
1920 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 1920 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
1921 /* 1921 /*
1922 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch 1922 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
1923 * of regular pages (CHARGE_BATCH), or a single regular page (1). 1923 * of regular pages (CHARGE_BATCH), or a single regular page (1).
1924 * 1924 *
1925 * Never reclaim on behalf of optional batching, retry with a 1925 * Never reclaim on behalf of optional batching, retry with a
1926 * single page instead. 1926 * single page instead.
1927 */ 1927 */
1928 if (nr_pages == CHARGE_BATCH) 1928 if (nr_pages == CHARGE_BATCH)
1929 return CHARGE_RETRY; 1929 return CHARGE_RETRY;
1930 1930
1931 if (!(gfp_mask & __GFP_WAIT)) 1931 if (!(gfp_mask & __GFP_WAIT))
1932 return CHARGE_WOULDBLOCK; 1932 return CHARGE_WOULDBLOCK;
1933 1933
1934 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 1934 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1935 gfp_mask, flags, NULL); 1935 gfp_mask, flags, NULL);
1936 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 1936 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
1937 return CHARGE_RETRY; 1937 return CHARGE_RETRY;
1938 /* 1938 /*
1939 * Even though the limit is exceeded at this point, reclaim 1939 * Even though the limit is exceeded at this point, reclaim
1940 * may have been able to free some pages. Retry the charge 1940 * may have been able to free some pages. Retry the charge
1941 * before killing the task. 1941 * before killing the task.
1942 * 1942 *
1943 * Only for regular pages, though: huge pages are rather 1943 * Only for regular pages, though: huge pages are rather
1944 * unlikely to succeed so close to the limit, and we fall back 1944 * unlikely to succeed so close to the limit, and we fall back
1945 * to regular pages anyway in case of failure. 1945 * to regular pages anyway in case of failure.
1946 */ 1946 */
1947 if (nr_pages == 1 && ret) 1947 if (nr_pages == 1 && ret)
1948 return CHARGE_RETRY; 1948 return CHARGE_RETRY;
1949 1949
1950 /* 1950 /*
1951 * At task move, charge accounts can be doubly counted. So, it's 1951 * At task move, charge accounts can be doubly counted. So, it's
1952 * better to wait until the end of task_move if something is going on. 1952 * better to wait until the end of task_move if something is going on.
1953 */ 1953 */
1954 if (mem_cgroup_wait_acct_move(mem_over_limit)) 1954 if (mem_cgroup_wait_acct_move(mem_over_limit))
1955 return CHARGE_RETRY; 1955 return CHARGE_RETRY;
1956 1956
1957 /* If we don't need to invoke the oom-killer at all, return immediately */ 1957 /* If we don't need to invoke the oom-killer at all, return immediately */
1958 if (!oom_check) 1958 if (!oom_check)
1959 return CHARGE_NOMEM; 1959 return CHARGE_NOMEM;
1960 /* check OOM */ 1960 /* check OOM */
1961 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) 1961 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
1962 return CHARGE_OOM_DIE; 1962 return CHARGE_OOM_DIE;
1963 1963
1964 return CHARGE_RETRY; 1964 return CHARGE_RETRY;
1965 } 1965 }
1966 1966
1967 /* 1967 /*
1968 * Unlike the exported interface, an "oom" parameter is added. If oom==true, 1968 * Unlike the exported interface, an "oom" parameter is added. If oom==true,
1969 * oom-killer can be invoked. 1969 * oom-killer can be invoked.
1970 */ 1970 */
1971 static int __mem_cgroup_try_charge(struct mm_struct *mm, 1971 static int __mem_cgroup_try_charge(struct mm_struct *mm,
1972 gfp_t gfp_mask, 1972 gfp_t gfp_mask,
1973 unsigned int nr_pages, 1973 unsigned int nr_pages,
1974 struct mem_cgroup **memcg, 1974 struct mem_cgroup **memcg,
1975 bool oom) 1975 bool oom)
1976 { 1976 {
1977 unsigned int batch = max(CHARGE_BATCH, nr_pages); 1977 unsigned int batch = max(CHARGE_BATCH, nr_pages);
1978 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 1978 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1979 struct mem_cgroup *mem = NULL; 1979 struct mem_cgroup *mem = NULL;
1980 int ret; 1980 int ret;
1981 1981
1982 /* 1982 /*
1983 * Unlike the global VM's OOM kill, we are not under a system-wide 1983 * Unlike the global VM's OOM kill, we are not under a system-wide
1984 * memory shortage here, so let a dying process go ahead in addition 1984 * memory shortage here, so let a dying process go ahead in addition
1985 * to a MEMDIE process. 1985 * to a MEMDIE process.
1986 */ 1986 */
1987 if (unlikely(test_thread_flag(TIF_MEMDIE) 1987 if (unlikely(test_thread_flag(TIF_MEMDIE)
1988 || fatal_signal_pending(current))) 1988 || fatal_signal_pending(current)))
1989 goto bypass; 1989 goto bypass;
1990 1990
1991 /* 1991 /*
1992 * We always charge the cgroup the mm_struct belongs to. 1992 * We always charge the cgroup the mm_struct belongs to.
1993 * The mm_struct's mem_cgroup changes on task migration if the 1993 * The mm_struct's mem_cgroup changes on task migration if the
1994 * thread group leader migrates. It's possible that mm is not 1994 * thread group leader migrates. It's possible that mm is not
1995 * set, if so charge the init_mm (happens for pagecache usage). 1995 * set, if so charge the init_mm (happens for pagecache usage).
1996 */ 1996 */
1997 if (!*memcg && !mm) 1997 if (!*memcg && !mm)
1998 goto bypass; 1998 goto bypass;
1999 again: 1999 again:
2000 if (*memcg) { /* css should be a valid one */ 2000 if (*memcg) { /* css should be a valid one */
2001 mem = *memcg; 2001 mem = *memcg;
2002 VM_BUG_ON(css_is_removed(&mem->css)); 2002 VM_BUG_ON(css_is_removed(&mem->css));
2003 if (mem_cgroup_is_root(mem)) 2003 if (mem_cgroup_is_root(mem))
2004 goto done; 2004 goto done;
2005 if (nr_pages == 1 && consume_stock(mem)) 2005 if (nr_pages == 1 && consume_stock(mem))
2006 goto done; 2006 goto done;
2007 css_get(&mem->css); 2007 css_get(&mem->css);
2008 } else { 2008 } else {
2009 struct task_struct *p; 2009 struct task_struct *p;
2010 2010
2011 rcu_read_lock(); 2011 rcu_read_lock();
2012 p = rcu_dereference(mm->owner); 2012 p = rcu_dereference(mm->owner);
2013 /* 2013 /*
2014 * Because we don't have task_lock(), "p" can exit. 2014 * Because we don't have task_lock(), "p" can exit.
2015 * In that case, "mem" can point to root, or p can be NULL due to a 2015 * In that case, "mem" can point to root, or p can be NULL due to a
2016 * race with swapoff. Then we have a small risk of mis-accounting. 2016 * race with swapoff. Then we have a small risk of mis-accounting.
2017 * But such mis-accounting by race can always happen because we 2017 * But such mis-accounting by race can always happen because we
2018 * don't hold cgroup_mutex(). Avoiding it would be overkill, so we 2018 * don't hold cgroup_mutex(). Avoiding it would be overkill, so we
2019 * allow that small race here. 2019 * allow that small race here.
2020 * (*) swapoff et al. will charge against the mm_struct, not the 2020 * (*) swapoff et al. will charge against the mm_struct, not the
2021 * task_struct, so mm->owner can be NULL. 2021 * task_struct, so mm->owner can be NULL.
2022 */ 2022 */
2023 mem = mem_cgroup_from_task(p); 2023 mem = mem_cgroup_from_task(p);
2024 if (!mem || mem_cgroup_is_root(mem)) { 2024 if (!mem || mem_cgroup_is_root(mem)) {
2025 rcu_read_unlock(); 2025 rcu_read_unlock();
2026 goto done; 2026 goto done;
2027 } 2027 }
2028 if (nr_pages == 1 && consume_stock(mem)) { 2028 if (nr_pages == 1 && consume_stock(mem)) {
2029 /* 2029 /*
2030 * It seems dangerous to access memcg without css_get(). 2030 * It seems dangerous to access memcg without css_get().
2031 * But considering how consume_stock() works, it's not 2031 * But considering how consume_stock() works, it's not
2032 * necessary. If consume_stock() succeeds, some charges 2032 * necessary. If consume_stock() succeeds, some charges
2033 * from this memcg are cached on this cpu. So, we 2033 * from this memcg are cached on this cpu. So, we
2034 * don't need to call css_get()/css_tryget() before 2034 * don't need to call css_get()/css_tryget() before
2035 * calling consume_stock(). 2035 * calling consume_stock().
2036 */ 2036 */
2037 rcu_read_unlock(); 2037 rcu_read_unlock();
2038 goto done; 2038 goto done;
2039 } 2039 }
2040 /* after this point we may block, so we need to take a refcnt */ 2040 /* after this point we may block, so we need to take a refcnt */
2041 if (!css_tryget(&mem->css)) { 2041 if (!css_tryget(&mem->css)) {
2042 rcu_read_unlock(); 2042 rcu_read_unlock();
2043 goto again; 2043 goto again;
2044 } 2044 }
2045 rcu_read_unlock(); 2045 rcu_read_unlock();
2046 } 2046 }
2047 2047
2048 do { 2048 do {
2049 bool oom_check; 2049 bool oom_check;
2050 2050
2051 /* If killed, bypass charge */ 2051 /* If killed, bypass charge */
2052 if (fatal_signal_pending(current)) { 2052 if (fatal_signal_pending(current)) {
2053 css_put(&mem->css); 2053 css_put(&mem->css);
2054 goto bypass; 2054 goto bypass;
2055 } 2055 }
2056 2056
2057 oom_check = false; 2057 oom_check = false;
2058 if (oom && !nr_oom_retries) { 2058 if (oom && !nr_oom_retries) {
2059 oom_check = true; 2059 oom_check = true;
2060 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2060 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2061 } 2061 }
2062 2062
2063 ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); 2063 ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check);
2064 switch (ret) { 2064 switch (ret) {
2065 case CHARGE_OK: 2065 case CHARGE_OK:
2066 break; 2066 break;
2067 case CHARGE_RETRY: /* not in OOM situation but retry */ 2067 case CHARGE_RETRY: /* not in OOM situation but retry */
2068 batch = nr_pages; 2068 batch = nr_pages;
2069 css_put(&mem->css); 2069 css_put(&mem->css);
2070 mem = NULL; 2070 mem = NULL;
2071 goto again; 2071 goto again;
2072 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 2072 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
2073 css_put(&mem->css); 2073 css_put(&mem->css);
2074 goto nomem; 2074 goto nomem;
2075 case CHARGE_NOMEM: /* OOM routine works */ 2075 case CHARGE_NOMEM: /* OOM routine works */
2076 if (!oom) { 2076 if (!oom) {
2077 css_put(&mem->css); 2077 css_put(&mem->css);
2078 goto nomem; 2078 goto nomem;
2079 } 2079 }
2080 /* If oom, we never return -ENOMEM */ 2080 /* If oom, we never return -ENOMEM */
2081 nr_oom_retries--; 2081 nr_oom_retries--;
2082 break; 2082 break;
2083 case CHARGE_OOM_DIE: /* Killed by OOM Killer */ 2083 case CHARGE_OOM_DIE: /* Killed by OOM Killer */
2084 css_put(&mem->css); 2084 css_put(&mem->css);
2085 goto bypass; 2085 goto bypass;
2086 } 2086 }
2087 } while (ret != CHARGE_OK); 2087 } while (ret != CHARGE_OK);
2088 2088
2089 if (batch > nr_pages) 2089 if (batch > nr_pages)
2090 refill_stock(mem, batch - nr_pages); 2090 refill_stock(mem, batch - nr_pages);
2091 css_put(&mem->css); 2091 css_put(&mem->css);
2092 done: 2092 done:
2093 *memcg = mem; 2093 *memcg = mem;
2094 return 0; 2094 return 0;
2095 nomem: 2095 nomem:
2096 *memcg = NULL; 2096 *memcg = NULL;
2097 return -ENOMEM; 2097 return -ENOMEM;
2098 bypass: 2098 bypass:
2099 *memcg = NULL; 2099 *memcg = NULL;
2100 return 0; 2100 return 0;
2101 } 2101 }
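/*
 * The batching idea in __mem_cgroup_try_charge(), modelled in plain
 * single-threaded C: charge a whole batch against the shared counter, hand
 * out what the caller asked for, and park the surplus in a cached "stock" so
 * later single-page charges avoid the shared counter. BATCH, usage, limit and
 * stock are illustrative stand-ins, not kernel symbols.
 */
#include <stdbool.h>

#define BATCH	32UL

static unsigned long usage, limit = 1024;
static unsigned long stock;			/* cached, pre-charged pages */

static bool charge(unsigned long nr_pages)
{
	unsigned long want = nr_pages > BATCH ? nr_pages : BATCH;

	if (nr_pages == 1 && stock) {		/* consume_stock() fast path */
		stock--;
		return true;
	}
	if (usage + want > limit) {
		/* never fail on behalf of optional batching; retry with the
		 * exact request before giving up */
		if (usage + nr_pages > limit)
			return false;
		want = nr_pages;
	}
	usage += want;
	stock += want - nr_pages;		/* refill_stock() with the surplus */
	return true;
}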
2102 2102
2103 /* 2103 /*
2104 * Sometimes we have to undo a charge we got by try_charge(). 2104 * Sometimes we have to undo a charge we got by try_charge().
2105 * This function is for that: it uncharges and puts the css refcnt 2105 * This function is for that: it uncharges and puts the css refcnt
2106 * taken by try_charge(). 2106 * taken by try_charge().
2107 */ 2107 */
2108 static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, 2108 static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2109 unsigned int nr_pages) 2109 unsigned int nr_pages)
2110 { 2110 {
2111 if (!mem_cgroup_is_root(mem)) { 2111 if (!mem_cgroup_is_root(mem)) {
2112 unsigned long bytes = nr_pages * PAGE_SIZE; 2112 unsigned long bytes = nr_pages * PAGE_SIZE;
2113 2113
2114 res_counter_uncharge(&mem->res, bytes); 2114 res_counter_uncharge(&mem->res, bytes);
2115 if (do_swap_account) 2115 if (do_swap_account)
2116 res_counter_uncharge(&mem->memsw, bytes); 2116 res_counter_uncharge(&mem->memsw, bytes);
2117 } 2117 }
2118 } 2118 }
2119 2119
2120 /* 2120 /*
2121 * A helper function to get a mem_cgroup from an ID. Must be called under 2121 * A helper function to get a mem_cgroup from an ID. Must be called under
2122 * rcu_read_lock(). The caller must check css_is_removed() or similar if 2122 * rcu_read_lock(). The caller must check css_is_removed() or similar if
2123 * that is a concern. (Dropping a refcnt from swap can be called against a 2123 * that is a concern. (Dropping a refcnt from swap can be called against a
2124 * removed memcg.) 2124 * removed memcg.)
2125 */ 2125 */
2126 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2126 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2127 { 2127 {
2128 struct cgroup_subsys_state *css; 2128 struct cgroup_subsys_state *css;
2129 2129
2130 /* ID 0 is unused ID */ 2130 /* ID 0 is unused ID */
2131 if (!id) 2131 if (!id)
2132 return NULL; 2132 return NULL;
2133 css = css_lookup(&mem_cgroup_subsys, id); 2133 css = css_lookup(&mem_cgroup_subsys, id);
2134 if (!css) 2134 if (!css)
2135 return NULL; 2135 return NULL;
2136 return container_of(css, struct mem_cgroup, css); 2136 return container_of(css, struct mem_cgroup, css);
2137 } 2137 }
2138 2138
2139 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2139 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2140 { 2140 {
2141 struct mem_cgroup *mem = NULL; 2141 struct mem_cgroup *mem = NULL;
2142 struct page_cgroup *pc; 2142 struct page_cgroup *pc;
2143 unsigned short id; 2143 unsigned short id;
2144 swp_entry_t ent; 2144 swp_entry_t ent;
2145 2145
2146 VM_BUG_ON(!PageLocked(page)); 2146 VM_BUG_ON(!PageLocked(page));
2147 2147
2148 pc = lookup_page_cgroup(page); 2148 pc = lookup_page_cgroup(page);
2149 lock_page_cgroup(pc); 2149 lock_page_cgroup(pc);
2150 if (PageCgroupUsed(pc)) { 2150 if (PageCgroupUsed(pc)) {
2151 mem = pc->mem_cgroup; 2151 mem = pc->mem_cgroup;
2152 if (mem && !css_tryget(&mem->css)) 2152 if (mem && !css_tryget(&mem->css))
2153 mem = NULL; 2153 mem = NULL;
2154 } else if (PageSwapCache(page)) { 2154 } else if (PageSwapCache(page)) {
2155 ent.val = page_private(page); 2155 ent.val = page_private(page);
2156 id = lookup_swap_cgroup(ent); 2156 id = lookup_swap_cgroup(ent);
2157 rcu_read_lock(); 2157 rcu_read_lock();
2158 mem = mem_cgroup_lookup(id); 2158 mem = mem_cgroup_lookup(id);
2159 if (mem && !css_tryget(&mem->css)) 2159 if (mem && !css_tryget(&mem->css))
2160 mem = NULL; 2160 mem = NULL;
2161 rcu_read_unlock(); 2161 rcu_read_unlock();
2162 } 2162 }
2163 unlock_page_cgroup(pc); 2163 unlock_page_cgroup(pc);
2164 return mem; 2164 return mem;
2165 } 2165 }
2166 2166
2167 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 2167 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2168 struct page *page, 2168 struct page *page,
2169 unsigned int nr_pages, 2169 unsigned int nr_pages,
2170 struct page_cgroup *pc, 2170 struct page_cgroup *pc,
2171 enum charge_type ctype) 2171 enum charge_type ctype)
2172 { 2172 {
2173 lock_page_cgroup(pc); 2173 lock_page_cgroup(pc);
2174 if (unlikely(PageCgroupUsed(pc))) { 2174 if (unlikely(PageCgroupUsed(pc))) {
2175 unlock_page_cgroup(pc); 2175 unlock_page_cgroup(pc);
2176 __mem_cgroup_cancel_charge(mem, nr_pages); 2176 __mem_cgroup_cancel_charge(mem, nr_pages);
2177 return; 2177 return;
2178 } 2178 }
2179 /* 2179 /*
2180 * we don't need page_cgroup_lock for tail pages, because they are not 2180 * we don't need page_cgroup_lock for tail pages, because they are not
2181 * accessed by any other context at this point. 2181 * accessed by any other context at this point.
2182 */ 2182 */
2183 pc->mem_cgroup = mem; 2183 pc->mem_cgroup = mem;
2184 /* 2184 /*
2185 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2185 * We access a page_cgroup asynchronously without lock_page_cgroup().
2186 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 2186 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
2187 * is accessed after testing USED bit. To make pc->mem_cgroup visible 2187 * is accessed after testing USED bit. To make pc->mem_cgroup visible
2188 * before USED bit, we need memory barrier here. 2188 * before USED bit, we need memory barrier here.
2189 * See mem_cgroup_add_lru_list(), etc. 2189 * See mem_cgroup_add_lru_list(), etc.
2190 */ 2190 */
2191 smp_wmb(); 2191 smp_wmb();
2192 switch (ctype) { 2192 switch (ctype) {
2193 case MEM_CGROUP_CHARGE_TYPE_CACHE: 2193 case MEM_CGROUP_CHARGE_TYPE_CACHE:
2194 case MEM_CGROUP_CHARGE_TYPE_SHMEM: 2194 case MEM_CGROUP_CHARGE_TYPE_SHMEM:
2195 SetPageCgroupCache(pc); 2195 SetPageCgroupCache(pc);
2196 SetPageCgroupUsed(pc); 2196 SetPageCgroupUsed(pc);
2197 break; 2197 break;
2198 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2198 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2199 ClearPageCgroupCache(pc); 2199 ClearPageCgroupCache(pc);
2200 SetPageCgroupUsed(pc); 2200 SetPageCgroupUsed(pc);
2201 break; 2201 break;
2202 default: 2202 default:
2203 break; 2203 break;
2204 } 2204 }
2205 2205
2206 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); 2206 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
2207 unlock_page_cgroup(pc); 2207 unlock_page_cgroup(pc);
2208 /* 2208 /*
2209 * "charge_statistics" updated event counter. Then, check it. 2209 * "charge_statistics" updated event counter. Then, check it.
2210 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2210 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
2211 * if they exceeds softlimit. 2211 * if they exceeds softlimit.
2212 */ 2212 */
2213 memcg_check_events(mem, page); 2213 memcg_check_events(mem, page);
2214 } 2214 }
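/*
 * The smp_wmb() above pairs a data store (pc->mem_cgroup) with a flag store
 * (the USED bit). A userspace analog using C11 release/acquire ordering:
 * publish the payload before the flag, and read the payload only after the
 * flag has been observed. struct pc_model and the function names are
 * illustrative, not kernel interfaces.
 */
#include <stdatomic.h>
#include <stddef.h>

struct pc_model {
	void *mem_cgroup;		/* payload, plain store */
	atomic_bool used;		/* "USED bit", release/acquire */
};

static void publish(struct pc_model *pc, void *memcg)
{
	pc->mem_cgroup = memcg;
	atomic_store_explicit(&pc->used, true, memory_order_release);
}

static void *lookup(struct pc_model *pc)
{
	if (!atomic_load_explicit(&pc->used, memory_order_acquire))
		return NULL;
	return pc->mem_cgroup;		/* ordered after the flag check */
}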
2215 2215
2216 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2216 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2217 2217
2218 #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ 2218 #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
2219 (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) 2219 (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
2220 /* 2220 /*
2221 * Because tail pages are not marked as "used", set it here. We're under 2221 * Because tail pages are not marked as "used", set it here. We're under
2222 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2222 * zone->lru_lock, 'splitting on pmd' and compound_lock.
2223 */ 2223 */
2224 void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) 2224 void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
2225 { 2225 {
2226 struct page_cgroup *head_pc = lookup_page_cgroup(head); 2226 struct page_cgroup *head_pc = lookup_page_cgroup(head);
2227 struct page_cgroup *tail_pc = lookup_page_cgroup(tail); 2227 struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
2228 unsigned long flags; 2228 unsigned long flags;
2229 2229
2230 if (mem_cgroup_disabled()) 2230 if (mem_cgroup_disabled())
2231 return; 2231 return;
2232 /* 2232 /*
2233 * We have no races with charge/uncharge but will have races with 2233 * We have no races with charge/uncharge but will have races with
2234 * page state accounting. 2234 * page state accounting.
2235 */ 2235 */
2236 move_lock_page_cgroup(head_pc, &flags); 2236 move_lock_page_cgroup(head_pc, &flags);
2237 2237
2238 tail_pc->mem_cgroup = head_pc->mem_cgroup; 2238 tail_pc->mem_cgroup = head_pc->mem_cgroup;
2239 smp_wmb(); /* see __commit_charge() */ 2239 smp_wmb(); /* see __commit_charge() */
2240 if (PageCgroupAcctLRU(head_pc)) { 2240 if (PageCgroupAcctLRU(head_pc)) {
2241 enum lru_list lru; 2241 enum lru_list lru;
2242 struct mem_cgroup_per_zone *mz; 2242 struct mem_cgroup_per_zone *mz;
2243 2243
2244 /* 2244 /*
2245 * LRU flags cannot be copied because we need to add the tail 2245 * LRU flags cannot be copied because we need to add the tail
2246 * page to the LRU via the generic call, which will invoke our hook. 2246 * page to the LRU via the generic call, which will invoke our hook.
2247 * We hold lru_lock, so reduce the counter directly. 2247 * We hold lru_lock, so reduce the counter directly.
2248 */ 2248 */
2249 lru = page_lru(head); 2249 lru = page_lru(head);
2250 mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); 2250 mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
2251 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 2251 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
2252 } 2252 }
2253 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 2253 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2254 move_unlock_page_cgroup(head_pc, &flags); 2254 move_unlock_page_cgroup(head_pc, &flags);
2255 } 2255 }
2256 #endif 2256 #endif
2257 2257
2258 /** 2258 /**
2259 * mem_cgroup_move_account - move account of the page 2259 * mem_cgroup_move_account - move account of the page
2260 * @page: the page 2260 * @page: the page
2261 * @nr_pages: number of regular pages (>1 for huge pages) 2261 * @nr_pages: number of regular pages (>1 for huge pages)
2262 * @pc: page_cgroup of the page. 2262 * @pc: page_cgroup of the page.
2263 * @from: mem_cgroup which the page is moved from. 2263 * @from: mem_cgroup which the page is moved from.
2264 * @to: mem_cgroup which the page is moved to. @from != @to. 2264 * @to: mem_cgroup which the page is moved to. @from != @to.
2265 * @uncharge: whether we should call uncharge and css_put against @from. 2265 * @uncharge: whether we should call uncharge and css_put against @from.
2266 * 2266 *
2267 * The caller must confirm the following. 2267 * The caller must confirm the following.
2268 * - page is not on LRU (isolate_page() is useful.) 2268 * - page is not on LRU (isolate_page() is useful.)
2269 * - compound_lock is held when nr_pages > 1 2269 * - compound_lock is held when nr_pages > 1
2270 * 2270 *
2271 * This function doesn't do "charge" or css_get on the new cgroup; that 2271 * This function doesn't do "charge" or css_get on the new cgroup; that
2272 * should be done by the caller (__mem_cgroup_try_charge would be useful). 2272 * should be done by the caller (__mem_cgroup_try_charge would be useful).
2273 * If @uncharge is true, this function does "uncharge" from the old cgroup, 2273 * If @uncharge is true, this function does "uncharge" from the old cgroup,
2274 * but if @uncharge is false it doesn't, so the caller should do "uncharge". 2274 * but if @uncharge is false it doesn't, so the caller should do "uncharge".
2275 */ 2275 */
2276 static int mem_cgroup_move_account(struct page *page, 2276 static int mem_cgroup_move_account(struct page *page,
2277 unsigned int nr_pages, 2277 unsigned int nr_pages,
2278 struct page_cgroup *pc, 2278 struct page_cgroup *pc,
2279 struct mem_cgroup *from, 2279 struct mem_cgroup *from,
2280 struct mem_cgroup *to, 2280 struct mem_cgroup *to,
2281 bool uncharge) 2281 bool uncharge)
2282 { 2282 {
2283 unsigned long flags; 2283 unsigned long flags;
2284 int ret; 2284 int ret;
2285 2285
2286 VM_BUG_ON(from == to); 2286 VM_BUG_ON(from == to);
2287 VM_BUG_ON(PageLRU(page)); 2287 VM_BUG_ON(PageLRU(page));
2288 /* 2288 /*
2289 * The page is isolated from LRU. So, collapse function 2289 * The page is isolated from LRU. So, collapse function
2290 * will not handle this page. But page splitting can happen. 2290 * will not handle this page. But page splitting can happen.
2291 * Do this check under compound_page_lock(). The caller should 2291 * Do this check under compound_page_lock(). The caller should
2292 * hold it. 2292 * hold it.
2293 */ 2293 */
2294 ret = -EBUSY; 2294 ret = -EBUSY;
2295 if (nr_pages > 1 && !PageTransHuge(page)) 2295 if (nr_pages > 1 && !PageTransHuge(page))
2296 goto out; 2296 goto out;
2297 2297
2298 lock_page_cgroup(pc); 2298 lock_page_cgroup(pc);
2299 2299
2300 ret = -EINVAL; 2300 ret = -EINVAL;
2301 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 2301 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2302 goto unlock; 2302 goto unlock;
2303 2303
2304 move_lock_page_cgroup(pc, &flags); 2304 move_lock_page_cgroup(pc, &flags);
2305 2305
2306 if (PageCgroupFileMapped(pc)) { 2306 if (PageCgroupFileMapped(pc)) {
2307 /* Update mapped_file data for mem_cgroup */ 2307 /* Update mapped_file data for mem_cgroup */
2308 preempt_disable(); 2308 preempt_disable();
2309 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2309 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2310 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2310 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2311 preempt_enable(); 2311 preempt_enable();
2312 } 2312 }
2313 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); 2313 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
2314 if (uncharge) 2314 if (uncharge)
2315 /* This is not "cancel", but cancel_charge does all we need. */ 2315 /* This is not "cancel", but cancel_charge does all we need. */
2316 __mem_cgroup_cancel_charge(from, nr_pages); 2316 __mem_cgroup_cancel_charge(from, nr_pages);
2317 2317
2318 /* caller should have done css_get */ 2318 /* caller should have done css_get */
2319 pc->mem_cgroup = to; 2319 pc->mem_cgroup = to;
2320 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); 2320 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
2321 /* 2321 /*
2322 * We charge against "to", which may not have any tasks, so "to" 2322 * We charge against "to", which may not have any tasks, so "to"
2323 * can be under rmdir(). But in the current implementation, the only 2323 * can be under rmdir(). But in the current implementation, the only
2324 * callers of this function are force_empty() and move charge, so it is 2324 * callers of this function are force_empty() and move charge, so it is
2325 * guaranteed that "to" is never removed. So, we don't check rmdir 2325 * guaranteed that "to" is never removed. So, we don't check rmdir
2326 * status here. 2326 * status here.
2327 */ 2327 */
2328 move_unlock_page_cgroup(pc, &flags); 2328 move_unlock_page_cgroup(pc, &flags);
2329 ret = 0; 2329 ret = 0;
2330 unlock: 2330 unlock:
2331 unlock_page_cgroup(pc); 2331 unlock_page_cgroup(pc);
2332 /* 2332 /*
2333 * check events 2333 * check events
2334 */ 2334 */
2335 memcg_check_events(to, page); 2335 memcg_check_events(to, page);
2336 memcg_check_events(from, page); 2336 memcg_check_events(from, page);
2337 out: 2337 out:
2338 return ret; 2338 return ret;
2339 } 2339 }
2340 2340
2341 /* 2341 /*
2342 * move charges to its parent. 2342 * move charges to its parent.
2343 */ 2343 */
2344 2344
2345 static int mem_cgroup_move_parent(struct page *page, 2345 static int mem_cgroup_move_parent(struct page *page,
2346 struct page_cgroup *pc, 2346 struct page_cgroup *pc,
2347 struct mem_cgroup *child, 2347 struct mem_cgroup *child,
2348 gfp_t gfp_mask) 2348 gfp_t gfp_mask)
2349 { 2349 {
2350 struct cgroup *cg = child->css.cgroup; 2350 struct cgroup *cg = child->css.cgroup;
2351 struct cgroup *pcg = cg->parent; 2351 struct cgroup *pcg = cg->parent;
2352 struct mem_cgroup *parent; 2352 struct mem_cgroup *parent;
2353 unsigned int nr_pages; 2353 unsigned int nr_pages;
2354 unsigned long uninitialized_var(flags); 2354 unsigned long uninitialized_var(flags);
2355 int ret; 2355 int ret;
2356 2356
2357 /* Is ROOT ? */ 2357 /* Is ROOT ? */
2358 if (!pcg) 2358 if (!pcg)
2359 return -EINVAL; 2359 return -EINVAL;
2360 2360
2361 ret = -EBUSY; 2361 ret = -EBUSY;
2362 if (!get_page_unless_zero(page)) 2362 if (!get_page_unless_zero(page))
2363 goto out; 2363 goto out;
2364 if (isolate_lru_page(page)) 2364 if (isolate_lru_page(page))
2365 goto put; 2365 goto put;
2366 2366
2367 nr_pages = hpage_nr_pages(page); 2367 nr_pages = hpage_nr_pages(page);
2368 2368
2369 parent = mem_cgroup_from_cont(pcg); 2369 parent = mem_cgroup_from_cont(pcg);
2370 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); 2370 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
2371 if (ret || !parent) 2371 if (ret || !parent)
2372 goto put_back; 2372 goto put_back;
2373 2373
2374 if (nr_pages > 1) 2374 if (nr_pages > 1)
2375 flags = compound_lock_irqsave(page); 2375 flags = compound_lock_irqsave(page);
2376 2376
2377 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); 2377 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
2378 if (ret) 2378 if (ret)
2379 __mem_cgroup_cancel_charge(parent, nr_pages); 2379 __mem_cgroup_cancel_charge(parent, nr_pages);
2380 2380
2381 if (nr_pages > 1) 2381 if (nr_pages > 1)
2382 compound_unlock_irqrestore(page, flags); 2382 compound_unlock_irqrestore(page, flags);
2383 put_back: 2383 put_back:
2384 putback_lru_page(page); 2384 putback_lru_page(page);
2385 put: 2385 put:
2386 put_page(page); 2386 put_page(page);
2387 out: 2387 out:
2388 return ret; 2388 return ret;
2389 } 2389 }
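/*
 * The reparenting order above in miniature: take the charge on the parent
 * first, switch ownership, then release the child's charge, so the page is
 * never unaccounted in between. The two-counter model and the names are
 * illustrative only.
 */
struct counter_model {
	unsigned long pages;
};

static void move_to_parent(struct counter_model *child,
			   struct counter_model *parent,
			   unsigned long nr_pages)
{
	parent->pages += nr_pages;	/* like __mem_cgroup_try_charge(parent) */
	/* ... here the page_cgroup would be re-pointed from child to parent ... */
	child->pages -= nr_pages;	/* drop the old owner's charge */
}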
2390 2390
2391 /* 2391 /*
2392 * Charge the memory controller for page usage. 2392 * Charge the memory controller for page usage.
2393 * Return 2393 * Return
2394 * 0 if the charge was successful 2394 * 0 if the charge was successful
2395 * < 0 if the cgroup is over its limit 2395 * < 0 if the cgroup is over its limit
2396 */ 2396 */
2397 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 2397 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2398 gfp_t gfp_mask, enum charge_type ctype) 2398 gfp_t gfp_mask, enum charge_type ctype)
2399 { 2399 {
2400 struct mem_cgroup *mem = NULL; 2400 struct mem_cgroup *mem = NULL;
2401 unsigned int nr_pages = 1; 2401 unsigned int nr_pages = 1;
2402 struct page_cgroup *pc; 2402 struct page_cgroup *pc;
2403 bool oom = true; 2403 bool oom = true;
2404 int ret; 2404 int ret;
2405 2405
2406 if (PageTransHuge(page)) { 2406 if (PageTransHuge(page)) {
2407 nr_pages <<= compound_order(page); 2407 nr_pages <<= compound_order(page);
2408 VM_BUG_ON(!PageTransHuge(page)); 2408 VM_BUG_ON(!PageTransHuge(page));
2409 /* 2409 /*
2410 * Never OOM-kill a process for a huge page. The 2410 * Never OOM-kill a process for a huge page. The
2411 * fault handler will fall back to regular pages. 2411 * fault handler will fall back to regular pages.
2412 */ 2412 */
2413 oom = false; 2413 oom = false;
2414 } 2414 }
2415 2415
2416 pc = lookup_page_cgroup(page); 2416 pc = lookup_page_cgroup(page);
2417 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ 2417 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
2418 2418
2419 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); 2419 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom);
2420 if (ret || !mem) 2420 if (ret || !mem)
2421 return ret; 2421 return ret;
2422 2422
2423 __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); 2423 __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype);
2424 return 0; 2424 return 0;
2425 } 2425 }
2426 2426
2427 int mem_cgroup_newpage_charge(struct page *page, 2427 int mem_cgroup_newpage_charge(struct page *page,
2428 struct mm_struct *mm, gfp_t gfp_mask) 2428 struct mm_struct *mm, gfp_t gfp_mask)
2429 { 2429 {
2430 if (mem_cgroup_disabled()) 2430 if (mem_cgroup_disabled())
2431 return 0; 2431 return 0;
2432 /* 2432 /*
2433 * If already mapped, we don't have to account. 2433 * If already mapped, we don't have to account.
2434 * If page cache, page->mapping has address_space. 2434 * If page cache, page->mapping has address_space.
2435 * But page->mapping may hold a stale anon_vma pointer; detect 2435 * But page->mapping may hold a stale anon_vma pointer; detect
2436 * that with a PageAnon() check. A newly-mapped anon page's 2436 * that with a PageAnon() check. A newly-mapped anon page's
2437 * page->mapping is NULL. 2437 * page->mapping is NULL.
2438 */ 2438 */
2439 if (page_mapped(page) || (page->mapping && !PageAnon(page))) 2439 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
2440 return 0; 2440 return 0;
2441 if (unlikely(!mm)) 2441 if (unlikely(!mm))
2442 mm = &init_mm; 2442 mm = &init_mm;
2443 return mem_cgroup_charge_common(page, mm, gfp_mask, 2443 return mem_cgroup_charge_common(page, mm, gfp_mask,
2444 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2444 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2445 } 2445 }
2446 2446
2447 static void 2447 static void
2448 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2448 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2449 enum charge_type ctype); 2449 enum charge_type ctype);
2450 2450
2451 static void 2451 static void
2452 __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, 2452 __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
2453 enum charge_type ctype) 2453 enum charge_type ctype)
2454 { 2454 {
2455 struct page_cgroup *pc = lookup_page_cgroup(page); 2455 struct page_cgroup *pc = lookup_page_cgroup(page);
2456 /* 2456 /*
2457 * In some cases (SwapCache, FUSE's splice_buf->radixtree) the page 2457 * In some cases (SwapCache, FUSE's splice_buf->radixtree) the page
2458 * is already on the LRU, which means it may be on some other 2458 * is already on the LRU, which means it may be on some other
2459 * page_cgroup's LRU. Take care of that. 2459 * page_cgroup's LRU. Take care of that.
2460 */ 2460 */
2461 mem_cgroup_lru_del_before_commit(page); 2461 mem_cgroup_lru_del_before_commit(page);
2462 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); 2462 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
2463 mem_cgroup_lru_add_after_commit(page); 2463 mem_cgroup_lru_add_after_commit(page);
2464 return; 2464 return;
2465 } 2465 }
2466 2466
2467 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2467 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2468 gfp_t gfp_mask) 2468 gfp_t gfp_mask)
2469 { 2469 {
2470 struct mem_cgroup *mem = NULL; 2470 struct mem_cgroup *mem = NULL;
2471 int ret; 2471 int ret;
2472 2472
2473 if (mem_cgroup_disabled()) 2473 if (mem_cgroup_disabled())
2474 return 0; 2474 return 0;
2475 if (PageCompound(page)) 2475 if (PageCompound(page))
2476 return 0; 2476 return 0;
2477 /* 2477 /*
2478 * Corner case handling. This is usually called from add_to_page_cache(), 2478 * Corner case handling. This is usually called from add_to_page_cache(),
2479 * but some filesystems (shmem) precharge the page before calling it 2479 * but some filesystems (shmem) precharge the page before calling it
2480 * and then call add_to_page_cache() with GFP_NOWAIT. 2480 * and then call add_to_page_cache() with GFP_NOWAIT.
2481 * 2481 *
2482 * In the GFP_NOWAIT case, the page may be pre-charged before calling 2482 * In the GFP_NOWAIT case, the page may be pre-charged before calling
2483 * add_to_page_cache() (see shmem.c); check for that here and avoid 2483 * add_to_page_cache() (see shmem.c); check for that here and avoid
2484 * charging twice. (It works but has to pay a slightly larger cost.) 2484 * charging twice. (It works but has to pay a slightly larger cost.)
2485 * And when the page is SwapCache, it should take swap information 2485 * And when the page is SwapCache, it should take swap information
2486 * into account. This is under lock_page() now. 2486 * into account. This is under lock_page() now.
2487 */ 2487 */
2488 if (!(gfp_mask & __GFP_WAIT)) { 2488 if (!(gfp_mask & __GFP_WAIT)) {
2489 struct page_cgroup *pc; 2489 struct page_cgroup *pc;
2490 2490
2491 pc = lookup_page_cgroup(page); 2491 pc = lookup_page_cgroup(page);
2492 if (!pc) 2492 if (!pc)
2493 return 0; 2493 return 0;
2494 lock_page_cgroup(pc); 2494 lock_page_cgroup(pc);
2495 if (PageCgroupUsed(pc)) { 2495 if (PageCgroupUsed(pc)) {
2496 unlock_page_cgroup(pc); 2496 unlock_page_cgroup(pc);
2497 return 0; 2497 return 0;
2498 } 2498 }
2499 unlock_page_cgroup(pc); 2499 unlock_page_cgroup(pc);
2500 } 2500 }
2501 2501
2502 if (unlikely(!mm)) 2502 if (unlikely(!mm))
2503 mm = &init_mm; 2503 mm = &init_mm;
2504 2504
2505 if (page_is_file_cache(page)) { 2505 if (page_is_file_cache(page)) {
2506 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); 2506 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true);
2507 if (ret || !mem) 2507 if (ret || !mem)
2508 return ret; 2508 return ret;
2509 2509
2510 /* 2510 /*
2511 * FUSE reuses pages without going through the final 2511 * FUSE reuses pages without going through the final
2512 * put that would remove them from the LRU list, make 2512 * put that would remove them from the LRU list, make
2513 * sure that they get relinked properly. 2513 * sure that they get relinked properly.
2514 */ 2514 */
2515 __mem_cgroup_commit_charge_lrucare(page, mem, 2515 __mem_cgroup_commit_charge_lrucare(page, mem,
2516 MEM_CGROUP_CHARGE_TYPE_CACHE); 2516 MEM_CGROUP_CHARGE_TYPE_CACHE);
2517 return ret; 2517 return ret;
2518 } 2518 }
2519 /* shmem */ 2519 /* shmem */
2520 if (PageSwapCache(page)) { 2520 if (PageSwapCache(page)) {
2521 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2521 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2522 if (!ret) 2522 if (!ret)
2523 __mem_cgroup_commit_charge_swapin(page, mem, 2523 __mem_cgroup_commit_charge_swapin(page, mem,
2524 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2524 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2525 } else 2525 } else
2526 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 2526 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2527 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2527 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2528 2528
2529 return ret; 2529 return ret;
2530 } 2530 }
2531 2531
2532 /* 2532 /*
2533 * During swap-in (try_charge -> commit or cancel), the page is locked. 2533 * During swap-in (try_charge -> commit or cancel), the page is locked.
2534 * When try_charge() returns successfully, one refcnt on the memcg is 2534 * When try_charge() returns successfully, one refcnt on the memcg is
2535 * acquired without a struct page_cgroup. This refcnt will be consumed 2535 * acquired without a struct page_cgroup. This refcnt will be consumed
2536 * by "commit()" or dropped by "cancel()". 2536 * by "commit()" or dropped by "cancel()".
2537 */ 2537 */
2538 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2538 int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2539 struct page *page, 2539 struct page *page,
2540 gfp_t mask, struct mem_cgroup **ptr) 2540 gfp_t mask, struct mem_cgroup **ptr)
2541 { 2541 {
2542 struct mem_cgroup *mem; 2542 struct mem_cgroup *mem;
2543 int ret; 2543 int ret;
2544 2544
2545 *ptr = NULL; 2545 *ptr = NULL;
2546 2546
2547 if (mem_cgroup_disabled()) 2547 if (mem_cgroup_disabled())
2548 return 0; 2548 return 0;
2549 2549
2550 if (!do_swap_account) 2550 if (!do_swap_account)
2551 goto charge_cur_mm; 2551 goto charge_cur_mm;
2552 /* 2552 /*
2553 * A racing thread's fault, or swapoff, may have already updated 2553 * A racing thread's fault, or swapoff, may have already updated
2554 * the pte, and even removed page from swap cache: in those cases 2554 * the pte, and even removed page from swap cache: in those cases
2555 * do_swap_page()'s pte_same() test will fail; but there's also a 2555 * do_swap_page()'s pte_same() test will fail; but there's also a
2556 * KSM case which does need to charge the page. 2556 * KSM case which does need to charge the page.
2557 */ 2557 */
2558 if (!PageSwapCache(page)) 2558 if (!PageSwapCache(page))
2559 goto charge_cur_mm; 2559 goto charge_cur_mm;
2560 mem = try_get_mem_cgroup_from_page(page); 2560 mem = try_get_mem_cgroup_from_page(page);
2561 if (!mem) 2561 if (!mem)
2562 goto charge_cur_mm; 2562 goto charge_cur_mm;
2563 *ptr = mem; 2563 *ptr = mem;
2564 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); 2564 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
2565 css_put(&mem->css); 2565 css_put(&mem->css);
2566 return ret; 2566 return ret;
2567 charge_cur_mm: 2567 charge_cur_mm:
2568 if (unlikely(!mm)) 2568 if (unlikely(!mm))
2569 mm = &init_mm; 2569 mm = &init_mm;
2570 return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); 2570 return __mem_cgroup_try_charge(mm, mask, 1, ptr, true);
2571 } 2571 }
2572 2572
2573 static void 2573 static void
2574 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2574 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2575 enum charge_type ctype) 2575 enum charge_type ctype)
2576 { 2576 {
2577 if (mem_cgroup_disabled()) 2577 if (mem_cgroup_disabled())
2578 return; 2578 return;
2579 if (!ptr) 2579 if (!ptr)
2580 return; 2580 return;
2581 cgroup_exclude_rmdir(&ptr->css); 2581 cgroup_exclude_rmdir(&ptr->css);
2582 2582
2583 __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); 2583 __mem_cgroup_commit_charge_lrucare(page, ptr, ctype);
2584 /* 2584 /*
2585 * Now the swap is in memory. This means this page may be 2585 * Now the swap is in memory. This means this page may be
2586 * counted both as mem and swap, i.e. double counted. 2586 * counted both as mem and swap, i.e. double counted.
2587 * Fix that by uncharging from memsw. This SwapCache is basically stable 2587 * Fix that by uncharging from memsw. This SwapCache is basically stable
2588 * under lock_page(), but reuse_swap_page() in memory.c's do_swap_page() 2588 * under lock_page(), but reuse_swap_page() in memory.c's do_swap_page()
2589 * may call delete_from_swap_cache() before we reach here. 2589 * may call delete_from_swap_cache() before we reach here.
2590 */ 2590 */
2591 if (do_swap_account && PageSwapCache(page)) { 2591 if (do_swap_account && PageSwapCache(page)) {
2592 swp_entry_t ent = {.val = page_private(page)}; 2592 swp_entry_t ent = {.val = page_private(page)};
2593 unsigned short id; 2593 unsigned short id;
2594 struct mem_cgroup *memcg; 2594 struct mem_cgroup *memcg;
2595 2595
2596 id = swap_cgroup_record(ent, 0); 2596 id = swap_cgroup_record(ent, 0);
2597 rcu_read_lock(); 2597 rcu_read_lock();
2598 memcg = mem_cgroup_lookup(id); 2598 memcg = mem_cgroup_lookup(id);
2599 if (memcg) { 2599 if (memcg) {
2600 /* 2600 /*
2601 * The recorded memcg can be an obsolete one, so avoid 2601 * The recorded memcg can be an obsolete one, so avoid
2602 * calling css_tryget. 2602 * calling css_tryget.
2603 */ 2603 */
2604 if (!mem_cgroup_is_root(memcg)) 2604 if (!mem_cgroup_is_root(memcg))
2605 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 2605 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2606 mem_cgroup_swap_statistics(memcg, false); 2606 mem_cgroup_swap_statistics(memcg, false);
2607 mem_cgroup_put(memcg); 2607 mem_cgroup_put(memcg);
2608 } 2608 }
2609 rcu_read_unlock(); 2609 rcu_read_unlock();
2610 } 2610 }
2611 /* 2611 /*
2612 * At swapin, we may charge against a cgroup which has no tasks, 2612 * At swapin, we may charge against a cgroup which has no tasks,
2613 * so rmdir()->pre_destroy() can be called while we do this charge. 2613 * so rmdir()->pre_destroy() can be called while we do this charge.
2614 * In that case, we need to call pre_destroy() again; check that here. 2614 * In that case, we need to call pre_destroy() again; check that here.
2615 */ 2615 */
2616 cgroup_release_and_wakeup_rmdir(&ptr->css); 2616 cgroup_release_and_wakeup_rmdir(&ptr->css);
2617 } 2617 }
2618 2618
2619 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 2619 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
2620 { 2620 {
2621 __mem_cgroup_commit_charge_swapin(page, ptr, 2621 __mem_cgroup_commit_charge_swapin(page, ptr,
2622 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2622 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2623 } 2623 }
2624 2624
2625 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 2625 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
2626 { 2626 {
2627 if (mem_cgroup_disabled()) 2627 if (mem_cgroup_disabled())
2628 return; 2628 return;
2629 if (!mem) 2629 if (!mem)
2630 return; 2630 return;
2631 __mem_cgroup_cancel_charge(mem, 1); 2631 __mem_cgroup_cancel_charge(mem, 1);
2632 } 2632 }
2633 2633
2634 static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, 2634 static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
2635 unsigned int nr_pages, 2635 unsigned int nr_pages,
2636 const enum charge_type ctype) 2636 const enum charge_type ctype)
2637 { 2637 {
2638 struct memcg_batch_info *batch = NULL; 2638 struct memcg_batch_info *batch = NULL;
2639 bool uncharge_memsw = true; 2639 bool uncharge_memsw = true;
2640 2640
2641 /* If swapout, usage of swap doesn't decrease */ 2641 /* If swapout, usage of swap doesn't decrease */
2642 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2642 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2643 uncharge_memsw = false; 2643 uncharge_memsw = false;
2644 2644
2645 batch = &current->memcg_batch; 2645 batch = &current->memcg_batch;
2646 /* 2646 /*
2647 * Usually we do css_get() when we remember a memcg pointer, but 2647 * Usually we do css_get() when we remember a memcg pointer, but
2648 * here we keep res->usage until the end of a series of uncharges, 2648 * here we keep res->usage until the end of a series of uncharges,
2649 * so it's ok to ignore the memcg's refcnt. 2649 * so it's ok to ignore the memcg's refcnt.
2650 */ 2650 */
2651 if (!batch->memcg) 2651 if (!batch->memcg)
2652 batch->memcg = mem; 2652 batch->memcg = mem;
2653 /* 2653 /*
2654 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 2654 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
2655 * In those cases, the pages freed in succession can be expected to be in 2655 * In those cases, the pages freed in succession can be expected to be in
2656 * the same cgroup, so we have a chance to coalesce uncharges. 2656 * the same cgroup, so we have a chance to coalesce uncharges.
2657 * But we uncharge one by one if this task was killed by OOM (TIF_MEMDIE), 2657 * But we uncharge one by one if this task was killed by OOM (TIF_MEMDIE),
2658 * because we want to uncharge as soon as possible. 2658 * because we want to uncharge as soon as possible.
2659 */ 2659 */
2660 2660
2661 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 2661 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2662 goto direct_uncharge; 2662 goto direct_uncharge;
2663 2663
2664 if (nr_pages > 1) 2664 if (nr_pages > 1)
2665 goto direct_uncharge; 2665 goto direct_uncharge;
2666 2666
2667 /* 2667 /*
2668 * In the typical case, batch->memcg == mem. This means we can 2668 * In the typical case, batch->memcg == mem. This means we can
2669 * merge a series of uncharges into a single res_counter uncharge. 2669 * merge a series of uncharges into a single res_counter uncharge.
2670 * If not, we uncharge the res_counter one by one. 2670 * If not, we uncharge the res_counter one by one.
2671 */ 2671 */
2672 if (batch->memcg != mem) 2672 if (batch->memcg != mem)
2673 goto direct_uncharge; 2673 goto direct_uncharge;
2674 /* remember freed charge and uncharge it later */ 2674 /* remember freed charge and uncharge it later */
2675 batch->nr_pages++; 2675 batch->nr_pages++;
2676 if (uncharge_memsw) 2676 if (uncharge_memsw)
2677 batch->memsw_nr_pages++; 2677 batch->memsw_nr_pages++;
2678 return; 2678 return;
2679 direct_uncharge: 2679 direct_uncharge:
2680 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); 2680 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE);
2681 if (uncharge_memsw) 2681 if (uncharge_memsw)
2682 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); 2682 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE);
2683 if (unlikely(batch->memcg != mem)) 2683 if (unlikely(batch->memcg != mem))
2684 memcg_oom_recover(mem); 2684 memcg_oom_recover(mem);
2685 return; 2685 return;
2686 } 2686 }
2687 2687
2688 /* 2688 /*
2689 * uncharge if !page_mapped(page) 2689 * uncharge if !page_mapped(page)
2690 */ 2690 */
2691 static struct mem_cgroup * 2691 static struct mem_cgroup *
2692 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2692 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2693 { 2693 {
2694 struct mem_cgroup *mem = NULL; 2694 struct mem_cgroup *mem = NULL;
2695 unsigned int nr_pages = 1; 2695 unsigned int nr_pages = 1;
2696 struct page_cgroup *pc; 2696 struct page_cgroup *pc;
2697 2697
2698 if (mem_cgroup_disabled()) 2698 if (mem_cgroup_disabled())
2699 return NULL; 2699 return NULL;
2700 2700
2701 if (PageSwapCache(page)) 2701 if (PageSwapCache(page))
2702 return NULL; 2702 return NULL;
2703 2703
2704 if (PageTransHuge(page)) { 2704 if (PageTransHuge(page)) {
2705 nr_pages <<= compound_order(page); 2705 nr_pages <<= compound_order(page);
2706 VM_BUG_ON(!PageTransHuge(page)); 2706 VM_BUG_ON(!PageTransHuge(page));
2707 } 2707 }
2708 /* 2708 /*
2709 * Check if our page_cgroup is valid 2709 * Check if our page_cgroup is valid
2710 */ 2710 */
2711 pc = lookup_page_cgroup(page); 2711 pc = lookup_page_cgroup(page);
2712 if (unlikely(!pc || !PageCgroupUsed(pc))) 2712 if (unlikely(!pc || !PageCgroupUsed(pc)))
2713 return NULL; 2713 return NULL;
2714 2714
2715 lock_page_cgroup(pc); 2715 lock_page_cgroup(pc);
2716 2716
2717 mem = pc->mem_cgroup; 2717 mem = pc->mem_cgroup;
2718 2718
2719 if (!PageCgroupUsed(pc)) 2719 if (!PageCgroupUsed(pc))
2720 goto unlock_out; 2720 goto unlock_out;
2721 2721
2722 switch (ctype) { 2722 switch (ctype) {
2723 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2723 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2724 case MEM_CGROUP_CHARGE_TYPE_DROP: 2724 case MEM_CGROUP_CHARGE_TYPE_DROP:
2725 /* See mem_cgroup_prepare_migration() */ 2725 /* See mem_cgroup_prepare_migration() */
2726 if (page_mapped(page) || PageCgroupMigration(pc)) 2726 if (page_mapped(page) || PageCgroupMigration(pc))
2727 goto unlock_out; 2727 goto unlock_out;
2728 break; 2728 break;
2729 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 2729 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
2730 if (!PageAnon(page)) { /* Shared memory */ 2730 if (!PageAnon(page)) { /* Shared memory */
2731 if (page->mapping && !page_is_file_cache(page)) 2731 if (page->mapping && !page_is_file_cache(page))
2732 goto unlock_out; 2732 goto unlock_out;
2733 } else if (page_mapped(page)) /* Anon */ 2733 } else if (page_mapped(page)) /* Anon */
2734 goto unlock_out; 2734 goto unlock_out;
2735 break; 2735 break;
2736 default: 2736 default:
2737 break; 2737 break;
2738 } 2738 }
2739 2739
2740 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); 2740 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages);
2741 2741
2742 ClearPageCgroupUsed(pc); 2742 ClearPageCgroupUsed(pc);
2743 /* 2743 /*
2744 * pc->mem_cgroup is not cleared here. It will be accessed when it's 2744 * pc->mem_cgroup is not cleared here. It will be accessed when it's
2745 * freed from the LRU. This is safe because an uncharged page is expected 2745 * freed from the LRU. This is safe because an uncharged page is expected
2746 * not to be reused (it is freed soon). The exception is SwapCache, which 2746 * not to be reused (it is freed soon). The exception is SwapCache, which
2747 * is handled by special functions. 2747 * is handled by special functions.
2748 */ 2748 */
2749 2749
2750 unlock_page_cgroup(pc); 2750 unlock_page_cgroup(pc);
2751 /* 2751 /*
2752 * even after unlock, we have mem->res.usage here and this memcg 2752 * even after unlock, we have mem->res.usage here and this memcg
2753 * will never be freed. 2753 * will never be freed.
2754 */ 2754 */
2755 memcg_check_events(mem, page); 2755 memcg_check_events(mem, page);
2756 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { 2756 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
2757 mem_cgroup_swap_statistics(mem, true); 2757 mem_cgroup_swap_statistics(mem, true);
2758 mem_cgroup_get(mem); 2758 mem_cgroup_get(mem);
2759 } 2759 }
2760 if (!mem_cgroup_is_root(mem)) 2760 if (!mem_cgroup_is_root(mem))
2761 mem_cgroup_do_uncharge(mem, nr_pages, ctype); 2761 mem_cgroup_do_uncharge(mem, nr_pages, ctype);
2762 2762
2763 return mem; 2763 return mem;
2764 2764
2765 unlock_out: 2765 unlock_out:
2766 unlock_page_cgroup(pc); 2766 unlock_page_cgroup(pc);
2767 return NULL; 2767 return NULL;
2768 } 2768 }
2769 2769
2770 void mem_cgroup_uncharge_page(struct page *page) 2770 void mem_cgroup_uncharge_page(struct page *page)
2771 { 2771 {
2772 /* early check. */ 2772 /* early check. */
2773 if (page_mapped(page)) 2773 if (page_mapped(page))
2774 return; 2774 return;
2775 if (page->mapping && !PageAnon(page)) 2775 if (page->mapping && !PageAnon(page))
2776 return; 2776 return;
2777 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 2777 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
2778 } 2778 }
2779 2779
2780 void mem_cgroup_uncharge_cache_page(struct page *page) 2780 void mem_cgroup_uncharge_cache_page(struct page *page)
2781 { 2781 {
2782 VM_BUG_ON(page_mapped(page)); 2782 VM_BUG_ON(page_mapped(page));
2783 VM_BUG_ON(page->mapping); 2783 VM_BUG_ON(page->mapping);
2784 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 2784 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
2785 } 2785 }
2786 2786
2787 /* 2787 /*
2788 * Batch_start/batch_end are called in unmap_page_range/invalidate/truncate. 2788 * Batch_start/batch_end are called in unmap_page_range/invalidate/truncate.
2789 * In those cases, pages are freed continuously and can be expected to 2789 * In those cases, pages are freed continuously and can be expected to
2790 * be in the same memcg. Each of these calls itself limits the number of 2790 * be in the same memcg. Each of these calls itself limits the number of
2791 * pages freed at once, so uncharge_start/end() is called properly. 2791 * pages freed at once, so uncharge_start/end() is called properly.
2792 * This may be called multiple (2) times in one context. 2792 * This may be called multiple (2) times in one context.
2793 */ 2793 */
2794 2794
2795 void mem_cgroup_uncharge_start(void) 2795 void mem_cgroup_uncharge_start(void)
2796 { 2796 {
2797 current->memcg_batch.do_batch++; 2797 current->memcg_batch.do_batch++;
2798 /* We can nest. */ 2798 /* We can nest. */
2799 if (current->memcg_batch.do_batch == 1) { 2799 if (current->memcg_batch.do_batch == 1) {
2800 current->memcg_batch.memcg = NULL; 2800 current->memcg_batch.memcg = NULL;
2801 current->memcg_batch.nr_pages = 0; 2801 current->memcg_batch.nr_pages = 0;
2802 current->memcg_batch.memsw_nr_pages = 0; 2802 current->memcg_batch.memsw_nr_pages = 0;
2803 } 2803 }
2804 } 2804 }
2805 2805
2806 void mem_cgroup_uncharge_end(void) 2806 void mem_cgroup_uncharge_end(void)
2807 { 2807 {
2808 struct memcg_batch_info *batch = &current->memcg_batch; 2808 struct memcg_batch_info *batch = &current->memcg_batch;
2809 2809
2810 if (!batch->do_batch) 2810 if (!batch->do_batch)
2811 return; 2811 return;
2812 2812
2813 batch->do_batch--; 2813 batch->do_batch--;
2814 if (batch->do_batch) /* If stacked, do nothing. */ 2814 if (batch->do_batch) /* If stacked, do nothing. */
2815 return; 2815 return;
2816 2816
2817 if (!batch->memcg) 2817 if (!batch->memcg)
2818 return; 2818 return;
2819 /* 2819 /*
2820 * This "batch->memcg" is valid without any css_get/put etc... 2820 * This "batch->memcg" is valid without any css_get/put etc...
2821 * because we hide charges behind us. 2821 * because we hide charges behind us.
2822 */ 2822 */
2823 if (batch->nr_pages) 2823 if (batch->nr_pages)
2824 res_counter_uncharge(&batch->memcg->res, 2824 res_counter_uncharge(&batch->memcg->res,
2825 batch->nr_pages * PAGE_SIZE); 2825 batch->nr_pages * PAGE_SIZE);
2826 if (batch->memsw_nr_pages) 2826 if (batch->memsw_nr_pages)
2827 res_counter_uncharge(&batch->memcg->memsw, 2827 res_counter_uncharge(&batch->memcg->memsw,
2828 batch->memsw_nr_pages * PAGE_SIZE); 2828 batch->memsw_nr_pages * PAGE_SIZE);
2829 memcg_oom_recover(batch->memcg); 2829 memcg_oom_recover(batch->memcg);
2830 /* forget this pointer (for sanity check) */ 2830 /* forget this pointer (for sanity check) */
2831 batch->memcg = NULL; 2831 batch->memcg = NULL;
2832 } 2832 }
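/*
 * Illustrative pairing only: a sketch of how a caller is expected to use the
 * two helpers above, not code from this file (the real callers are the
 * truncate/invalidate/unmap paths mentioned before mem_cgroup_uncharge_start()):
 *
 *	mem_cgroup_uncharge_start();
 *	for each page being freed:
 *		mem_cgroup_uncharge_page(page);    (or the _cache_page variant)
 *	mem_cgroup_uncharge_end();
 *
 * While do_batch is non-zero, uncharges of pages belonging to the same memcg
 * are accumulated in current->memcg_batch, and the res_counters are only
 * touched once, in mem_cgroup_uncharge_end() above.
 */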
2833 2833
2834 #ifdef CONFIG_SWAP 2834 #ifdef CONFIG_SWAP
2835 /* 2835 /*
2836 * called after __delete_from_swap_cache() and drops the "page" account. 2836 * called after __delete_from_swap_cache() and drops the "page" account.
2837 * memcg information is recorded in the swap_cgroup of "ent" 2837 * memcg information is recorded in the swap_cgroup of "ent"
2838 */ 2838 */
2839 void 2839 void
2840 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 2840 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
2841 { 2841 {
2842 struct mem_cgroup *memcg; 2842 struct mem_cgroup *memcg;
2843 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 2843 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
2844 2844
2845 if (!swapout) /* this was a swap cache but the swap is unused ! */ 2845 if (!swapout) /* this was a swap cache but the swap is unused ! */
2846 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 2846 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
2847 2847
2848 memcg = __mem_cgroup_uncharge_common(page, ctype); 2848 memcg = __mem_cgroup_uncharge_common(page, ctype);
2849 2849
2850 /* 2850 /*
2851 * Record memcg information. If swapout && memcg != NULL, 2851 * Record memcg information. If swapout && memcg != NULL,
2852 * mem_cgroup_get() was called in uncharge(). 2852 * mem_cgroup_get() was called in uncharge().
2853 */ 2853 */
2854 if (do_swap_account && swapout && memcg) 2854 if (do_swap_account && swapout && memcg)
2855 swap_cgroup_record(ent, css_id(&memcg->css)); 2855 swap_cgroup_record(ent, css_id(&memcg->css));
2856 } 2856 }
2857 #endif 2857 #endif
2858 2858
2859 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 2859 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2860 /* 2860 /*
2861 * called from swap_entry_free(). Removes the record in swap_cgroup and 2861 * called from swap_entry_free(). Removes the record in swap_cgroup and
2862 * uncharges the "memsw" account. 2862 * uncharges the "memsw" account.
2863 */ 2863 */
2864 void mem_cgroup_uncharge_swap(swp_entry_t ent) 2864 void mem_cgroup_uncharge_swap(swp_entry_t ent)
2865 { 2865 {
2866 struct mem_cgroup *memcg; 2866 struct mem_cgroup *memcg;
2867 unsigned short id; 2867 unsigned short id;
2868 2868
2869 if (!do_swap_account) 2869 if (!do_swap_account)
2870 return; 2870 return;
2871 2871
2872 id = swap_cgroup_record(ent, 0); 2872 id = swap_cgroup_record(ent, 0);
2873 rcu_read_lock(); 2873 rcu_read_lock();
2874 memcg = mem_cgroup_lookup(id); 2874 memcg = mem_cgroup_lookup(id);
2875 if (memcg) { 2875 if (memcg) {
2876 /* 2876 /*
2877 * We uncharge this because swap is freed. 2877 * We uncharge this because swap is freed.
2878 * This memcg can be an obsolete one. We avoid calling css_tryget(). 2878 * This memcg can be an obsolete one. We avoid calling css_tryget().
2879 */ 2879 */
2880 if (!mem_cgroup_is_root(memcg)) 2880 if (!mem_cgroup_is_root(memcg))
2881 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 2881 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2882 mem_cgroup_swap_statistics(memcg, false); 2882 mem_cgroup_swap_statistics(memcg, false);
2883 mem_cgroup_put(memcg); 2883 mem_cgroup_put(memcg);
2884 } 2884 }
2885 rcu_read_unlock(); 2885 rcu_read_unlock();
2886 } 2886 }
2887 2887
2888 /** 2888 /**
2889 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 2889 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2890 * @entry: swap entry to be moved 2890 * @entry: swap entry to be moved
2891 * @from: mem_cgroup which the entry is moved from 2891 * @from: mem_cgroup which the entry is moved from
2892 * @to: mem_cgroup which the entry is moved to 2892 * @to: mem_cgroup which the entry is moved to
2893 * @need_fixup: whether we should fixup res_counters and refcounts. 2893 * @need_fixup: whether we should fixup res_counters and refcounts.
2894 * 2894 *
2895 * It succeeds only when the swap_cgroup's record for this entry is the same 2895 * It succeeds only when the swap_cgroup's record for this entry is the same
2896 * as the mem_cgroup's id of @from. 2896 * as the mem_cgroup's id of @from.
2897 * 2897 *
2898 * Returns 0 on success, -EINVAL on failure. 2898 * Returns 0 on success, -EINVAL on failure.
2899 * 2899 *
2900 * The caller must have charged to @to, IOW, called res_counter_charge() about 2900 * The caller must have charged to @to, IOW, called res_counter_charge() about
2901 * both res and memsw, and called css_get(). 2901 * both res and memsw, and called css_get().
2902 */ 2902 */
2903 static int mem_cgroup_move_swap_account(swp_entry_t entry, 2903 static int mem_cgroup_move_swap_account(swp_entry_t entry,
2904 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 2904 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2905 { 2905 {
2906 unsigned short old_id, new_id; 2906 unsigned short old_id, new_id;
2907 2907
2908 old_id = css_id(&from->css); 2908 old_id = css_id(&from->css);
2909 new_id = css_id(&to->css); 2909 new_id = css_id(&to->css);
2910 2910
2911 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 2911 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2912 mem_cgroup_swap_statistics(from, false); 2912 mem_cgroup_swap_statistics(from, false);
2913 mem_cgroup_swap_statistics(to, true); 2913 mem_cgroup_swap_statistics(to, true);
2914 /* 2914 /*
2915 * This function is only called from task migration context now. 2915 * This function is only called from task migration context now.
2916 * It postpones res_counter and refcount handling till the end 2916 * It postpones res_counter and refcount handling till the end
2917 * of task migration(mem_cgroup_clear_mc()) for performance 2917 * of task migration(mem_cgroup_clear_mc()) for performance
2918 * improvement. But we cannot postpone mem_cgroup_get(to) 2918 * improvement. But we cannot postpone mem_cgroup_get(to)
2919 * because if the process that has been moved to @to does 2919 * because if the process that has been moved to @to does
2920 * swap-in, the refcount of @to might be decreased to 0. 2920 * swap-in, the refcount of @to might be decreased to 0.
2921 */ 2921 */
2922 mem_cgroup_get(to); 2922 mem_cgroup_get(to);
2923 if (need_fixup) { 2923 if (need_fixup) {
2924 if (!mem_cgroup_is_root(from)) 2924 if (!mem_cgroup_is_root(from))
2925 res_counter_uncharge(&from->memsw, PAGE_SIZE); 2925 res_counter_uncharge(&from->memsw, PAGE_SIZE);
2926 mem_cgroup_put(from); 2926 mem_cgroup_put(from);
2927 /* 2927 /*
2928 * we charged both to->res and to->memsw, so we should 2928 * we charged both to->res and to->memsw, so we should
2929 * uncharge to->res. 2929 * uncharge to->res.
2930 */ 2930 */
2931 if (!mem_cgroup_is_root(to)) 2931 if (!mem_cgroup_is_root(to))
2932 res_counter_uncharge(&to->res, PAGE_SIZE); 2932 res_counter_uncharge(&to->res, PAGE_SIZE);
2933 } 2933 }
2934 return 0; 2934 return 0;
2935 } 2935 }
2936 return -EINVAL; 2936 return -EINVAL;
2937 } 2937 }
2938 #else 2938 #else
2939 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 2939 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2940 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 2940 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2941 { 2941 {
2942 return -EINVAL; 2942 return -EINVAL;
2943 } 2943 }
2944 #endif 2944 #endif
2945 2945
2946 /* 2946 /*
2947 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 2947 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
2948 * page belongs to. 2948 * page belongs to.
2949 */ 2949 */
2950 int mem_cgroup_prepare_migration(struct page *page, 2950 int mem_cgroup_prepare_migration(struct page *page,
2951 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) 2951 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
2952 { 2952 {
2953 struct mem_cgroup *mem = NULL; 2953 struct mem_cgroup *mem = NULL;
2954 struct page_cgroup *pc; 2954 struct page_cgroup *pc;
2955 enum charge_type ctype; 2955 enum charge_type ctype;
2956 int ret = 0; 2956 int ret = 0;
2957 2957
2958 *ptr = NULL; 2958 *ptr = NULL;
2959 2959
2960 VM_BUG_ON(PageTransHuge(page)); 2960 VM_BUG_ON(PageTransHuge(page));
2961 if (mem_cgroup_disabled()) 2961 if (mem_cgroup_disabled())
2962 return 0; 2962 return 0;
2963 2963
2964 pc = lookup_page_cgroup(page); 2964 pc = lookup_page_cgroup(page);
2965 lock_page_cgroup(pc); 2965 lock_page_cgroup(pc);
2966 if (PageCgroupUsed(pc)) { 2966 if (PageCgroupUsed(pc)) {
2967 mem = pc->mem_cgroup; 2967 mem = pc->mem_cgroup;
2968 css_get(&mem->css); 2968 css_get(&mem->css);
2969 /* 2969 /*
2970 * When migrating an anonymous page, its mapcount goes down 2970 * When migrating an anonymous page, its mapcount goes down
2971 * to 0 and uncharge() will be called. But, even if it's fully 2971 * to 0 and uncharge() will be called. But, even if it's fully
2972 * unmapped, migration may fail and this page has to be 2972 * unmapped, migration may fail and this page has to be
2973 * charged again. We set MIGRATION flag here and delay uncharge 2973 * charged again. We set MIGRATION flag here and delay uncharge
2974 * until end_migration() is called 2974 * until end_migration() is called
2975 * 2975 *
2976 * Corner Case Thinking 2976 * Corner Case Thinking
2977 * A) 2977 * A)
2978 * When the old page was mapped as Anon and it's unmap-and-freed 2978 * When the old page was mapped as Anon and it's unmap-and-freed
2979 * while migration was ongoing. 2979 * while migration was ongoing.
2980 * If unmap finds the old page, uncharge() of it will be delayed 2980 * If unmap finds the old page, uncharge() of it will be delayed
2981 * until end_migration(). If unmap finds a new page, it's 2981 * until end_migration(). If unmap finds a new page, it's
2982 * uncharged when its mapcount goes from 1 to 0. If the unmap code 2982 * uncharged when its mapcount goes from 1 to 0. If the unmap code
2983 * finds swap_migration_entry, the new page will not be mapped 2983 * finds swap_migration_entry, the new page will not be mapped
2984 * and end_migration() will find it(mapcount==0). 2984 * and end_migration() will find it(mapcount==0).
2985 * 2985 *
2986 * B) 2986 * B)
2987 * When the old page was mapped but migration fails, the kernel 2987 * When the old page was mapped but migration fails, the kernel
2988 * remaps it. A charge for it is kept by MIGRATION flag even 2988 * remaps it. A charge for it is kept by MIGRATION flag even
2989 * if mapcount goes down to 0. We can do remap successfully 2989 * if mapcount goes down to 0. We can do remap successfully
2990 * without charging it again. 2990 * without charging it again.
2991 * 2991 *
2992 * C) 2992 * C)
2993 * The "old" page is under lock_page() until the end of 2993 * The "old" page is under lock_page() until the end of
2994 * migration, so, the old page itself will not be swapped-out. 2994 * migration, so, the old page itself will not be swapped-out.
2995 * If the new page is swapped out before end_migration, our 2995 * If the new page is swapped out before end_migration, our
2996 * hook to usual swap-out path will catch the event. 2996 * hook to usual swap-out path will catch the event.
2997 */ 2997 */
2998 if (PageAnon(page)) 2998 if (PageAnon(page))
2999 SetPageCgroupMigration(pc); 2999 SetPageCgroupMigration(pc);
3000 } 3000 }
3001 unlock_page_cgroup(pc); 3001 unlock_page_cgroup(pc);
3002 /* 3002 /*
3003 * If the page is not charged at this point, 3003 * If the page is not charged at this point,
3004 * we return here. 3004 * we return here.
3005 */ 3005 */
3006 if (!mem) 3006 if (!mem)
3007 return 0; 3007 return 0;
3008 3008
3009 *ptr = mem; 3009 *ptr = mem;
3010 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); 3010 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
3011 css_put(&mem->css);/* drop extra refcnt */ 3011 css_put(&mem->css);/* drop extra refcnt */
3012 if (ret || *ptr == NULL) { 3012 if (ret || *ptr == NULL) {
3013 if (PageAnon(page)) { 3013 if (PageAnon(page)) {
3014 lock_page_cgroup(pc); 3014 lock_page_cgroup(pc);
3015 ClearPageCgroupMigration(pc); 3015 ClearPageCgroupMigration(pc);
3016 unlock_page_cgroup(pc); 3016 unlock_page_cgroup(pc);
3017 /* 3017 /*
3018 * The old page may be fully unmapped while we kept it. 3018 * The old page may be fully unmapped while we kept it.
3019 */ 3019 */
3020 mem_cgroup_uncharge_page(page); 3020 mem_cgroup_uncharge_page(page);
3021 } 3021 }
3022 return -ENOMEM; 3022 return -ENOMEM;
3023 } 3023 }
3024 /* 3024 /*
3025 * We charge the new page before it's used/mapped. So, even if unlock_page() 3025 * We charge the new page before it's used/mapped. So, even if unlock_page()
3026 * is called before end_migration, we can catch all events on this new 3026 * is called before end_migration, we can catch all events on this new
3027 * page. In case the new page is migrated but not remapped, the new page's 3027 * page. In case the new page is migrated but not remapped, the new page's
3028 * mapcount will finally be 0 and we call uncharge in end_migration(). 3028 * mapcount will finally be 0 and we call uncharge in end_migration().
3029 */ 3029 */
3030 pc = lookup_page_cgroup(newpage); 3030 pc = lookup_page_cgroup(newpage);
3031 if (PageAnon(page)) 3031 if (PageAnon(page))
3032 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 3032 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
3033 else if (page_is_file_cache(page)) 3033 else if (page_is_file_cache(page))
3034 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 3034 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3035 else 3035 else
3036 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3036 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3037 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); 3037 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
3038 return ret; 3038 return ret;
3039 } 3039 }
3040 3040
3041 /* remove redundant charge if migration failed */ 3041 /* remove redundant charge if migration failed */
3042 void mem_cgroup_end_migration(struct mem_cgroup *mem, 3042 void mem_cgroup_end_migration(struct mem_cgroup *mem,
3043 struct page *oldpage, struct page *newpage, bool migration_ok) 3043 struct page *oldpage, struct page *newpage, bool migration_ok)
3044 { 3044 {
3045 struct page *used, *unused; 3045 struct page *used, *unused;
3046 struct page_cgroup *pc; 3046 struct page_cgroup *pc;
3047 3047
3048 if (!mem) 3048 if (!mem)
3049 return; 3049 return;
3050 /* blocks rmdir() */ 3050 /* blocks rmdir() */
3051 cgroup_exclude_rmdir(&mem->css); 3051 cgroup_exclude_rmdir(&mem->css);
3052 if (!migration_ok) { 3052 if (!migration_ok) {
3053 used = oldpage; 3053 used = oldpage;
3054 unused = newpage; 3054 unused = newpage;
3055 } else { 3055 } else {
3056 used = newpage; 3056 used = newpage;
3057 unused = oldpage; 3057 unused = oldpage;
3058 } 3058 }
3059 /* 3059 /*
3060 * We disallowed uncharge of pages under migration because mapcount 3060 * We disallowed uncharge of pages under migration because mapcount
3061 * of the page goes down to zero, temporarily. 3061 * of the page goes down to zero, temporarily.
3062 * Clear the flag and check whether the page should be charged. 3062 * Clear the flag and check whether the page should be charged.
3063 */ 3063 */
3064 pc = lookup_page_cgroup(oldpage); 3064 pc = lookup_page_cgroup(oldpage);
3065 lock_page_cgroup(pc); 3065 lock_page_cgroup(pc);
3066 ClearPageCgroupMigration(pc); 3066 ClearPageCgroupMigration(pc);
3067 unlock_page_cgroup(pc); 3067 unlock_page_cgroup(pc);
3068 3068
3069 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); 3069 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
3070 3070
3071 /* 3071 /*
3072 * If a page is a file cache, radix-tree replacement is very atomic 3072 * If a page is a file cache, radix-tree replacement is very atomic
3073 * and we can skip this check. When it was an Anon page, its mapcount 3073 * and we can skip this check. When it was an Anon page, its mapcount
3074 * goes down to 0. But because we added the MIGRATION flag, it's not 3074 * goes down to 0. But because we added the MIGRATION flag, it's not
3075 * uncharged yet. There are several cases, but the page->mapcount check 3075 * uncharged yet. There are several cases, but the page->mapcount check
3076 * and USED bit check in mem_cgroup_uncharge_page() will do enough 3076 * and USED bit check in mem_cgroup_uncharge_page() will do enough
3077 * check. (see prepare_charge() also) 3077 * check. (see prepare_charge() also)
3078 */ 3078 */
3079 if (PageAnon(used)) 3079 if (PageAnon(used))
3080 mem_cgroup_uncharge_page(used); 3080 mem_cgroup_uncharge_page(used);
3081 /* 3081 /*
3082 * At migration, we may charge against a cgroup which has no 3082 * At migration, we may charge against a cgroup which has no
3083 * tasks. 3083 * tasks.
3084 * So, rmdir()->pre_destroy() can be called while we do this charge. 3084 * So, rmdir()->pre_destroy() can be called while we do this charge.
3085 * In that case, we need to call pre_destroy() again. Check it here. 3085 * In that case, we need to call pre_destroy() again. Check it here.
3086 */ 3086 */
3087 cgroup_release_and_wakeup_rmdir(&mem->css); 3087 cgroup_release_and_wakeup_rmdir(&mem->css);
3088 } 3088 }
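/*
 * Illustrative pairing only: a sketch of the shape of a migration caller,
 * not code from this file (the real caller lives in mm/migrate.c and its
 * error handling is more involved):
 *
 *	struct mem_cgroup *mem = NULL;
 *	int rc;
 *
 *	if (mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL))
 *		goto out;	(charging the new page failed)
 *	rc = ... move the page contents and mappings ...;
 *	mem_cgroup_end_migration(mem, page, newpage, rc == 0);
 *
 * end_migration() then uncharges whichever of the two pages ended up unused
 * and clears the MIGRATION flag set in prepare_migration().
 */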
3089 3089
3090 /* 3090 /*
3091 * A call to try to shrink memory usage on charge failure at shmem's swapin. 3091 * A call to try to shrink memory usage on charge failure at shmem's swapin.
3092 * Calling hierarchical_reclaim is not enough because we should update 3092 * Calling hierarchical_reclaim is not enough because we should update
3093 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. 3093 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
3094 * Moreover considering hierarchy, we should reclaim from the mem_over_limit, 3094 * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
3095 * not from the memcg which this page would be charged to. 3095 * not from the memcg which this page would be charged to.
3096 * try_charge_swapin does all of this work properly. 3096 * try_charge_swapin does all of this work properly.
3097 */ 3097 */
3098 int mem_cgroup_shmem_charge_fallback(struct page *page, 3098 int mem_cgroup_shmem_charge_fallback(struct page *page,
3099 struct mm_struct *mm, 3099 struct mm_struct *mm,
3100 gfp_t gfp_mask) 3100 gfp_t gfp_mask)
3101 { 3101 {
3102 struct mem_cgroup *mem; 3102 struct mem_cgroup *mem;
3103 int ret; 3103 int ret;
3104 3104
3105 if (mem_cgroup_disabled()) 3105 if (mem_cgroup_disabled())
3106 return 0; 3106 return 0;
3107 3107
3108 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 3108 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
3109 if (!ret) 3109 if (!ret)
3110 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ 3110 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
3111 3111
3112 return ret; 3112 return ret;
3113 } 3113 }
3114 3114
3115 #ifdef CONFIG_DEBUG_VM 3115 #ifdef CONFIG_DEBUG_VM
3116 static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3116 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3117 { 3117 {
3118 struct page_cgroup *pc; 3118 struct page_cgroup *pc;
3119 3119
3120 pc = lookup_page_cgroup(page); 3120 pc = lookup_page_cgroup(page);
3121 if (likely(pc) && PageCgroupUsed(pc)) 3121 if (likely(pc) && PageCgroupUsed(pc))
3122 return pc; 3122 return pc;
3123 return NULL; 3123 return NULL;
3124 } 3124 }
3125 3125
3126 bool mem_cgroup_bad_page_check(struct page *page) 3126 bool mem_cgroup_bad_page_check(struct page *page)
3127 { 3127 {
3128 if (mem_cgroup_disabled()) 3128 if (mem_cgroup_disabled())
3129 return false; 3129 return false;
3130 3130
3131 return lookup_page_cgroup_used(page) != NULL; 3131 return lookup_page_cgroup_used(page) != NULL;
3132 } 3132 }
3133 3133
3134 void mem_cgroup_print_bad_page(struct page *page) 3134 void mem_cgroup_print_bad_page(struct page *page)
3135 { 3135 {
3136 struct page_cgroup *pc; 3136 struct page_cgroup *pc;
3137 3137
3138 pc = lookup_page_cgroup_used(page); 3138 pc = lookup_page_cgroup_used(page);
3139 if (pc) { 3139 if (pc) {
3140 int ret = -1; 3140 int ret = -1;
3141 char *path; 3141 char *path;
3142 3142
3143 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", 3143 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
3144 pc, pc->flags, pc->mem_cgroup); 3144 pc, pc->flags, pc->mem_cgroup);
3145 3145
3146 path = kmalloc(PATH_MAX, GFP_KERNEL); 3146 path = kmalloc(PATH_MAX, GFP_KERNEL);
3147 if (path) { 3147 if (path) {
3148 rcu_read_lock(); 3148 rcu_read_lock();
3149 ret = cgroup_path(pc->mem_cgroup->css.cgroup, 3149 ret = cgroup_path(pc->mem_cgroup->css.cgroup,
3150 path, PATH_MAX); 3150 path, PATH_MAX);
3151 rcu_read_unlock(); 3151 rcu_read_unlock();
3152 } 3152 }
3153 3153
3154 printk(KERN_CONT "(%s)\n", 3154 printk(KERN_CONT "(%s)\n",
3155 (ret < 0) ? "cannot get the path" : path); 3155 (ret < 0) ? "cannot get the path" : path);
3156 kfree(path); 3156 kfree(path);
3157 } 3157 }
3158 } 3158 }
3159 #endif 3159 #endif
3160 3160
3161 static DEFINE_MUTEX(set_limit_mutex); 3161 static DEFINE_MUTEX(set_limit_mutex);
3162 3162
3163 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3163 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3164 unsigned long long val) 3164 unsigned long long val)
3165 { 3165 {
3166 int retry_count; 3166 int retry_count;
3167 u64 memswlimit, memlimit; 3167 u64 memswlimit, memlimit;
3168 int ret = 0; 3168 int ret = 0;
3169 int children = mem_cgroup_count_children(memcg); 3169 int children = mem_cgroup_count_children(memcg);
3170 u64 curusage, oldusage; 3170 u64 curusage, oldusage;
3171 int enlarge; 3171 int enlarge;
3172 3172
3173 /* 3173 /*
3174 * For keeping hierarchical_reclaim simple, how long we should retry 3174 * For keeping hierarchical_reclaim simple, how long we should retry
3175 * depends on the callers. We set our retry-count to be a function 3175 * depends on the callers. We set our retry-count to be a function
3176 * of # of children which we should visit in this loop. 3176 * of # of children which we should visit in this loop.
3177 */ 3177 */
3178 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 3178 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
3179 3179
3180 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3180 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3181 3181
3182 enlarge = 0; 3182 enlarge = 0;
3183 while (retry_count) { 3183 while (retry_count) {
3184 if (signal_pending(current)) { 3184 if (signal_pending(current)) {
3185 ret = -EINTR; 3185 ret = -EINTR;
3186 break; 3186 break;
3187 } 3187 }
3188 /* 3188 /*
3189 * Rather than hide all of this in some function, I do it in an 3189 * Rather than hide all of this in some function, I do it in an
3190 * open-coded manner so you can see what it really does. 3190 * open-coded manner so you can see what it really does.
3191 * We have to guarantee mem->res.limit < mem->memsw.limit. 3191 * We have to guarantee mem->res.limit < mem->memsw.limit.
3192 */ 3192 */
3193 mutex_lock(&set_limit_mutex); 3193 mutex_lock(&set_limit_mutex);
3194 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3194 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3195 if (memswlimit < val) { 3195 if (memswlimit < val) {
3196 ret = -EINVAL; 3196 ret = -EINVAL;
3197 mutex_unlock(&set_limit_mutex); 3197 mutex_unlock(&set_limit_mutex);
3198 break; 3198 break;
3199 } 3199 }
3200 3200
3201 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3201 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3202 if (memlimit < val) 3202 if (memlimit < val)
3203 enlarge = 1; 3203 enlarge = 1;
3204 3204
3205 ret = res_counter_set_limit(&memcg->res, val); 3205 ret = res_counter_set_limit(&memcg->res, val);
3206 if (!ret) { 3206 if (!ret) {
3207 if (memswlimit == val) 3207 if (memswlimit == val)
3208 memcg->memsw_is_minimum = true; 3208 memcg->memsw_is_minimum = true;
3209 else 3209 else
3210 memcg->memsw_is_minimum = false; 3210 memcg->memsw_is_minimum = false;
3211 } 3211 }
3212 mutex_unlock(&set_limit_mutex); 3212 mutex_unlock(&set_limit_mutex);
3213 3213
3214 if (!ret) 3214 if (!ret)
3215 break; 3215 break;
3216 3216
3217 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3217 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3218 MEM_CGROUP_RECLAIM_SHRINK, 3218 MEM_CGROUP_RECLAIM_SHRINK,
3219 NULL); 3219 NULL);
3220 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3220 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3221 /* Usage is reduced ? */ 3221 /* Usage is reduced ? */
3222 if (curusage >= oldusage) 3222 if (curusage >= oldusage)
3223 retry_count--; 3223 retry_count--;
3224 else 3224 else
3225 oldusage = curusage; 3225 oldusage = curusage;
3226 } 3226 }
3227 if (!ret && enlarge) 3227 if (!ret && enlarge)
3228 memcg_oom_recover(memcg); 3228 memcg_oom_recover(memcg);
3229 3229
3230 return ret; 3230 return ret;
3231 } 3231 }
3232 3232
3233 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3233 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3234 unsigned long long val) 3234 unsigned long long val)
3235 { 3235 {
3236 int retry_count; 3236 int retry_count;
3237 u64 memlimit, memswlimit, oldusage, curusage; 3237 u64 memlimit, memswlimit, oldusage, curusage;
3238 int children = mem_cgroup_count_children(memcg); 3238 int children = mem_cgroup_count_children(memcg);
3239 int ret = -EBUSY; 3239 int ret = -EBUSY;
3240 int enlarge = 0; 3240 int enlarge = 0;
3241 3241
3242 /* see mem_cgroup_resize_res_limit */ 3242 /* see mem_cgroup_resize_res_limit */
3243 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 3243 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
3244 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3244 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3245 while (retry_count) { 3245 while (retry_count) {
3246 if (signal_pending(current)) { 3246 if (signal_pending(current)) {
3247 ret = -EINTR; 3247 ret = -EINTR;
3248 break; 3248 break;
3249 } 3249 }
3250 /* 3250 /*
3251 * Rather than hide all of this in some function, I do it in an 3251 * Rather than hide all of this in some function, I do it in an
3252 * open-coded manner so you can see what it really does. 3252 * open-coded manner so you can see what it really does.
3253 * We have to guarantee mem->res.limit < mem->memsw.limit. 3253 * We have to guarantee mem->res.limit < mem->memsw.limit.
3254 */ 3254 */
3255 mutex_lock(&set_limit_mutex); 3255 mutex_lock(&set_limit_mutex);
3256 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3256 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3257 if (memlimit > val) { 3257 if (memlimit > val) {
3258 ret = -EINVAL; 3258 ret = -EINVAL;
3259 mutex_unlock(&set_limit_mutex); 3259 mutex_unlock(&set_limit_mutex);
3260 break; 3260 break;
3261 } 3261 }
3262 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3262 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3263 if (memswlimit < val) 3263 if (memswlimit < val)
3264 enlarge = 1; 3264 enlarge = 1;
3265 ret = res_counter_set_limit(&memcg->memsw, val); 3265 ret = res_counter_set_limit(&memcg->memsw, val);
3266 if (!ret) { 3266 if (!ret) {
3267 if (memlimit == val) 3267 if (memlimit == val)
3268 memcg->memsw_is_minimum = true; 3268 memcg->memsw_is_minimum = true;
3269 else 3269 else
3270 memcg->memsw_is_minimum = false; 3270 memcg->memsw_is_minimum = false;
3271 } 3271 }
3272 mutex_unlock(&set_limit_mutex); 3272 mutex_unlock(&set_limit_mutex);
3273 3273
3274 if (!ret) 3274 if (!ret)
3275 break; 3275 break;
3276 3276
3277 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3277 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3278 MEM_CGROUP_RECLAIM_NOSWAP | 3278 MEM_CGROUP_RECLAIM_NOSWAP |
3279 MEM_CGROUP_RECLAIM_SHRINK, 3279 MEM_CGROUP_RECLAIM_SHRINK,
3280 NULL); 3280 NULL);
3281 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3281 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3282 /* Usage is reduced ? */ 3282 /* Usage is reduced ? */
3283 if (curusage >= oldusage) 3283 if (curusage >= oldusage)
3284 retry_count--; 3284 retry_count--;
3285 else 3285 else
3286 oldusage = curusage; 3286 oldusage = curusage;
3287 } 3287 }
3288 if (!ret && enlarge) 3288 if (!ret && enlarge)
3289 memcg_oom_recover(memcg); 3289 memcg_oom_recover(memcg);
3290 return ret; 3290 return ret;
3291 } 3291 }
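/*
 * The -EINVAL checks in the two resize functions above enforce the invariant
 * res.limit <= memsw.limit. From userspace this becomes an ordering
 * requirement when both limits are set; an illustrative sequence (a sketch,
 * assuming the v1 memory controller is mounted at /cgroup/memory):
 *
 *	echo 2G > /cgroup/memory/grp/memory.memsw.limit_in_bytes
 *	echo 1G > /cgroup/memory/grp/memory.limit_in_bytes
 *
 * Writing memory.limit_in_bytes above the current memsw limit, or
 * memory.memsw.limit_in_bytes below the current memory limit, fails.
 */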
3292 3292
3293 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3293 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3294 gfp_t gfp_mask, 3294 gfp_t gfp_mask,
3295 unsigned long *total_scanned) 3295 unsigned long *total_scanned)
3296 { 3296 {
3297 unsigned long nr_reclaimed = 0; 3297 unsigned long nr_reclaimed = 0;
3298 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3298 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
3299 unsigned long reclaimed; 3299 unsigned long reclaimed;
3300 int loop = 0; 3300 int loop = 0;
3301 struct mem_cgroup_tree_per_zone *mctz; 3301 struct mem_cgroup_tree_per_zone *mctz;
3302 unsigned long long excess; 3302 unsigned long long excess;
3303 unsigned long nr_scanned; 3303 unsigned long nr_scanned;
3304 3304
3305 if (order > 0) 3305 if (order > 0)
3306 return 0; 3306 return 0;
3307 3307
3308 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 3308 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
3309 /* 3309 /*
3310 * This loop can run for a while, especially if mem_cgroups continuously 3310 * This loop can run for a while, especially if mem_cgroups continuously
3311 * keep exceeding their soft limit and putting the system under 3311 * keep exceeding their soft limit and putting the system under
3312 * pressure 3312 * pressure
3313 */ 3313 */
3314 do { 3314 do {
3315 if (next_mz) 3315 if (next_mz)
3316 mz = next_mz; 3316 mz = next_mz;
3317 else 3317 else
3318 mz = mem_cgroup_largest_soft_limit_node(mctz); 3318 mz = mem_cgroup_largest_soft_limit_node(mctz);
3319 if (!mz) 3319 if (!mz)
3320 break; 3320 break;
3321 3321
3322 nr_scanned = 0; 3322 nr_scanned = 0;
3323 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 3323 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
3324 gfp_mask, 3324 gfp_mask,
3325 MEM_CGROUP_RECLAIM_SOFT, 3325 MEM_CGROUP_RECLAIM_SOFT,
3326 &nr_scanned); 3326 &nr_scanned);
3327 nr_reclaimed += reclaimed; 3327 nr_reclaimed += reclaimed;
3328 *total_scanned += nr_scanned; 3328 *total_scanned += nr_scanned;
3329 spin_lock(&mctz->lock); 3329 spin_lock(&mctz->lock);
3330 3330
3331 /* 3331 /*
3332 * If we failed to reclaim anything from this memory cgroup 3332 * If we failed to reclaim anything from this memory cgroup
3333 * it is time to move on to the next cgroup 3333 * it is time to move on to the next cgroup
3334 */ 3334 */
3335 next_mz = NULL; 3335 next_mz = NULL;
3336 if (!reclaimed) { 3336 if (!reclaimed) {
3337 do { 3337 do {
3338 /* 3338 /*
3339 * Loop until we find yet another one. 3339 * Loop until we find yet another one.
3340 * 3340 *
3341 * By the time we get the soft_limit lock 3341 * By the time we get the soft_limit lock
3342 * again, someone might have added the 3342 * again, someone might have added the
3343 * group back on the RB tree. Iterate to 3343 * group back on the RB tree. Iterate to
3344 * make sure we get a different mem. 3344 * make sure we get a different mem.
3345 * mem_cgroup_largest_soft_limit_node returns 3345 * mem_cgroup_largest_soft_limit_node returns
3346 * NULL if no other cgroup is present on 3346 * NULL if no other cgroup is present on
3347 * the tree 3347 * the tree
3348 */ 3348 */
3349 next_mz = 3349 next_mz =
3350 __mem_cgroup_largest_soft_limit_node(mctz); 3350 __mem_cgroup_largest_soft_limit_node(mctz);
3351 if (next_mz == mz) { 3351 if (next_mz == mz)
3352 css_put(&next_mz->mem->css); 3352 css_put(&next_mz->mem->css);
3353 next_mz = NULL; 3353 else /* next_mz == NULL or other memcg */
3354 } else /* next_mz == NULL or other memcg */
3355 break; 3354 break;
3356 } while (1); 3355 } while (1);
3357 } 3356 }
3358 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 3357 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
3359 excess = res_counter_soft_limit_excess(&mz->mem->res); 3358 excess = res_counter_soft_limit_excess(&mz->mem->res);
3360 /* 3359 /*
3361 * One school of thought says that we should not add 3360 * One school of thought says that we should not add
3362 * back the node to the tree if reclaim returns 0. 3361 * back the node to the tree if reclaim returns 0.
3363 * But our reclaim could return 0, simply because due 3362 * But our reclaim could return 0, simply because due
3364 * to priority we are exposing a smaller subset of 3363 * to priority we are exposing a smaller subset of
3365 * memory to reclaim from. Consider this as a longer 3364 * memory to reclaim from. Consider this as a longer
3366 * term TODO. 3365 * term TODO.
3367 */ 3366 */
3368 /* If excess == 0, no tree ops */ 3367 /* If excess == 0, no tree ops */
3369 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); 3368 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
3370 spin_unlock(&mctz->lock); 3369 spin_unlock(&mctz->lock);
3371 css_put(&mz->mem->css); 3370 css_put(&mz->mem->css);
3372 loop++; 3371 loop++;
3373 /* 3372 /*
3374 * Could not reclaim anything and there are no more 3373 * Could not reclaim anything and there are no more
3375 * mem cgroups to try or we seem to be looping without 3374 * mem cgroups to try or we seem to be looping without
3376 * reclaiming anything. 3375 * reclaiming anything.
3377 */ 3376 */
3378 if (!nr_reclaimed && 3377 if (!nr_reclaimed &&
3379 (next_mz == NULL || 3378 (next_mz == NULL ||
3380 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3379 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3381 break; 3380 break;
3382 } while (!nr_reclaimed); 3381 } while (!nr_reclaimed);
3383 if (next_mz) 3382 if (next_mz)
3384 css_put(&next_mz->mem->css); 3383 css_put(&next_mz->mem->css);
3385 return nr_reclaimed; 3384 return nr_reclaimed;
3386 } 3385 }
3387 3386
3388 /* 3387 /*
3389 * This routine traverses the page_cgroups in the given list and drops them all. 3388 * This routine traverses the page_cgroups in the given list and drops them all.
3390 * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup. 3389 * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup.
3391 */ 3390 */
3392 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 3391 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3393 int node, int zid, enum lru_list lru) 3392 int node, int zid, enum lru_list lru)
3394 { 3393 {
3395 struct zone *zone; 3394 struct zone *zone;
3396 struct mem_cgroup_per_zone *mz; 3395 struct mem_cgroup_per_zone *mz;
3397 struct page_cgroup *pc, *busy; 3396 struct page_cgroup *pc, *busy;
3398 unsigned long flags, loop; 3397 unsigned long flags, loop;
3399 struct list_head *list; 3398 struct list_head *list;
3400 int ret = 0; 3399 int ret = 0;
3401 3400
3402 zone = &NODE_DATA(node)->node_zones[zid]; 3401 zone = &NODE_DATA(node)->node_zones[zid];
3403 mz = mem_cgroup_zoneinfo(mem, node, zid); 3402 mz = mem_cgroup_zoneinfo(mem, node, zid);
3404 list = &mz->lists[lru]; 3403 list = &mz->lists[lru];
3405 3404
3406 loop = MEM_CGROUP_ZSTAT(mz, lru); 3405 loop = MEM_CGROUP_ZSTAT(mz, lru);
3407 /* give some margin against EBUSY etc... */ 3406 /* give some margin against EBUSY etc... */
3408 loop += 256; 3407 loop += 256;
3409 busy = NULL; 3408 busy = NULL;
3410 while (loop--) { 3409 while (loop--) {
3411 struct page *page; 3410 struct page *page;
3412 3411
3413 ret = 0; 3412 ret = 0;
3414 spin_lock_irqsave(&zone->lru_lock, flags); 3413 spin_lock_irqsave(&zone->lru_lock, flags);
3415 if (list_empty(list)) { 3414 if (list_empty(list)) {
3416 spin_unlock_irqrestore(&zone->lru_lock, flags); 3415 spin_unlock_irqrestore(&zone->lru_lock, flags);
3417 break; 3416 break;
3418 } 3417 }
3419 pc = list_entry(list->prev, struct page_cgroup, lru); 3418 pc = list_entry(list->prev, struct page_cgroup, lru);
3420 if (busy == pc) { 3419 if (busy == pc) {
3421 list_move(&pc->lru, list); 3420 list_move(&pc->lru, list);
3422 busy = NULL; 3421 busy = NULL;
3423 spin_unlock_irqrestore(&zone->lru_lock, flags); 3422 spin_unlock_irqrestore(&zone->lru_lock, flags);
3424 continue; 3423 continue;
3425 } 3424 }
3426 spin_unlock_irqrestore(&zone->lru_lock, flags); 3425 spin_unlock_irqrestore(&zone->lru_lock, flags);
3427 3426
3428 page = lookup_cgroup_page(pc); 3427 page = lookup_cgroup_page(pc);
3429 3428
3430 ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); 3429 ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
3431 if (ret == -ENOMEM) 3430 if (ret == -ENOMEM)
3432 break; 3431 break;
3433 3432
3434 if (ret == -EBUSY || ret == -EINVAL) { 3433 if (ret == -EBUSY || ret == -EINVAL) {
3435 /* found lock contention or "pc" is obsolete. */ 3434 /* found lock contention or "pc" is obsolete. */
3436 busy = pc; 3435 busy = pc;
3437 cond_resched(); 3436 cond_resched();
3438 } else 3437 } else
3439 busy = NULL; 3438 busy = NULL;
3440 } 3439 }
3441 3440
3442 if (!ret && !list_empty(list)) 3441 if (!ret && !list_empty(list))
3443 return -EBUSY; 3442 return -EBUSY;
3444 return ret; 3443 return ret;
3445 } 3444 }
3446 3445
3447 /* 3446 /*
3448 * make the mem_cgroup's charge 0 if there is no task. 3447 * make the mem_cgroup's charge 0 if there is no task.
3449 * This enables deleting this mem_cgroup. 3448 * This enables deleting this mem_cgroup.
3450 */ 3449 */
3451 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 3450 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
3452 { 3451 {
3453 int ret; 3452 int ret;
3454 int node, zid, shrink; 3453 int node, zid, shrink;
3455 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 3454 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3456 struct cgroup *cgrp = mem->css.cgroup; 3455 struct cgroup *cgrp = mem->css.cgroup;
3457 3456
3458 css_get(&mem->css); 3457 css_get(&mem->css);
3459 3458
3460 shrink = 0; 3459 shrink = 0;
3461 /* should free all ? */ 3460 /* should free all ? */
3462 if (free_all) 3461 if (free_all)
3463 goto try_to_free; 3462 goto try_to_free;
3464 move_account: 3463 move_account:
3465 do { 3464 do {
3466 ret = -EBUSY; 3465 ret = -EBUSY;
3467 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 3466 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3468 goto out; 3467 goto out;
3469 ret = -EINTR; 3468 ret = -EINTR;
3470 if (signal_pending(current)) 3469 if (signal_pending(current))
3471 goto out; 3470 goto out;
3472 /* This is for making all *used* pages be on the LRU. */ 3471 /* This is for making all *used* pages be on the LRU. */
3473 lru_add_drain_all(); 3472 lru_add_drain_all();
3474 drain_all_stock_sync(); 3473 drain_all_stock_sync();
3475 ret = 0; 3474 ret = 0;
3476 mem_cgroup_start_move(mem); 3475 mem_cgroup_start_move(mem);
3477 for_each_node_state(node, N_HIGH_MEMORY) { 3476 for_each_node_state(node, N_HIGH_MEMORY) {
3478 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3477 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3479 enum lru_list l; 3478 enum lru_list l;
3480 for_each_lru(l) { 3479 for_each_lru(l) {
3481 ret = mem_cgroup_force_empty_list(mem, 3480 ret = mem_cgroup_force_empty_list(mem,
3482 node, zid, l); 3481 node, zid, l);
3483 if (ret) 3482 if (ret)
3484 break; 3483 break;
3485 } 3484 }
3486 } 3485 }
3487 if (ret) 3486 if (ret)
3488 break; 3487 break;
3489 } 3488 }
3490 mem_cgroup_end_move(mem); 3489 mem_cgroup_end_move(mem);
3491 memcg_oom_recover(mem); 3490 memcg_oom_recover(mem);
3492 /* it seems parent cgroup doesn't have enough mem */ 3491 /* it seems parent cgroup doesn't have enough mem */
3493 if (ret == -ENOMEM) 3492 if (ret == -ENOMEM)
3494 goto try_to_free; 3493 goto try_to_free;
3495 cond_resched(); 3494 cond_resched();
3496 /* "ret" should also be checked to ensure all lists are empty. */ 3495 /* "ret" should also be checked to ensure all lists are empty. */
3497 } while (mem->res.usage > 0 || ret); 3496 } while (mem->res.usage > 0 || ret);
3498 out: 3497 out:
3499 css_put(&mem->css); 3498 css_put(&mem->css);
3500 return ret; 3499 return ret;
3501 3500
3502 try_to_free: 3501 try_to_free:
3503 /* returns EBUSY if there is a task or if we come here twice. */ 3502 /* returns EBUSY if there is a task or if we come here twice. */
3504 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 3503 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
3505 ret = -EBUSY; 3504 ret = -EBUSY;
3506 goto out; 3505 goto out;
3507 } 3506 }
3508 /* we call try-to-free pages to make this cgroup empty */ 3507 /* we call try-to-free pages to make this cgroup empty */
3509 lru_add_drain_all(); 3508 lru_add_drain_all();
3510 /* try to free all pages in this cgroup */ 3509 /* try to free all pages in this cgroup */
3511 shrink = 1; 3510 shrink = 1;
3512 while (nr_retries && mem->res.usage > 0) { 3511 while (nr_retries && mem->res.usage > 0) {
3513 int progress; 3512 int progress;
3514 3513
3515 if (signal_pending(current)) { 3514 if (signal_pending(current)) {
3516 ret = -EINTR; 3515 ret = -EINTR;
3517 goto out; 3516 goto out;
3518 } 3517 }
3519 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 3518 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
3520 false, get_swappiness(mem)); 3519 false, get_swappiness(mem));
3521 if (!progress) { 3520 if (!progress) {
3522 nr_retries--; 3521 nr_retries--;
3523 /* maybe some writeback is necessary */ 3522 /* maybe some writeback is necessary */
3524 congestion_wait(BLK_RW_ASYNC, HZ/10); 3523 congestion_wait(BLK_RW_ASYNC, HZ/10);
3525 } 3524 }
3526 3525
3527 } 3526 }
3528 lru_add_drain(); 3527 lru_add_drain();
3529 /* try move_account...there may be some *locked* pages. */ 3528 /* try move_account...there may be some *locked* pages. */
3530 goto move_account; 3529 goto move_account;
3531 } 3530 }
3532 3531
3533 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 3532 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3534 { 3533 {
3535 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 3534 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3536 } 3535 }
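/*
 * This is the handler behind the memory.force_empty control file; a typical
 * invocation from userspace (a sketch, same mount assumption as above) is:
 *
 *	echo 0 > /cgroup/memory/grp/memory.force_empty
 *
 * which tries to move or reclaim every remaining charge so that the group
 * can be removed afterwards.
 */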
3537 3536
3538 3537
3539 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 3538 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
3540 { 3539 {
3541 return mem_cgroup_from_cont(cont)->use_hierarchy; 3540 return mem_cgroup_from_cont(cont)->use_hierarchy;
3542 } 3541 }
3543 3542
3544 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 3543 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3545 u64 val) 3544 u64 val)
3546 { 3545 {
3547 int retval = 0; 3546 int retval = 0;
3548 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3547 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3549 struct cgroup *parent = cont->parent; 3548 struct cgroup *parent = cont->parent;
3550 struct mem_cgroup *parent_mem = NULL; 3549 struct mem_cgroup *parent_mem = NULL;
3551 3550
3552 if (parent) 3551 if (parent)
3553 parent_mem = mem_cgroup_from_cont(parent); 3552 parent_mem = mem_cgroup_from_cont(parent);
3554 3553
3555 cgroup_lock(); 3554 cgroup_lock();
3556 /* 3555 /*
3557 * If parent's use_hierarchy is set, we can't make any modifications 3556 * If parent's use_hierarchy is set, we can't make any modifications
3558 * in the child subtrees. If it is unset, then the change can 3557 * in the child subtrees. If it is unset, then the change can
3559 * occur, provided the current cgroup has no children. 3558 * occur, provided the current cgroup has no children.
3560 * 3559 *
3561 * For the root cgroup, parent_mem is NULL, so we allow the value to be 3560 * For the root cgroup, parent_mem is NULL, so we allow the value to be
3562 * set if there are no children. 3561 * set if there are no children.
3563 */ 3562 */
3564 if ((!parent_mem || !parent_mem->use_hierarchy) && 3563 if ((!parent_mem || !parent_mem->use_hierarchy) &&
3565 (val == 1 || val == 0)) { 3564 (val == 1 || val == 0)) {
3566 if (list_empty(&cont->children)) 3565 if (list_empty(&cont->children))
3567 mem->use_hierarchy = val; 3566 mem->use_hierarchy = val;
3568 else 3567 else
3569 retval = -EBUSY; 3568 retval = -EBUSY;
3570 } else 3569 } else
3571 retval = -EINVAL; 3570 retval = -EINVAL;
3572 cgroup_unlock(); 3571 cgroup_unlock();
3573 3572
3574 return retval; 3573 return retval;
3575 } 3574 }
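/*
 * These two handlers back the memory.use_hierarchy file. As the checks above
 * imply, a write only succeeds while the group is still "fresh"; an
 * illustrative sequence (a sketch, same mount assumption as above):
 *
 *	mkdir /cgroup/memory/grp
 *	echo 1 > /cgroup/memory/grp/memory.use_hierarchy
 *
 * The write returns -EBUSY once the group has children, and -EINVAL if the
 * parent already has use_hierarchy set (the child inherits it in that case).
 */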
3576 3575
3577 3576
3578 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, 3577 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
3579 enum mem_cgroup_stat_index idx) 3578 enum mem_cgroup_stat_index idx)
3580 { 3579 {
3581 struct mem_cgroup *iter; 3580 struct mem_cgroup *iter;
3582 long val = 0; 3581 long val = 0;
3583 3582
3584 /* Per-cpu values can be negative, use a signed accumulator */ 3583 /* Per-cpu values can be negative, use a signed accumulator */
3585 for_each_mem_cgroup_tree(iter, mem) 3584 for_each_mem_cgroup_tree(iter, mem)
3586 val += mem_cgroup_read_stat(iter, idx); 3585 val += mem_cgroup_read_stat(iter, idx);
3587 3586
3588 if (val < 0) /* race ? */ 3587 if (val < 0) /* race ? */
3589 val = 0; 3588 val = 0;
3590 return val; 3589 return val;
3591 } 3590 }
3592 3591
3593 static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 3592 static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3594 { 3593 {
3595 u64 val; 3594 u64 val;
3596 3595
3597 if (!mem_cgroup_is_root(mem)) { 3596 if (!mem_cgroup_is_root(mem)) {
3598 if (!swap) 3597 if (!swap)
3599 return res_counter_read_u64(&mem->res, RES_USAGE); 3598 return res_counter_read_u64(&mem->res, RES_USAGE);
3600 else 3599 else
3601 return res_counter_read_u64(&mem->memsw, RES_USAGE); 3600 return res_counter_read_u64(&mem->memsw, RES_USAGE);
3602 } 3601 }
3603 3602
3604 val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); 3603 val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE);
3605 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); 3604 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS);
3606 3605
3607 if (swap) 3606 if (swap)
3608 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 3607 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3609 3608
3610 return val << PAGE_SHIFT; 3609 return val << PAGE_SHIFT;
3611 } 3610 }
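/*
 * Worked example for the root-cgroup path above (a sketch, assuming 4KiB
 * pages): with MEM_CGROUP_STAT_CACHE summing to 100 pages and
 * MEM_CGROUP_STAT_RSS to 50 pages over the hierarchy, mem_cgroup_usage()
 * reports (100 + 50) << PAGE_SHIFT = 150 * 4096 = 614400 bytes.
 */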
3612 3611
3613 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 3612 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3614 { 3613 {
3615 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3614 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3616 u64 val; 3615 u64 val;
3617 int type, name; 3616 int type, name;
3618 3617
3619 type = MEMFILE_TYPE(cft->private); 3618 type = MEMFILE_TYPE(cft->private);
3620 name = MEMFILE_ATTR(cft->private); 3619 name = MEMFILE_ATTR(cft->private);
3621 switch (type) { 3620 switch (type) {
3622 case _MEM: 3621 case _MEM:
3623 if (name == RES_USAGE) 3622 if (name == RES_USAGE)
3624 val = mem_cgroup_usage(mem, false); 3623 val = mem_cgroup_usage(mem, false);
3625 else 3624 else
3626 val = res_counter_read_u64(&mem->res, name); 3625 val = res_counter_read_u64(&mem->res, name);
3627 break; 3626 break;
3628 case _MEMSWAP: 3627 case _MEMSWAP:
3629 if (name == RES_USAGE) 3628 if (name == RES_USAGE)
3630 val = mem_cgroup_usage(mem, true); 3629 val = mem_cgroup_usage(mem, true);
3631 else 3630 else
3632 val = res_counter_read_u64(&mem->memsw, name); 3631 val = res_counter_read_u64(&mem->memsw, name);
3633 break; 3632 break;
3634 default: 3633 default:
3635 BUG(); 3634 BUG();
3636 break; 3635 break;
3637 } 3636 }
3638 return val; 3637 return val;
3639 } 3638 }
3640 /* 3639 /*
3641 * The user of this function is... 3640 * The user of this function is...
3642 * RES_LIMIT. 3641 * RES_LIMIT.
3643 */ 3642 */
3644 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 3643 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3645 const char *buffer) 3644 const char *buffer)
3646 { 3645 {
3647 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3646 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3648 int type, name; 3647 int type, name;
3649 unsigned long long val; 3648 unsigned long long val;
3650 int ret; 3649 int ret;
3651 3650
3652 type = MEMFILE_TYPE(cft->private); 3651 type = MEMFILE_TYPE(cft->private);
3653 name = MEMFILE_ATTR(cft->private); 3652 name = MEMFILE_ATTR(cft->private);
3654 switch (name) { 3653 switch (name) {
3655 case RES_LIMIT: 3654 case RES_LIMIT:
3656 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3655 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
3657 ret = -EINVAL; 3656 ret = -EINVAL;
3658 break; 3657 break;
3659 } 3658 }
3660 /* This function does all the necessary parsing...reuse it */ 3659 /* This function does all the necessary parsing...reuse it */
3661 ret = res_counter_memparse_write_strategy(buffer, &val); 3660 ret = res_counter_memparse_write_strategy(buffer, &val);
3662 if (ret) 3661 if (ret)
3663 break; 3662 break;
3664 if (type == _MEM) 3663 if (type == _MEM)
3665 ret = mem_cgroup_resize_limit(memcg, val); 3664 ret = mem_cgroup_resize_limit(memcg, val);
3666 else 3665 else
3667 ret = mem_cgroup_resize_memsw_limit(memcg, val); 3666 ret = mem_cgroup_resize_memsw_limit(memcg, val);
3668 break; 3667 break;
3669 case RES_SOFT_LIMIT: 3668 case RES_SOFT_LIMIT:
3670 ret = res_counter_memparse_write_strategy(buffer, &val); 3669 ret = res_counter_memparse_write_strategy(buffer, &val);
3671 if (ret) 3670 if (ret)
3672 break; 3671 break;
3673 /* 3672 /*
3674 * For memsw, soft limits are hard to implement in terms 3673 * For memsw, soft limits are hard to implement in terms
3675 * of semantics, for now, we support soft limits for 3674 * of semantics, for now, we support soft limits for
3676 * control without swap 3675 * control without swap
3677 */ 3676 */
3678 if (type == _MEM) 3677 if (type == _MEM)
3679 ret = res_counter_set_soft_limit(&memcg->res, val); 3678 ret = res_counter_set_soft_limit(&memcg->res, val);
3680 else 3679 else
3681 ret = -EINVAL; 3680 ret = -EINVAL;
3682 break; 3681 break;
3683 default: 3682 default:
3684 ret = -EINVAL; /* should be BUG() ? */ 3683 ret = -EINVAL; /* should be BUG() ? */
3685 break; 3684 break;
3686 } 3685 }
3687 return ret; 3686 return ret;
3688 } 3687 }
3689 3688
3690 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 3689 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
3691 unsigned long long *mem_limit, unsigned long long *memsw_limit) 3690 unsigned long long *mem_limit, unsigned long long *memsw_limit)
3692 { 3691 {
3693 struct cgroup *cgroup; 3692 struct cgroup *cgroup;
3694 unsigned long long min_limit, min_memsw_limit, tmp; 3693 unsigned long long min_limit, min_memsw_limit, tmp;
3695 3694
3696 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3695 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3697 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3696 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3698 cgroup = memcg->css.cgroup; 3697 cgroup = memcg->css.cgroup;
3699 if (!memcg->use_hierarchy) 3698 if (!memcg->use_hierarchy)
3700 goto out; 3699 goto out;
3701 3700
3702 while (cgroup->parent) { 3701 while (cgroup->parent) {
3703 cgroup = cgroup->parent; 3702 cgroup = cgroup->parent;
3704 memcg = mem_cgroup_from_cont(cgroup); 3703 memcg = mem_cgroup_from_cont(cgroup);
3705 if (!memcg->use_hierarchy) 3704 if (!memcg->use_hierarchy)
3706 break; 3705 break;
3707 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 3706 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
3708 min_limit = min(min_limit, tmp); 3707 min_limit = min(min_limit, tmp);
3709 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3708 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3710 min_memsw_limit = min(min_memsw_limit, tmp); 3709 min_memsw_limit = min(min_memsw_limit, tmp);
3711 } 3710 }
3712 out: 3711 out:
3713 *mem_limit = min_limit; 3712 *mem_limit = min_limit;
3714 *memsw_limit = min_memsw_limit; 3713 *memsw_limit = min_memsw_limit;
3715 return; 3714 return;
3716 } 3715 }
3717 3716
3718 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3717 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
3719 { 3718 {
3720 struct mem_cgroup *mem; 3719 struct mem_cgroup *mem;
3721 int type, name; 3720 int type, name;
3722 3721
3723 mem = mem_cgroup_from_cont(cont); 3722 mem = mem_cgroup_from_cont(cont);
3724 type = MEMFILE_TYPE(event); 3723 type = MEMFILE_TYPE(event);
3725 name = MEMFILE_ATTR(event); 3724 name = MEMFILE_ATTR(event);
3726 switch (name) { 3725 switch (name) {
3727 case RES_MAX_USAGE: 3726 case RES_MAX_USAGE:
3728 if (type == _MEM) 3727 if (type == _MEM)
3729 res_counter_reset_max(&mem->res); 3728 res_counter_reset_max(&mem->res);
3730 else 3729 else
3731 res_counter_reset_max(&mem->memsw); 3730 res_counter_reset_max(&mem->memsw);
3732 break; 3731 break;
3733 case RES_FAILCNT: 3732 case RES_FAILCNT:
3734 if (type == _MEM) 3733 if (type == _MEM)
3735 res_counter_reset_failcnt(&mem->res); 3734 res_counter_reset_failcnt(&mem->res);
3736 else 3735 else
3737 res_counter_reset_failcnt(&mem->memsw); 3736 res_counter_reset_failcnt(&mem->memsw);
3738 break; 3737 break;
3739 } 3738 }
3740 3739
3741 return 0; 3740 return 0;
3742 } 3741 }
3743 3742
3744 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, 3743 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
3745 struct cftype *cft) 3744 struct cftype *cft)
3746 { 3745 {
3747 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 3746 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
3748 } 3747 }
3749 3748
3750 #ifdef CONFIG_MMU 3749 #ifdef CONFIG_MMU
3751 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 3750 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3752 struct cftype *cft, u64 val) 3751 struct cftype *cft, u64 val)
3753 { 3752 {
3754 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 3753 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3755 3754
3756 if (val >= (1 << NR_MOVE_TYPE)) 3755 if (val >= (1 << NR_MOVE_TYPE))
3757 return -EINVAL; 3756 return -EINVAL;
3758 /* 3757 /*
3759 * We check this value several times both in can_attach() and 3758 * We check this value several times both in can_attach() and
3760 * attach(), so we need cgroup lock to prevent this value from being 3759 * attach(), so we need cgroup lock to prevent this value from being
3761 * inconsistent. 3760 * inconsistent.
3762 */ 3761 */
3763 cgroup_lock(); 3762 cgroup_lock();
3764 mem->move_charge_at_immigrate = val; 3763 mem->move_charge_at_immigrate = val;
3765 cgroup_unlock(); 3764 cgroup_unlock();
3766 3765
3767 return 0; 3766 return 0;
3768 } 3767 }
3769 #else 3768 #else
3770 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 3769 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3771 struct cftype *cft, u64 val) 3770 struct cftype *cft, u64 val)
3772 { 3771 {
3773 return -ENOSYS; 3772 return -ENOSYS;
3774 } 3773 }
3775 #endif 3774 #endif
3776 3775
3777 3776
3778 /* For read statistics */ 3777 /* For read statistics */
3779 enum { 3778 enum {
3780 MCS_CACHE, 3779 MCS_CACHE,
3781 MCS_RSS, 3780 MCS_RSS,
3782 MCS_FILE_MAPPED, 3781 MCS_FILE_MAPPED,
3783 MCS_PGPGIN, 3782 MCS_PGPGIN,
3784 MCS_PGPGOUT, 3783 MCS_PGPGOUT,
3785 MCS_SWAP, 3784 MCS_SWAP,
3786 MCS_INACTIVE_ANON, 3785 MCS_INACTIVE_ANON,
3787 MCS_ACTIVE_ANON, 3786 MCS_ACTIVE_ANON,
3788 MCS_INACTIVE_FILE, 3787 MCS_INACTIVE_FILE,
3789 MCS_ACTIVE_FILE, 3788 MCS_ACTIVE_FILE,
3790 MCS_UNEVICTABLE, 3789 MCS_UNEVICTABLE,
3791 NR_MCS_STAT, 3790 NR_MCS_STAT,
3792 }; 3791 };
3793 3792
3794 struct mcs_total_stat { 3793 struct mcs_total_stat {
3795 s64 stat[NR_MCS_STAT]; 3794 s64 stat[NR_MCS_STAT];
3796 }; 3795 };
3797 3796
3798 struct { 3797 struct {
3799 char *local_name; 3798 char *local_name;
3800 char *total_name; 3799 char *total_name;
3801 } memcg_stat_strings[NR_MCS_STAT] = { 3800 } memcg_stat_strings[NR_MCS_STAT] = {
3802 {"cache", "total_cache"}, 3801 {"cache", "total_cache"},
3803 {"rss", "total_rss"}, 3802 {"rss", "total_rss"},
3804 {"mapped_file", "total_mapped_file"}, 3803 {"mapped_file", "total_mapped_file"},
3805 {"pgpgin", "total_pgpgin"}, 3804 {"pgpgin", "total_pgpgin"},
3806 {"pgpgout", "total_pgpgout"}, 3805 {"pgpgout", "total_pgpgout"},
3807 {"swap", "total_swap"}, 3806 {"swap", "total_swap"},
3808 {"inactive_anon", "total_inactive_anon"}, 3807 {"inactive_anon", "total_inactive_anon"},
3809 {"active_anon", "total_active_anon"}, 3808 {"active_anon", "total_active_anon"},
3810 {"inactive_file", "total_inactive_file"}, 3809 {"inactive_file", "total_inactive_file"},
3811 {"active_file", "total_active_file"}, 3810 {"active_file", "total_active_file"},
3812 {"unevictable", "total_unevictable"} 3811 {"unevictable", "total_unevictable"}
3813 }; 3812 };
3814 3813
3815 3814
3816 static void 3815 static void
3817 mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 3816 mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3818 { 3817 {
3819 s64 val; 3818 s64 val;
3820 3819
3821 /* per cpu stat */ 3820 /* per cpu stat */
3822 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 3821 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
3823 s->stat[MCS_CACHE] += val * PAGE_SIZE; 3822 s->stat[MCS_CACHE] += val * PAGE_SIZE;
3824 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 3823 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
3825 s->stat[MCS_RSS] += val * PAGE_SIZE; 3824 s->stat[MCS_RSS] += val * PAGE_SIZE;
3826 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); 3825 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
3827 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 3826 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
3828 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); 3827 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN);
3829 s->stat[MCS_PGPGIN] += val; 3828 s->stat[MCS_PGPGIN] += val;
3830 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); 3829 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT);
3831 s->stat[MCS_PGPGOUT] += val; 3830 s->stat[MCS_PGPGOUT] += val;
3832 if (do_swap_account) { 3831 if (do_swap_account) {
3833 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 3832 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3834 s->stat[MCS_SWAP] += val * PAGE_SIZE; 3833 s->stat[MCS_SWAP] += val * PAGE_SIZE;
3835 } 3834 }
3836 3835
3837 /* per zone stat */ 3836 /* per zone stat */
3838 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 3837 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
3839 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 3838 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
3840 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); 3839 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
3841 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 3840 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
3842 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); 3841 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
3843 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 3842 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
3844 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); 3843 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
3845 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 3844 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
3846 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 3845 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
3847 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 3846 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
3848 } 3847 }
3849 3848
3850 static void 3849 static void
3851 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 3850 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3852 { 3851 {
3853 struct mem_cgroup *iter; 3852 struct mem_cgroup *iter;
3854 3853
3855 for_each_mem_cgroup_tree(iter, mem) 3854 for_each_mem_cgroup_tree(iter, mem)
3856 mem_cgroup_get_local_stat(iter, s); 3855 mem_cgroup_get_local_stat(iter, s);
3857 } 3856 }
3858 3857
3859 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 3858 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
3860 struct cgroup_map_cb *cb) 3859 struct cgroup_map_cb *cb)
3861 { 3860 {
3862 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 3861 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
3863 struct mcs_total_stat mystat; 3862 struct mcs_total_stat mystat;
3864 int i; 3863 int i;
3865 3864
3866 memset(&mystat, 0, sizeof(mystat)); 3865 memset(&mystat, 0, sizeof(mystat));
3867 mem_cgroup_get_local_stat(mem_cont, &mystat); 3866 mem_cgroup_get_local_stat(mem_cont, &mystat);
3868 3867
3869 for (i = 0; i < NR_MCS_STAT; i++) { 3868 for (i = 0; i < NR_MCS_STAT; i++) {
3870 if (i == MCS_SWAP && !do_swap_account) 3869 if (i == MCS_SWAP && !do_swap_account)
3871 continue; 3870 continue;
3872 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 3871 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
3873 } 3872 }
3874 3873
3875 /* Hierarchical information */ 3874 /* Hierarchical information */
3876 { 3875 {
3877 unsigned long long limit, memsw_limit; 3876 unsigned long long limit, memsw_limit;
3878 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 3877 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
3879 cb->fill(cb, "hierarchical_memory_limit", limit); 3878 cb->fill(cb, "hierarchical_memory_limit", limit);
3880 if (do_swap_account) 3879 if (do_swap_account)
3881 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 3880 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
3882 } 3881 }
3883 3882
3884 memset(&mystat, 0, sizeof(mystat)); 3883 memset(&mystat, 0, sizeof(mystat));
3885 mem_cgroup_get_total_stat(mem_cont, &mystat); 3884 mem_cgroup_get_total_stat(mem_cont, &mystat);
3886 for (i = 0; i < NR_MCS_STAT; i++) { 3885 for (i = 0; i < NR_MCS_STAT; i++) {
3887 if (i == MCS_SWAP && !do_swap_account) 3886 if (i == MCS_SWAP && !do_swap_account)
3888 continue; 3887 continue;
3889 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 3888 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
3890 } 3889 }
3891 3890
3892 #ifdef CONFIG_DEBUG_VM 3891 #ifdef CONFIG_DEBUG_VM
3893 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 3892 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
3894 3893
3895 { 3894 {
3896 int nid, zid; 3895 int nid, zid;
3897 struct mem_cgroup_per_zone *mz; 3896 struct mem_cgroup_per_zone *mz;
3898 unsigned long recent_rotated[2] = {0, 0}; 3897 unsigned long recent_rotated[2] = {0, 0};
3899 unsigned long recent_scanned[2] = {0, 0}; 3898 unsigned long recent_scanned[2] = {0, 0};
3900 3899
3901 for_each_online_node(nid) 3900 for_each_online_node(nid)
3902 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 3901 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3903 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 3902 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
3904 3903
3905 recent_rotated[0] += 3904 recent_rotated[0] +=
3906 mz->reclaim_stat.recent_rotated[0]; 3905 mz->reclaim_stat.recent_rotated[0];
3907 recent_rotated[1] += 3906 recent_rotated[1] +=
3908 mz->reclaim_stat.recent_rotated[1]; 3907 mz->reclaim_stat.recent_rotated[1];
3909 recent_scanned[0] += 3908 recent_scanned[0] +=
3910 mz->reclaim_stat.recent_scanned[0]; 3909 mz->reclaim_stat.recent_scanned[0];
3911 recent_scanned[1] += 3910 recent_scanned[1] +=
3912 mz->reclaim_stat.recent_scanned[1]; 3911 mz->reclaim_stat.recent_scanned[1];
3913 } 3912 }
3914 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 3913 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
3915 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 3914 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
3916 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 3915 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
3917 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 3916 cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
3918 } 3917 }
3919 #endif 3918 #endif
3920 3919
3921 return 0; 3920 return 0;
3922 } 3921 }
3923 3922
3924 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 3923 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
3925 { 3924 {
3926 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3925 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3927 3926
3928 return get_swappiness(memcg); 3927 return get_swappiness(memcg);
3929 } 3928 }
3930 3929
3931 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 3930 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
3932 u64 val) 3931 u64 val)
3933 { 3932 {
3934 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3933 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3935 struct mem_cgroup *parent; 3934 struct mem_cgroup *parent;
3936 3935
3937 if (val > 100) 3936 if (val > 100)
3938 return -EINVAL; 3937 return -EINVAL;
3939 3938
3940 if (cgrp->parent == NULL) 3939 if (cgrp->parent == NULL)
3941 return -EINVAL; 3940 return -EINVAL;
3942 3941
3943 parent = mem_cgroup_from_cont(cgrp->parent); 3942 parent = mem_cgroup_from_cont(cgrp->parent);
3944 3943
3945 cgroup_lock(); 3944 cgroup_lock();
3946 3945
3947 /* If under hierarchy, only empty-root can set this value */ 3946 /* If under hierarchy, only empty-root can set this value */
3948 if ((parent->use_hierarchy) || 3947 if ((parent->use_hierarchy) ||
3949 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 3948 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
3950 cgroup_unlock(); 3949 cgroup_unlock();
3951 return -EINVAL; 3950 return -EINVAL;
3952 } 3951 }
3953 3952
3954 memcg->swappiness = val; 3953 memcg->swappiness = val;
3955 3954
3956 cgroup_unlock(); 3955 cgroup_unlock();
3957 3956
3958 return 0; 3957 return 0;
3959 } 3958 }
3960 3959
3961 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3960 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3962 { 3961 {
3963 struct mem_cgroup_threshold_ary *t; 3962 struct mem_cgroup_threshold_ary *t;
3964 u64 usage; 3963 u64 usage;
3965 int i; 3964 int i;
3966 3965
3967 rcu_read_lock(); 3966 rcu_read_lock();
3968 if (!swap) 3967 if (!swap)
3969 t = rcu_dereference(memcg->thresholds.primary); 3968 t = rcu_dereference(memcg->thresholds.primary);
3970 else 3969 else
3971 t = rcu_dereference(memcg->memsw_thresholds.primary); 3970 t = rcu_dereference(memcg->memsw_thresholds.primary);
3972 3971
3973 if (!t) 3972 if (!t)
3974 goto unlock; 3973 goto unlock;
3975 3974
3976 usage = mem_cgroup_usage(memcg, swap); 3975 usage = mem_cgroup_usage(memcg, swap);
3977 3976
3978 /* 3977 /*
3979 * current_threshold points to threshold just below usage. 3978 * current_threshold points to threshold just below usage.
3980 * If it's not true, a threshold was crossed after last 3979 * If it's not true, a threshold was crossed after last
3981 * call of __mem_cgroup_threshold(). 3980 * call of __mem_cgroup_threshold().
3982 */ 3981 */
3983 i = t->current_threshold; 3982 i = t->current_threshold;
3984 3983
3985 /* 3984 /*
3986 * Iterate backward over array of thresholds starting from 3985 * Iterate backward over array of thresholds starting from
3987 * current_threshold and check if a threshold is crossed. 3986 * current_threshold and check if a threshold is crossed.
3988 * If none of thresholds below usage is crossed, we read 3987 * If none of thresholds below usage is crossed, we read
3989 * only one element of the array here. 3988 * only one element of the array here.
3990 */ 3989 */
3991 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 3990 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3992 eventfd_signal(t->entries[i].eventfd, 1); 3991 eventfd_signal(t->entries[i].eventfd, 1);
3993 3992
3994 /* i = current_threshold + 1 */ 3993 /* i = current_threshold + 1 */
3995 i++; 3994 i++;
3996 3995
3997 /* 3996 /*
3998 * Iterate forward over array of thresholds starting from 3997 * Iterate forward over array of thresholds starting from
3999 * current_threshold+1 and check if a threshold is crossed. 3998 * current_threshold+1 and check if a threshold is crossed.
4000 * If none of thresholds above usage is crossed, we read 3999 * If none of thresholds above usage is crossed, we read
4001 * only one element of the array here. 4000 * only one element of the array here.
4002 */ 4001 */
4003 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4002 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4004 eventfd_signal(t->entries[i].eventfd, 1); 4003 eventfd_signal(t->entries[i].eventfd, 1);
4005 4004
4006 /* Update current_threshold */ 4005 /* Update current_threshold */
4007 t->current_threshold = i - 1; 4006 t->current_threshold = i - 1;
4008 unlock: 4007 unlock:
4009 rcu_read_unlock(); 4008 rcu_read_unlock();
4010 } 4009 }
4011 4010
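For reference while reading __mem_cgroup_threshold() above: the two passes walk a sorted threshold array down and then up from current_threshold, signalling every entry that usage has crossed since the last check. The fragment below is a minimal userspace model of that scan; the array, the hard-coded usage values and printf() stand in for mem_cgroup_threshold_ary, mem_cgroup_usage() and eventfd_signal(), and every name in it is illustrative only.

/*
 * Minimal userspace model of the two-pass scan in __mem_cgroup_threshold().
 * The sorted array, the hard-coded usage values and printf() stand in for
 * mem_cgroup_threshold_ary, mem_cgroup_usage() and eventfd_signal();
 * everything here is illustrative.
 */
#include <stdio.h>

struct threshold { unsigned long long value; };

static struct threshold entries[] = { {100}, {200}, {300}, {400} };
static int current_threshold = 2;       /* last known: just below usage */

static void check_thresholds(unsigned long long usage)
{
        int size = sizeof(entries) / sizeof(entries[0]);
        int i = current_threshold;

        /* walk down: entries above the new usage were crossed downwards */
        for (; i >= 0 && entries[i].value > usage; i--)
                printf("crossed (down): %llu\n", entries[i].value);

        /* step past the threshold just below (or equal to) the new usage */
        i++;

        /* walk up: entries now at or below usage were crossed upwards */
        for (; i < size && entries[i].value <= usage; i++)
                printf("crossed (up): %llu\n", entries[i].value);

        /* remember the threshold just below the current usage */
        current_threshold = i - 1;
}

int main(void)
{
        check_thresholds(150);  /* usage fell: reports 300, then 200 */
        check_thresholds(450);  /* usage grew: reports 200, 300, 400 */
        return 0;
}
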
4012 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4011 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4013 { 4012 {
4014 while (memcg) { 4013 while (memcg) {
4015 __mem_cgroup_threshold(memcg, false); 4014 __mem_cgroup_threshold(memcg, false);
4016 if (do_swap_account) 4015 if (do_swap_account)
4017 __mem_cgroup_threshold(memcg, true); 4016 __mem_cgroup_threshold(memcg, true);
4018 4017
4019 memcg = parent_mem_cgroup(memcg); 4018 memcg = parent_mem_cgroup(memcg);
4020 } 4019 }
4021 } 4020 }
4022 4021
4023 static int compare_thresholds(const void *a, const void *b) 4022 static int compare_thresholds(const void *a, const void *b)
4024 { 4023 {
4025 const struct mem_cgroup_threshold *_a = a; 4024 const struct mem_cgroup_threshold *_a = a;
4026 const struct mem_cgroup_threshold *_b = b; 4025 const struct mem_cgroup_threshold *_b = b;
4027 4026
4028 return _a->threshold - _b->threshold; 4027 return _a->threshold - _b->threshold;
4029 } 4028 }
4030 4029
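A note on compare_thresholds() above: it returns the difference of two u64 thresholds truncated to int, so entries whose thresholds differ by 2 GiB or more can compare with the wrong sign, or as equal. The standalone sketch below (struct and function names are illustrative, not from the kernel tree) shows the effect next to the conventional three-way comparison.

/*
 * Demonstrates how subtracting two u64 thresholds and truncating the result
 * to int (as compare_thresholds() above does) can misorder large values,
 * next to the conventional three-way comparison. Names are illustrative.
 */
#include <stdio.h>
#include <stdlib.h>

struct threshold { unsigned long long value; };

static int cmp_by_subtraction(const void *a, const void *b)
{
        const struct threshold *ta = a, *tb = b;

        return ta->value - tb->value;   /* u64 difference truncated to int */
}

static int cmp_three_way(const void *a, const void *b)
{
        const struct threshold *ta = a, *tb = b;

        if (ta->value < tb->value)
                return -1;
        return ta->value > tb->value;
}

int main(void)
{
        /* 3 GiB vs 1 GiB: their difference (2 GiB) does not fit in an int */
        struct threshold t[] = { { 3ULL << 30 }, { 1ULL << 30 } };

        /* on common ABIs the 2 GiB difference becomes negative, so the
         * numerically larger entry is (wrongly) left first */
        qsort(t, 2, sizeof(t[0]), cmp_by_subtraction);
        printf("subtraction: %llu %llu\n", t[0].value, t[1].value);

        qsort(t, 2, sizeof(t[0]), cmp_three_way);
        printf("three-way:   %llu %llu\n", t[0].value, t[1].value);
        return 0;
}

Only the sign of the comparator's return value matters to sort(), so the three-way form loses nothing while staying correct for arbitrary u64 thresholds.
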
4031 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) 4030 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
4032 { 4031 {
4033 struct mem_cgroup_eventfd_list *ev; 4032 struct mem_cgroup_eventfd_list *ev;
4034 4033
4035 list_for_each_entry(ev, &mem->oom_notify, list) 4034 list_for_each_entry(ev, &mem->oom_notify, list)
4036 eventfd_signal(ev->eventfd, 1); 4035 eventfd_signal(ev->eventfd, 1);
4037 return 0; 4036 return 0;
4038 } 4037 }
4039 4038
4040 static void mem_cgroup_oom_notify(struct mem_cgroup *mem) 4039 static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
4041 { 4040 {
4042 struct mem_cgroup *iter; 4041 struct mem_cgroup *iter;
4043 4042
4044 for_each_mem_cgroup_tree(iter, mem) 4043 for_each_mem_cgroup_tree(iter, mem)
4045 mem_cgroup_oom_notify_cb(iter); 4044 mem_cgroup_oom_notify_cb(iter);
4046 } 4045 }
4047 4046
4048 static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 4047 static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4049 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4048 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4050 { 4049 {
4051 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4050 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4052 struct mem_cgroup_thresholds *thresholds; 4051 struct mem_cgroup_thresholds *thresholds;
4053 struct mem_cgroup_threshold_ary *new; 4052 struct mem_cgroup_threshold_ary *new;
4054 int type = MEMFILE_TYPE(cft->private); 4053 int type = MEMFILE_TYPE(cft->private);
4055 u64 threshold, usage; 4054 u64 threshold, usage;
4056 int i, size, ret; 4055 int i, size, ret;
4057 4056
4058 ret = res_counter_memparse_write_strategy(args, &threshold); 4057 ret = res_counter_memparse_write_strategy(args, &threshold);
4059 if (ret) 4058 if (ret)
4060 return ret; 4059 return ret;
4061 4060
4062 mutex_lock(&memcg->thresholds_lock); 4061 mutex_lock(&memcg->thresholds_lock);
4063 4062
4064 if (type == _MEM) 4063 if (type == _MEM)
4065 thresholds = &memcg->thresholds; 4064 thresholds = &memcg->thresholds;
4066 else if (type == _MEMSWAP) 4065 else if (type == _MEMSWAP)
4067 thresholds = &memcg->memsw_thresholds; 4066 thresholds = &memcg->memsw_thresholds;
4068 else 4067 else
4069 BUG(); 4068 BUG();
4070 4069
4071 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4070 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4072 4071
4073 /* Check if a threshold crossed before adding a new one */ 4072 /* Check if a threshold crossed before adding a new one */
4074 if (thresholds->primary) 4073 if (thresholds->primary)
4075 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4074 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4076 4075
4077 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 4076 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4078 4077
4079 /* Allocate memory for new array of thresholds */ 4078 /* Allocate memory for new array of thresholds */
4080 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 4079 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
4081 GFP_KERNEL); 4080 GFP_KERNEL);
4082 if (!new) { 4081 if (!new) {
4083 ret = -ENOMEM; 4082 ret = -ENOMEM;
4084 goto unlock; 4083 goto unlock;
4085 } 4084 }
4086 new->size = size; 4085 new->size = size;
4087 4086
4088 /* Copy thresholds (if any) to new array */ 4087 /* Copy thresholds (if any) to new array */
4089 if (thresholds->primary) { 4088 if (thresholds->primary) {
4090 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 4089 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4091 sizeof(struct mem_cgroup_threshold)); 4090 sizeof(struct mem_cgroup_threshold));
4092 } 4091 }
4093 4092
4094 /* Add new threshold */ 4093 /* Add new threshold */
4095 new->entries[size - 1].eventfd = eventfd; 4094 new->entries[size - 1].eventfd = eventfd;
4096 new->entries[size - 1].threshold = threshold; 4095 new->entries[size - 1].threshold = threshold;
4097 4096
4098 /* Sort thresholds. Registering of new threshold isn't time-critical */ 4097 /* Sort thresholds. Registering of new threshold isn't time-critical */
4099 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 4098 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4100 compare_thresholds, NULL); 4099 compare_thresholds, NULL);
4101 4100
4102 /* Find current threshold */ 4101 /* Find current threshold */
4103 new->current_threshold = -1; 4102 new->current_threshold = -1;
4104 for (i = 0; i < size; i++) { 4103 for (i = 0; i < size; i++) {
4105 if (new->entries[i].threshold < usage) { 4104 if (new->entries[i].threshold < usage) {
4106 /* 4105 /*
4107 * new->current_threshold will not be used until 4106 * new->current_threshold will not be used until
4108 * rcu_assign_pointer(), so it's safe to increment 4107 * rcu_assign_pointer(), so it's safe to increment
4109 * it here. 4108 * it here.
4110 */ 4109 */
4111 ++new->current_threshold; 4110 ++new->current_threshold;
4112 } 4111 }
4113 } 4112 }
4114 4113
4115 /* Free old spare buffer and save old primary buffer as spare */ 4114 /* Free old spare buffer and save old primary buffer as spare */
4116 kfree(thresholds->spare); 4115 kfree(thresholds->spare);
4117 thresholds->spare = thresholds->primary; 4116 thresholds->spare = thresholds->primary;
4118 4117
4119 rcu_assign_pointer(thresholds->primary, new); 4118 rcu_assign_pointer(thresholds->primary, new);
4120 4119
4121 /* To be sure that nobody uses thresholds */ 4120 /* To be sure that nobody uses thresholds */
4122 synchronize_rcu(); 4121 synchronize_rcu();
4123 4122
4124 unlock: 4123 unlock:
4125 mutex_unlock(&memcg->thresholds_lock); 4124 mutex_unlock(&memcg->thresholds_lock);
4126 4125
4127 return ret; 4126 return ret;
4128 } 4127 }
4129 4128
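mem_cgroup_usage_register_event() above is reached from userspace through the cgroup v1 cgroup.event_control file. A minimal registration sequence looks roughly like the sketch below; the mount point and the "demo" group name are assumptions, and error handling is kept to the bare minimum.

/*
 * Registering a memory-usage threshold from userspace (cgroup v1).
 * The mount point and group name below are assumptions; adjust as needed.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
        char buf[64];
        uint64_t ticks;
        int efd, usage_fd, ctrl_fd;

        efd = eventfd(0, 0);
        usage_fd = open("/sys/fs/cgroup/memory/demo/memory.usage_in_bytes",
                        O_RDONLY);
        ctrl_fd = open("/sys/fs/cgroup/memory/demo/cgroup.event_control",
                       O_WRONLY);
        if (efd < 0 || usage_fd < 0 || ctrl_fd < 0) {
                perror("setup");
                return 1;
        }

        /* "<event_fd> <fd of memory.usage_in_bytes> <threshold in bytes>" */
        snprintf(buf, sizeof(buf), "%d %d %llu", efd, usage_fd, 64ULL << 20);
        if (write(ctrl_fd, buf, strlen(buf)) < 0) {
                perror("register");
                return 1;
        }

        /* blocks until the 64M threshold is crossed in either direction */
        if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
                printf("threshold crossed (%llu event(s))\n",
                       (unsigned long long)ticks);
        return 0;
}
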
4130 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, 4129 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4131 struct cftype *cft, struct eventfd_ctx *eventfd) 4130 struct cftype *cft, struct eventfd_ctx *eventfd)
4132 { 4131 {
4133 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4132 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4134 struct mem_cgroup_thresholds *thresholds; 4133 struct mem_cgroup_thresholds *thresholds;
4135 struct mem_cgroup_threshold_ary *new; 4134 struct mem_cgroup_threshold_ary *new;
4136 int type = MEMFILE_TYPE(cft->private); 4135 int type = MEMFILE_TYPE(cft->private);
4137 u64 usage; 4136 u64 usage;
4138 int i, j, size; 4137 int i, j, size;
4139 4138
4140 mutex_lock(&memcg->thresholds_lock); 4139 mutex_lock(&memcg->thresholds_lock);
4141 if (type == _MEM) 4140 if (type == _MEM)
4142 thresholds = &memcg->thresholds; 4141 thresholds = &memcg->thresholds;
4143 else if (type == _MEMSWAP) 4142 else if (type == _MEMSWAP)
4144 thresholds = &memcg->memsw_thresholds; 4143 thresholds = &memcg->memsw_thresholds;
4145 else 4144 else
4146 BUG(); 4145 BUG();
4147 4146
4148 /* 4147 /*
4149 * Something went wrong if we are trying to unregister a threshold 4148 * Something went wrong if we are trying to unregister a threshold
4150 * when we don't have any thresholds 4149 * when we don't have any thresholds
4151 */ 4150 */
4152 BUG_ON(!thresholds); 4151 BUG_ON(!thresholds);
4153 4152
4154 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4153 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4155 4154
4156 /* Check if a threshold crossed before removing */ 4155 /* Check if a threshold crossed before removing */
4157 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4156 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4158 4157
4159 /* Calculate the new number of thresholds */ 4158 /* Calculate the new number of thresholds */
4160 size = 0; 4159 size = 0;
4161 for (i = 0; i < thresholds->primary->size; i++) { 4160 for (i = 0; i < thresholds->primary->size; i++) {
4162 if (thresholds->primary->entries[i].eventfd != eventfd) 4161 if (thresholds->primary->entries[i].eventfd != eventfd)
4163 size++; 4162 size++;
4164 } 4163 }
4165 4164
4166 new = thresholds->spare; 4165 new = thresholds->spare;
4167 4166
4168 /* Set thresholds array to NULL if we don't have thresholds */ 4167 /* Set thresholds array to NULL if we don't have thresholds */
4169 if (!size) { 4168 if (!size) {
4170 kfree(new); 4169 kfree(new);
4171 new = NULL; 4170 new = NULL;
4172 goto swap_buffers; 4171 goto swap_buffers;
4173 } 4172 }
4174 4173
4175 new->size = size; 4174 new->size = size;
4176 4175
4177 /* Copy thresholds and find current threshold */ 4176 /* Copy thresholds and find current threshold */
4178 new->current_threshold = -1; 4177 new->current_threshold = -1;
4179 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4178 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4180 if (thresholds->primary->entries[i].eventfd == eventfd) 4179 if (thresholds->primary->entries[i].eventfd == eventfd)
4181 continue; 4180 continue;
4182 4181
4183 new->entries[j] = thresholds->primary->entries[i]; 4182 new->entries[j] = thresholds->primary->entries[i];
4184 if (new->entries[j].threshold < usage) { 4183 if (new->entries[j].threshold < usage) {
4185 /* 4184 /*
4186 * new->current_threshold will not be used 4185 * new->current_threshold will not be used
4187 * until rcu_assign_pointer(), so it's safe to increment 4186 * until rcu_assign_pointer(), so it's safe to increment
4188 * it here. 4187 * it here.
4189 */ 4188 */
4190 ++new->current_threshold; 4189 ++new->current_threshold;
4191 } 4190 }
4192 j++; 4191 j++;
4193 } 4192 }
4194 4193
4195 swap_buffers: 4194 swap_buffers:
4196 /* Swap primary and spare array */ 4195 /* Swap primary and spare array */
4197 thresholds->spare = thresholds->primary; 4196 thresholds->spare = thresholds->primary;
4198 rcu_assign_pointer(thresholds->primary, new); 4197 rcu_assign_pointer(thresholds->primary, new);
4199 4198
4200 /* To be sure that nobody uses thresholds */ 4199 /* To be sure that nobody uses thresholds */
4201 synchronize_rcu(); 4200 synchronize_rcu();
4202 4201
4203 mutex_unlock(&memcg->thresholds_lock); 4202 mutex_unlock(&memcg->thresholds_lock);
4204 } 4203 }
4205 4204
4206 static int mem_cgroup_oom_register_event(struct cgroup *cgrp, 4205 static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4207 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4206 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4208 { 4207 {
4209 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4208 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4210 struct mem_cgroup_eventfd_list *event; 4209 struct mem_cgroup_eventfd_list *event;
4211 int type = MEMFILE_TYPE(cft->private); 4210 int type = MEMFILE_TYPE(cft->private);
4212 4211
4213 BUG_ON(type != _OOM_TYPE); 4212 BUG_ON(type != _OOM_TYPE);
4214 event = kmalloc(sizeof(*event), GFP_KERNEL); 4213 event = kmalloc(sizeof(*event), GFP_KERNEL);
4215 if (!event) 4214 if (!event)
4216 return -ENOMEM; 4215 return -ENOMEM;
4217 4216
4218 mutex_lock(&memcg_oom_mutex); 4217 mutex_lock(&memcg_oom_mutex);
4219 4218
4220 event->eventfd = eventfd; 4219 event->eventfd = eventfd;
4221 list_add(&event->list, &memcg->oom_notify); 4220 list_add(&event->list, &memcg->oom_notify);
4222 4221
4223 /* already in OOM ? */ 4222 /* already in OOM ? */
4224 if (atomic_read(&memcg->oom_lock)) 4223 if (atomic_read(&memcg->oom_lock))
4225 eventfd_signal(eventfd, 1); 4224 eventfd_signal(eventfd, 1);
4226 mutex_unlock(&memcg_oom_mutex); 4225 mutex_unlock(&memcg_oom_mutex);
4227 4226
4228 return 0; 4227 return 0;
4229 } 4228 }
4230 4229
4231 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 4230 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4232 struct cftype *cft, struct eventfd_ctx *eventfd) 4231 struct cftype *cft, struct eventfd_ctx *eventfd)
4233 { 4232 {
4234 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4233 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4235 struct mem_cgroup_eventfd_list *ev, *tmp; 4234 struct mem_cgroup_eventfd_list *ev, *tmp;
4236 int type = MEMFILE_TYPE(cft->private); 4235 int type = MEMFILE_TYPE(cft->private);
4237 4236
4238 BUG_ON(type != _OOM_TYPE); 4237 BUG_ON(type != _OOM_TYPE);
4239 4238
4240 mutex_lock(&memcg_oom_mutex); 4239 mutex_lock(&memcg_oom_mutex);
4241 4240
4242 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { 4241 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
4243 if (ev->eventfd == eventfd) { 4242 if (ev->eventfd == eventfd) {
4244 list_del(&ev->list); 4243 list_del(&ev->list);
4245 kfree(ev); 4244 kfree(ev);
4246 } 4245 }
4247 } 4246 }
4248 4247
4249 mutex_unlock(&memcg_oom_mutex); 4248 mutex_unlock(&memcg_oom_mutex);
4250 } 4249 }
4251 4250
4252 static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 4251 static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4253 struct cftype *cft, struct cgroup_map_cb *cb) 4252 struct cftype *cft, struct cgroup_map_cb *cb)
4254 { 4253 {
4255 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4254 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4256 4255
4257 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); 4256 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
4258 4257
4259 if (atomic_read(&mem->oom_lock)) 4258 if (atomic_read(&mem->oom_lock))
4260 cb->fill(cb, "under_oom", 1); 4259 cb->fill(cb, "under_oom", 1);
4261 else 4260 else
4262 cb->fill(cb, "under_oom", 0); 4261 cb->fill(cb, "under_oom", 0);
4263 return 0; 4262 return 0;
4264 } 4263 }
4265 4264
4266 static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 4265 static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4267 struct cftype *cft, u64 val) 4266 struct cftype *cft, u64 val)
4268 { 4267 {
4269 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4268 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4270 struct mem_cgroup *parent; 4269 struct mem_cgroup *parent;
4271 4270
4272 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4271 /* cannot set to root cgroup and only 0 and 1 are allowed */
4273 if (!cgrp->parent || !((val == 0) || (val == 1))) 4272 if (!cgrp->parent || !((val == 0) || (val == 1)))
4274 return -EINVAL; 4273 return -EINVAL;
4275 4274
4276 parent = mem_cgroup_from_cont(cgrp->parent); 4275 parent = mem_cgroup_from_cont(cgrp->parent);
4277 4276
4278 cgroup_lock(); 4277 cgroup_lock();
4279 /* oom-kill-disable is a flag for subhierarchy. */ 4278 /* oom-kill-disable is a flag for subhierarchy. */
4280 if ((parent->use_hierarchy) || 4279 if ((parent->use_hierarchy) ||
4281 (mem->use_hierarchy && !list_empty(&cgrp->children))) { 4280 (mem->use_hierarchy && !list_empty(&cgrp->children))) {
4282 cgroup_unlock(); 4281 cgroup_unlock();
4283 return -EINVAL; 4282 return -EINVAL;
4284 } 4283 }
4285 mem->oom_kill_disable = val; 4284 mem->oom_kill_disable = val;
4286 if (!val) 4285 if (!val)
4287 memcg_oom_recover(mem); 4286 memcg_oom_recover(mem);
4288 cgroup_unlock(); 4287 cgroup_unlock();
4289 return 0; 4288 return 0;
4290 } 4289 }
4291 4290
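The oom_control pieces above pair with an eventfd from userspace in the same way as the usage thresholds. A rough sketch follows, again assuming a cgroup v1 memory hierarchy mounted at /sys/fs/cgroup/memory with an existing "demo" group; both paths are assumptions and error handling is trimmed.

/*
 * Userspace side of the oom_control interface above (cgroup v1). The mount
 * point and the "demo" group are assumptions; error handling is trimmed.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
        const char *grp = "/sys/fs/cgroup/memory/demo";
        char buf[64], path[128];
        uint64_t ticks;
        int efd, oom_fd, ctrl_fd;

        snprintf(path, sizeof(path), "%s/memory.oom_control", grp);
        oom_fd = open(path, O_RDWR);
        snprintf(path, sizeof(path), "%s/cgroup.event_control", grp);
        ctrl_fd = open(path, O_WRONLY);
        efd = eventfd(0, 0);
        if (oom_fd < 0 || ctrl_fd < 0 || efd < 0) {
                perror("setup");
                return 1;
        }

        /* disable the per-group OOM killer; tasks wait instead of dying */
        if (write(oom_fd, "1", 1) < 0)
                perror("oom_kill_disable");

        /* register for a notification when the group hits OOM */
        snprintf(buf, sizeof(buf), "%d %d", efd, oom_fd);
        if (write(ctrl_fd, buf, strlen(buf)) < 0)
                perror("register");

        /* wait; mem_cgroup_oom_notify() fires the eventfd on OOM */
        if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
                printf("group is under OOM (see under_oom above)\n");
        return 0;
}
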
4292 static struct cftype mem_cgroup_files[] = { 4291 static struct cftype mem_cgroup_files[] = {
4293 { 4292 {
4294 .name = "usage_in_bytes", 4293 .name = "usage_in_bytes",
4295 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4294 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4296 .read_u64 = mem_cgroup_read, 4295 .read_u64 = mem_cgroup_read,
4297 .register_event = mem_cgroup_usage_register_event, 4296 .register_event = mem_cgroup_usage_register_event,
4298 .unregister_event = mem_cgroup_usage_unregister_event, 4297 .unregister_event = mem_cgroup_usage_unregister_event,
4299 }, 4298 },
4300 { 4299 {
4301 .name = "max_usage_in_bytes", 4300 .name = "max_usage_in_bytes",
4302 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4301 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4303 .trigger = mem_cgroup_reset, 4302 .trigger = mem_cgroup_reset,
4304 .read_u64 = mem_cgroup_read, 4303 .read_u64 = mem_cgroup_read,
4305 }, 4304 },
4306 { 4305 {
4307 .name = "limit_in_bytes", 4306 .name = "limit_in_bytes",
4308 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4307 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4309 .write_string = mem_cgroup_write, 4308 .write_string = mem_cgroup_write,
4310 .read_u64 = mem_cgroup_read, 4309 .read_u64 = mem_cgroup_read,
4311 }, 4310 },
4312 { 4311 {
4313 .name = "soft_limit_in_bytes", 4312 .name = "soft_limit_in_bytes",
4314 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4313 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4315 .write_string = mem_cgroup_write, 4314 .write_string = mem_cgroup_write,
4316 .read_u64 = mem_cgroup_read, 4315 .read_u64 = mem_cgroup_read,
4317 }, 4316 },
4318 { 4317 {
4319 .name = "failcnt", 4318 .name = "failcnt",
4320 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4319 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4321 .trigger = mem_cgroup_reset, 4320 .trigger = mem_cgroup_reset,
4322 .read_u64 = mem_cgroup_read, 4321 .read_u64 = mem_cgroup_read,
4323 }, 4322 },
4324 { 4323 {
4325 .name = "stat", 4324 .name = "stat",
4326 .read_map = mem_control_stat_show, 4325 .read_map = mem_control_stat_show,
4327 }, 4326 },
4328 { 4327 {
4329 .name = "force_empty", 4328 .name = "force_empty",
4330 .trigger = mem_cgroup_force_empty_write, 4329 .trigger = mem_cgroup_force_empty_write,
4331 }, 4330 },
4332 { 4331 {
4333 .name = "use_hierarchy", 4332 .name = "use_hierarchy",
4334 .write_u64 = mem_cgroup_hierarchy_write, 4333 .write_u64 = mem_cgroup_hierarchy_write,
4335 .read_u64 = mem_cgroup_hierarchy_read, 4334 .read_u64 = mem_cgroup_hierarchy_read,
4336 }, 4335 },
4337 { 4336 {
4338 .name = "swappiness", 4337 .name = "swappiness",
4339 .read_u64 = mem_cgroup_swappiness_read, 4338 .read_u64 = mem_cgroup_swappiness_read,
4340 .write_u64 = mem_cgroup_swappiness_write, 4339 .write_u64 = mem_cgroup_swappiness_write,
4341 }, 4340 },
4342 { 4341 {
4343 .name = "move_charge_at_immigrate", 4342 .name = "move_charge_at_immigrate",
4344 .read_u64 = mem_cgroup_move_charge_read, 4343 .read_u64 = mem_cgroup_move_charge_read,
4345 .write_u64 = mem_cgroup_move_charge_write, 4344 .write_u64 = mem_cgroup_move_charge_write,
4346 }, 4345 },
4347 { 4346 {
4348 .name = "oom_control", 4347 .name = "oom_control",
4349 .read_map = mem_cgroup_oom_control_read, 4348 .read_map = mem_cgroup_oom_control_read,
4350 .write_u64 = mem_cgroup_oom_control_write, 4349 .write_u64 = mem_cgroup_oom_control_write,
4351 .register_event = mem_cgroup_oom_register_event, 4350 .register_event = mem_cgroup_oom_register_event,
4352 .unregister_event = mem_cgroup_oom_unregister_event, 4351 .unregister_event = mem_cgroup_oom_unregister_event,
4353 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4352 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4354 }, 4353 },
4355 }; 4354 };
4356 4355
4357 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4356 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4358 static struct cftype memsw_cgroup_files[] = { 4357 static struct cftype memsw_cgroup_files[] = {
4359 { 4358 {
4360 .name = "memsw.usage_in_bytes", 4359 .name = "memsw.usage_in_bytes",
4361 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4360 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4362 .read_u64 = mem_cgroup_read, 4361 .read_u64 = mem_cgroup_read,
4363 .register_event = mem_cgroup_usage_register_event, 4362 .register_event = mem_cgroup_usage_register_event,
4364 .unregister_event = mem_cgroup_usage_unregister_event, 4363 .unregister_event = mem_cgroup_usage_unregister_event,
4365 }, 4364 },
4366 { 4365 {
4367 .name = "memsw.max_usage_in_bytes", 4366 .name = "memsw.max_usage_in_bytes",
4368 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 4367 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4369 .trigger = mem_cgroup_reset, 4368 .trigger = mem_cgroup_reset,
4370 .read_u64 = mem_cgroup_read, 4369 .read_u64 = mem_cgroup_read,
4371 }, 4370 },
4372 { 4371 {
4373 .name = "memsw.limit_in_bytes", 4372 .name = "memsw.limit_in_bytes",
4374 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 4373 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4375 .write_string = mem_cgroup_write, 4374 .write_string = mem_cgroup_write,
4376 .read_u64 = mem_cgroup_read, 4375 .read_u64 = mem_cgroup_read,
4377 }, 4376 },
4378 { 4377 {
4379 .name = "memsw.failcnt", 4378 .name = "memsw.failcnt",
4380 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 4379 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4381 .trigger = mem_cgroup_reset, 4380 .trigger = mem_cgroup_reset,
4382 .read_u64 = mem_cgroup_read, 4381 .read_u64 = mem_cgroup_read,
4383 }, 4382 },
4384 }; 4383 };
4385 4384
4386 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4385 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4387 { 4386 {
4388 if (!do_swap_account) 4387 if (!do_swap_account)
4389 return 0; 4388 return 0;
4390 return cgroup_add_files(cont, ss, memsw_cgroup_files, 4389 return cgroup_add_files(cont, ss, memsw_cgroup_files,
4391 ARRAY_SIZE(memsw_cgroup_files)); 4390 ARRAY_SIZE(memsw_cgroup_files));
4392 }; 4391 };
4393 #else 4392 #else
4394 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4393 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4395 { 4394 {
4396 return 0; 4395 return 0;
4397 } 4396 }
4398 #endif 4397 #endif
4399 4398
4400 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4399 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4401 { 4400 {
4402 struct mem_cgroup_per_node *pn; 4401 struct mem_cgroup_per_node *pn;
4403 struct mem_cgroup_per_zone *mz; 4402 struct mem_cgroup_per_zone *mz;
4404 enum lru_list l; 4403 enum lru_list l;
4405 int zone, tmp = node; 4404 int zone, tmp = node;
4406 /* 4405 /*
4407 * This routine is called against possible nodes. 4406 * This routine is called against possible nodes.
4408 * But it's BUG to call kmalloc() against offline node. 4407 * But it's BUG to call kmalloc() against offline node.
4409 * 4408 *
4410 * TODO: this routine can waste much memory for nodes which will 4409 * TODO: this routine can waste much memory for nodes which will
4411 * never be onlined. It's better to use memory hotplug callback 4410 * never be onlined. It's better to use memory hotplug callback
4412 * function. 4411 * function.
4413 */ 4412 */
4414 if (!node_state(node, N_NORMAL_MEMORY)) 4413 if (!node_state(node, N_NORMAL_MEMORY))
4415 tmp = -1; 4414 tmp = -1;
4416 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4415 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4417 if (!pn) 4416 if (!pn)
4418 return 1; 4417 return 1;
4419 4418
4420 mem->info.nodeinfo[node] = pn; 4419 mem->info.nodeinfo[node] = pn;
4421 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4420 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4422 mz = &pn->zoneinfo[zone]; 4421 mz = &pn->zoneinfo[zone];
4423 for_each_lru(l) 4422 for_each_lru(l)
4424 INIT_LIST_HEAD(&mz->lists[l]); 4423 INIT_LIST_HEAD(&mz->lists[l]);
4425 mz->usage_in_excess = 0; 4424 mz->usage_in_excess = 0;
4426 mz->on_tree = false; 4425 mz->on_tree = false;
4427 mz->mem = mem; 4426 mz->mem = mem;
4428 } 4427 }
4429 return 0; 4428 return 0;
4430 } 4429 }
4431 4430
4432 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4431 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4433 { 4432 {
4434 kfree(mem->info.nodeinfo[node]); 4433 kfree(mem->info.nodeinfo[node]);
4435 } 4434 }
4436 4435
4437 static struct mem_cgroup *mem_cgroup_alloc(void) 4436 static struct mem_cgroup *mem_cgroup_alloc(void)
4438 { 4437 {
4439 struct mem_cgroup *mem; 4438 struct mem_cgroup *mem;
4440 int size = sizeof(struct mem_cgroup); 4439 int size = sizeof(struct mem_cgroup);
4441 4440
4442 /* Can be very big if MAX_NUMNODES is very big */ 4441 /* Can be very big if MAX_NUMNODES is very big */
4443 if (size < PAGE_SIZE) 4442 if (size < PAGE_SIZE)
4444 mem = kzalloc(size, GFP_KERNEL); 4443 mem = kzalloc(size, GFP_KERNEL);
4445 else 4444 else
4446 mem = vzalloc(size); 4445 mem = vzalloc(size);
4447 4446
4448 if (!mem) 4447 if (!mem)
4449 return NULL; 4448 return NULL;
4450 4449
4451 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4450 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4452 if (!mem->stat) 4451 if (!mem->stat)
4453 goto out_free; 4452 goto out_free;
4454 spin_lock_init(&mem->pcp_counter_lock); 4453 spin_lock_init(&mem->pcp_counter_lock);
4455 return mem; 4454 return mem;
4456 4455
4457 out_free: 4456 out_free:
4458 if (size < PAGE_SIZE) 4457 if (size < PAGE_SIZE)
4459 kfree(mem); 4458 kfree(mem);
4460 else 4459 else
4461 vfree(mem); 4460 vfree(mem);
4462 return NULL; 4461 return NULL;
4463 } 4462 }
4464 4463
4465 /* 4464 /*
4466 * At destroying mem_cgroup, references from swap_cgroup can remain. 4465 * At destroying mem_cgroup, references from swap_cgroup can remain.
4467 * (scanning all at force_empty is too costly...) 4466 * (scanning all at force_empty is too costly...)
4468 * 4467 *
4469 * Instead of clearing all references at force_empty, we remember 4468 * Instead of clearing all references at force_empty, we remember
4470 * the number of reference from swap_cgroup and free mem_cgroup when 4469 * the number of reference from swap_cgroup and free mem_cgroup when
4471 * it goes down to 0. 4470 * it goes down to 0.
4472 * 4471 *
4473 * Removal of cgroup itself succeeds regardless of refs from swap. 4472 * Removal of cgroup itself succeeds regardless of refs from swap.
4474 */ 4473 */
4475 4474
4476 static void __mem_cgroup_free(struct mem_cgroup *mem) 4475 static void __mem_cgroup_free(struct mem_cgroup *mem)
4477 { 4476 {
4478 int node; 4477 int node;
4479 4478
4480 mem_cgroup_remove_from_trees(mem); 4479 mem_cgroup_remove_from_trees(mem);
4481 free_css_id(&mem_cgroup_subsys, &mem->css); 4480 free_css_id(&mem_cgroup_subsys, &mem->css);
4482 4481
4483 for_each_node_state(node, N_POSSIBLE) 4482 for_each_node_state(node, N_POSSIBLE)
4484 free_mem_cgroup_per_zone_info(mem, node); 4483 free_mem_cgroup_per_zone_info(mem, node);
4485 4484
4486 free_percpu(mem->stat); 4485 free_percpu(mem->stat);
4487 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4486 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
4488 kfree(mem); 4487 kfree(mem);
4489 else 4488 else
4490 vfree(mem); 4489 vfree(mem);
4491 } 4490 }
4492 4491
4493 static void mem_cgroup_get(struct mem_cgroup *mem) 4492 static void mem_cgroup_get(struct mem_cgroup *mem)
4494 { 4493 {
4495 atomic_inc(&mem->refcnt); 4494 atomic_inc(&mem->refcnt);
4496 } 4495 }
4497 4496
4498 static void __mem_cgroup_put(struct mem_cgroup *mem, int count) 4497 static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
4499 { 4498 {
4500 if (atomic_sub_and_test(count, &mem->refcnt)) { 4499 if (atomic_sub_and_test(count, &mem->refcnt)) {
4501 struct mem_cgroup *parent = parent_mem_cgroup(mem); 4500 struct mem_cgroup *parent = parent_mem_cgroup(mem);
4502 __mem_cgroup_free(mem); 4501 __mem_cgroup_free(mem);
4503 if (parent) 4502 if (parent)
4504 mem_cgroup_put(parent); 4503 mem_cgroup_put(parent);
4505 } 4504 }
4506 } 4505 }
4507 4506
4508 static void mem_cgroup_put(struct mem_cgroup *mem) 4507 static void mem_cgroup_put(struct mem_cgroup *mem)
4509 { 4508 {
4510 __mem_cgroup_put(mem, 1); 4509 __mem_cgroup_put(mem, 1);
4511 } 4510 }
4512 4511
4513 /* 4512 /*
4514 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 4513 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
4515 */ 4514 */
4516 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) 4515 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
4517 { 4516 {
4518 if (!mem->res.parent) 4517 if (!mem->res.parent)
4519 return NULL; 4518 return NULL;
4520 return mem_cgroup_from_res_counter(mem->res.parent, res); 4519 return mem_cgroup_from_res_counter(mem->res.parent, res);
4521 } 4520 }
4522 4521
4523 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4522 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4524 static void __init enable_swap_cgroup(void) 4523 static void __init enable_swap_cgroup(void)
4525 { 4524 {
4526 if (!mem_cgroup_disabled() && really_do_swap_account) 4525 if (!mem_cgroup_disabled() && really_do_swap_account)
4527 do_swap_account = 1; 4526 do_swap_account = 1;
4528 } 4527 }
4529 #else 4528 #else
4530 static void __init enable_swap_cgroup(void) 4529 static void __init enable_swap_cgroup(void)
4531 { 4530 {
4532 } 4531 }
4533 #endif 4532 #endif
4534 4533
4535 static int mem_cgroup_soft_limit_tree_init(void) 4534 static int mem_cgroup_soft_limit_tree_init(void)
4536 { 4535 {
4537 struct mem_cgroup_tree_per_node *rtpn; 4536 struct mem_cgroup_tree_per_node *rtpn;
4538 struct mem_cgroup_tree_per_zone *rtpz; 4537 struct mem_cgroup_tree_per_zone *rtpz;
4539 int tmp, node, zone; 4538 int tmp, node, zone;
4540 4539
4541 for_each_node_state(node, N_POSSIBLE) { 4540 for_each_node_state(node, N_POSSIBLE) {
4542 tmp = node; 4541 tmp = node;
4543 if (!node_state(node, N_NORMAL_MEMORY)) 4542 if (!node_state(node, N_NORMAL_MEMORY))
4544 tmp = -1; 4543 tmp = -1;
4545 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 4544 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4546 if (!rtpn) 4545 if (!rtpn)
4547 return 1; 4546 return 1;
4548 4547
4549 soft_limit_tree.rb_tree_per_node[node] = rtpn; 4548 soft_limit_tree.rb_tree_per_node[node] = rtpn;
4550 4549
4551 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4550 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4552 rtpz = &rtpn->rb_tree_per_zone[zone]; 4551 rtpz = &rtpn->rb_tree_per_zone[zone];
4553 rtpz->rb_root = RB_ROOT; 4552 rtpz->rb_root = RB_ROOT;
4554 spin_lock_init(&rtpz->lock); 4553 spin_lock_init(&rtpz->lock);
4555 } 4554 }
4556 } 4555 }
4557 return 0; 4556 return 0;
4558 } 4557 }
4559 4558
4560 static struct cgroup_subsys_state * __ref 4559 static struct cgroup_subsys_state * __ref
4561 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 4560 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4562 { 4561 {
4563 struct mem_cgroup *mem, *parent; 4562 struct mem_cgroup *mem, *parent;
4564 long error = -ENOMEM; 4563 long error = -ENOMEM;
4565 int node; 4564 int node;
4566 4565
4567 mem = mem_cgroup_alloc(); 4566 mem = mem_cgroup_alloc();
4568 if (!mem) 4567 if (!mem)
4569 return ERR_PTR(error); 4568 return ERR_PTR(error);
4570 4569
4571 for_each_node_state(node, N_POSSIBLE) 4570 for_each_node_state(node, N_POSSIBLE)
4572 if (alloc_mem_cgroup_per_zone_info(mem, node)) 4571 if (alloc_mem_cgroup_per_zone_info(mem, node))
4573 goto free_out; 4572 goto free_out;
4574 4573
4575 /* root ? */ 4574 /* root ? */
4576 if (cont->parent == NULL) { 4575 if (cont->parent == NULL) {
4577 int cpu; 4576 int cpu;
4578 enable_swap_cgroup(); 4577 enable_swap_cgroup();
4579 parent = NULL; 4578 parent = NULL;
4580 root_mem_cgroup = mem; 4579 root_mem_cgroup = mem;
4581 if (mem_cgroup_soft_limit_tree_init()) 4580 if (mem_cgroup_soft_limit_tree_init())
4582 goto free_out; 4581 goto free_out;
4583 for_each_possible_cpu(cpu) { 4582 for_each_possible_cpu(cpu) {
4584 struct memcg_stock_pcp *stock = 4583 struct memcg_stock_pcp *stock =
4585 &per_cpu(memcg_stock, cpu); 4584 &per_cpu(memcg_stock, cpu);
4586 INIT_WORK(&stock->work, drain_local_stock); 4585 INIT_WORK(&stock->work, drain_local_stock);
4587 } 4586 }
4588 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 4587 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
4589 } else { 4588 } else {
4590 parent = mem_cgroup_from_cont(cont->parent); 4589 parent = mem_cgroup_from_cont(cont->parent);
4591 mem->use_hierarchy = parent->use_hierarchy; 4590 mem->use_hierarchy = parent->use_hierarchy;
4592 mem->oom_kill_disable = parent->oom_kill_disable; 4591 mem->oom_kill_disable = parent->oom_kill_disable;
4593 } 4592 }
4594 4593
4595 if (parent && parent->use_hierarchy) { 4594 if (parent && parent->use_hierarchy) {
4596 res_counter_init(&mem->res, &parent->res); 4595 res_counter_init(&mem->res, &parent->res);
4597 res_counter_init(&mem->memsw, &parent->memsw); 4596 res_counter_init(&mem->memsw, &parent->memsw);
4598 /* 4597 /*
4599 * We increment refcnt of the parent to ensure that we can 4598 * We increment refcnt of the parent to ensure that we can
4600 * safely access it on res_counter_charge/uncharge. 4599 * safely access it on res_counter_charge/uncharge.
4601 * This refcnt will be decremented when freeing this 4600 * This refcnt will be decremented when freeing this
4602 * mem_cgroup(see mem_cgroup_put). 4601 * mem_cgroup(see mem_cgroup_put).
4603 */ 4602 */
4604 mem_cgroup_get(parent); 4603 mem_cgroup_get(parent);
4605 } else { 4604 } else {
4606 res_counter_init(&mem->res, NULL); 4605 res_counter_init(&mem->res, NULL);
4607 res_counter_init(&mem->memsw, NULL); 4606 res_counter_init(&mem->memsw, NULL);
4608 } 4607 }
4609 mem->last_scanned_child = 0; 4608 mem->last_scanned_child = 0;
4610 INIT_LIST_HEAD(&mem->oom_notify); 4609 INIT_LIST_HEAD(&mem->oom_notify);
4611 4610
4612 if (parent) 4611 if (parent)
4613 mem->swappiness = get_swappiness(parent); 4612 mem->swappiness = get_swappiness(parent);
4614 atomic_set(&mem->refcnt, 1); 4613 atomic_set(&mem->refcnt, 1);
4615 mem->move_charge_at_immigrate = 0; 4614 mem->move_charge_at_immigrate = 0;
4616 mutex_init(&mem->thresholds_lock); 4615 mutex_init(&mem->thresholds_lock);
4617 return &mem->css; 4616 return &mem->css;
4618 free_out: 4617 free_out:
4619 __mem_cgroup_free(mem); 4618 __mem_cgroup_free(mem);
4620 root_mem_cgroup = NULL; 4619 root_mem_cgroup = NULL;
4621 return ERR_PTR(error); 4620 return ERR_PTR(error);
4622 } 4621 }
4623 4622
4624 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 4623 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
4625 struct cgroup *cont) 4624 struct cgroup *cont)
4626 { 4625 {
4627 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4626 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4628 4627
4629 return mem_cgroup_force_empty(mem, false); 4628 return mem_cgroup_force_empty(mem, false);
4630 } 4629 }
4631 4630
4632 static void mem_cgroup_destroy(struct cgroup_subsys *ss, 4631 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
4633 struct cgroup *cont) 4632 struct cgroup *cont)
4634 { 4633 {
4635 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4634 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4636 4635
4637 mem_cgroup_put(mem); 4636 mem_cgroup_put(mem);
4638 } 4637 }
4639 4638
4640 static int mem_cgroup_populate(struct cgroup_subsys *ss, 4639 static int mem_cgroup_populate(struct cgroup_subsys *ss,
4641 struct cgroup *cont) 4640 struct cgroup *cont)
4642 { 4641 {
4643 int ret; 4642 int ret;
4644 4643
4645 ret = cgroup_add_files(cont, ss, mem_cgroup_files, 4644 ret = cgroup_add_files(cont, ss, mem_cgroup_files,
4646 ARRAY_SIZE(mem_cgroup_files)); 4645 ARRAY_SIZE(mem_cgroup_files));
4647 4646
4648 if (!ret) 4647 if (!ret)
4649 ret = register_memsw_files(cont, ss); 4648 ret = register_memsw_files(cont, ss);
4650 return ret; 4649 return ret;
4651 } 4650 }
4652 4651
4653 #ifdef CONFIG_MMU 4652 #ifdef CONFIG_MMU
4654 /* Handlers for move charge at task migration. */ 4653 /* Handlers for move charge at task migration. */
4655 #define PRECHARGE_COUNT_AT_ONCE 256 4654 #define PRECHARGE_COUNT_AT_ONCE 256
4656 static int mem_cgroup_do_precharge(unsigned long count) 4655 static int mem_cgroup_do_precharge(unsigned long count)
4657 { 4656 {
4658 int ret = 0; 4657 int ret = 0;
4659 int batch_count = PRECHARGE_COUNT_AT_ONCE; 4658 int batch_count = PRECHARGE_COUNT_AT_ONCE;
4660 struct mem_cgroup *mem = mc.to; 4659 struct mem_cgroup *mem = mc.to;
4661 4660
4662 if (mem_cgroup_is_root(mem)) { 4661 if (mem_cgroup_is_root(mem)) {
4663 mc.precharge += count; 4662 mc.precharge += count;
4664 /* we don't need css_get for root */ 4663 /* we don't need css_get for root */
4665 return ret; 4664 return ret;
4666 } 4665 }
4667 /* try to charge at once */ 4666 /* try to charge at once */
4668 if (count > 1) { 4667 if (count > 1) {
4669 struct res_counter *dummy; 4668 struct res_counter *dummy;
4670 /* 4669 /*
4671 * "mem" cannot be under rmdir() because we've already checked 4670 * "mem" cannot be under rmdir() because we've already checked
4672 * by cgroup_lock_live_cgroup() that it is not removed and we 4671 * by cgroup_lock_live_cgroup() that it is not removed and we
4673 * are still under the same cgroup_mutex. So we can postpone 4672 * are still under the same cgroup_mutex. So we can postpone
4674 * css_get(). 4673 * css_get().
4675 */ 4674 */
4676 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) 4675 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
4677 goto one_by_one; 4676 goto one_by_one;
4678 if (do_swap_account && res_counter_charge(&mem->memsw, 4677 if (do_swap_account && res_counter_charge(&mem->memsw,
4679 PAGE_SIZE * count, &dummy)) { 4678 PAGE_SIZE * count, &dummy)) {
4680 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 4679 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
4681 goto one_by_one; 4680 goto one_by_one;
4682 } 4681 }
4683 mc.precharge += count; 4682 mc.precharge += count;
4684 return ret; 4683 return ret;
4685 } 4684 }
4686 one_by_one: 4685 one_by_one:
4687 /* fall back to one by one charge */ 4686 /* fall back to one by one charge */
4688 while (count--) { 4687 while (count--) {
4689 if (signal_pending(current)) { 4688 if (signal_pending(current)) {
4690 ret = -EINTR; 4689 ret = -EINTR;
4691 break; 4690 break;
4692 } 4691 }
4693 if (!batch_count--) { 4692 if (!batch_count--) {
4694 batch_count = PRECHARGE_COUNT_AT_ONCE; 4693 batch_count = PRECHARGE_COUNT_AT_ONCE;
4695 cond_resched(); 4694 cond_resched();
4696 } 4695 }
4697 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); 4696 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false);
4698 if (ret || !mem) 4697 if (ret || !mem)
4699 /* mem_cgroup_clear_mc() will do uncharge later */ 4698 /* mem_cgroup_clear_mc() will do uncharge later */
4700 return -ENOMEM; 4699 return -ENOMEM;
4701 mc.precharge++; 4700 mc.precharge++;
4702 } 4701 }
4703 return ret; 4702 return ret;
4704 } 4703 }
4705 4704
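mem_cgroup_do_precharge() above tries one bulk res_counter charge and only then degrades to charging page by page, yielding every PRECHARGE_COUNT_AT_ONCE iterations. The sketch below models that shape in plain C; the counter stands in for res_counter, sched_yield() for cond_resched(), the signal_pending() check is left out, and all names are illustrative.

/*
 * Shape of the precharge fallback above, modelled in plain C: one bulk
 * reservation first, then item-by-item with periodic yielding. The counter
 * stands in for res_counter, sched_yield() for cond_resched(), and the
 * signal_pending() check is left out. All names are illustrative.
 */
#include <stdio.h>
#include <stdbool.h>
#include <sched.h>

#define BATCH_RESCHED 256

static long limit = 1000, used;

static bool reserve(long amount)
{
        if (used + amount > limit)
                return false;
        used += amount;
        return true;
}

static long precharge(long count)
{
        long done = 0;
        int batch = BATCH_RESCHED;

        if (reserve(count))
                return count;           /* bulk charge succeeded */

        while (count--) {               /* fall back to one-by-one */
                if (!batch--) {
                        batch = BATCH_RESCHED;
                        sched_yield();  /* cond_resched() stand-in */
                }
                if (!reserve(1))
                        break;
                done++;
        }
        return done;
}

int main(void)
{
        printf("reserved %ld of 900\n", precharge(900)); /* bulk path */
        printf("reserved %ld of 900\n", precharge(900)); /* partial fallback */
        return 0;
}
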
4706 /** 4705 /**
4707 * is_target_pte_for_mc - check a pte whether it is valid for move charge 4706 * is_target_pte_for_mc - check a pte whether it is valid for move charge
4708 * @vma: the vma to which the pte to be checked belongs 4707 * @vma: the vma to which the pte to be checked belongs
4709 * @addr: the address corresponding to the pte to be checked 4708 * @addr: the address corresponding to the pte to be checked
4710 * @ptent: the pte to be checked 4709 * @ptent: the pte to be checked
4711 * @target: the pointer where the target page or swap entry will be stored (can be NULL) 4710 * @target: the pointer where the target page or swap entry will be stored (can be NULL)
4712 * 4711 *
4713 * Returns 4712 * Returns
4714 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 4713 * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
4715 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 4714 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
4716 * move charge. if @target is not NULL, the page is stored in target->page 4715 * move charge. if @target is not NULL, the page is stored in target->page
4717 * with an extra refcount taken (callers should handle it). 4716 * with an extra refcount taken (callers should handle it).
4718 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 4717 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
4719 * target for charge migration. if @target is not NULL, the entry is stored 4718 * target for charge migration. if @target is not NULL, the entry is stored
4720 * in target->ent. 4719 * in target->ent.
4721 * 4720 *
4722 * Called with pte lock held. 4721 * Called with pte lock held.
4723 */ 4722 */
4724 union mc_target { 4723 union mc_target {
4725 struct page *page; 4724 struct page *page;
4726 swp_entry_t ent; 4725 swp_entry_t ent;
4727 }; 4726 };
4728 4727
4729 enum mc_target_type { 4728 enum mc_target_type {
4730 MC_TARGET_NONE, /* not used */ 4729 MC_TARGET_NONE, /* not used */
4731 MC_TARGET_PAGE, 4730 MC_TARGET_PAGE,
4732 MC_TARGET_SWAP, 4731 MC_TARGET_SWAP,
4733 }; 4732 };
4734 4733
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
						unsigned long addr, pte_t ptent)
{
	struct page *page = vm_normal_page(vma, addr, ptent);

	if (!page || !page_mapped(page))
		return NULL;
	if (PageAnon(page)) {
		/* we don't move shared anon */
		if (!move_anon() || page_mapcount(page) > 2)
			return NULL;
	} else if (!move_file())
		/* we ignore mapcount for file pages */
		return NULL;
	if (!get_page_unless_zero(page))
		return NULL;

	return page;
}

static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
			unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
	int usage_count;
	struct page *page = NULL;
	swp_entry_t ent = pte_to_swp_entry(ptent);

	if (!move_anon() || non_swap_entry(ent))
		return NULL;
	usage_count = mem_cgroup_count_swap_user(ent, &page);
	if (usage_count > 1) {	/* we don't move shared anon */
		if (page)
			put_page(page);
		return NULL;
	}
	if (do_swap_account)
		entry->val = ent.val;

	return page;
}

static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
			unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
	struct page *page = NULL;
	struct inode *inode;
	struct address_space *mapping;
	pgoff_t pgoff;

	if (!vma->vm_file) /* anonymous vma */
		return NULL;
	if (!move_file())
		return NULL;

	inode = vma->vm_file->f_path.dentry->d_inode;
	mapping = vma->vm_file->f_mapping;
	if (pte_none(ptent))
		pgoff = linear_page_index(vma, addr);
	else /* pte_file(ptent) is true */
		pgoff = pte_to_pgoff(ptent);

	/* page is moved even if it's not RSS of this task(page-faulted). */
	if (!mapping_cap_swap_backed(mapping)) { /* normal file */
		page = find_get_page(mapping, pgoff);
	} else { /* shmem/tmpfs file. we should take account of swap too. */
		swp_entry_t ent;
		mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
		if (do_swap_account)
			entry->val = ent.val;
	}

	return page;
}

static int is_target_pte_for_mc(struct vm_area_struct *vma,
		unsigned long addr, pte_t ptent, union mc_target *target)
{
	struct page *page = NULL;
	struct page_cgroup *pc;
	int ret = 0;
	swp_entry_t ent = { .val = 0 };

	if (pte_present(ptent))
		page = mc_handle_present_pte(vma, addr, ptent);
	else if (is_swap_pte(ptent))
		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
	else if (pte_none(ptent) || pte_file(ptent))
		page = mc_handle_file_pte(vma, addr, ptent, &ent);

	if (!page && !ent.val)
		return 0;
	if (page) {
		pc = lookup_page_cgroup(page);
		/*
		 * Do only loose check w/o page_cgroup lock.
		 * mem_cgroup_move_account() checks the pc is valid or not under
		 * the lock.
		 */
		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
			ret = MC_TARGET_PAGE;
			if (target)
				target->page = page;
		}
		if (!ret || !target)
			put_page(page);
	}
	/* There is a swap entry and a page doesn't exist or isn't charged */
	if (ent.val && !ret &&
			css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
		ret = MC_TARGET_SWAP;
		if (target)
			target->ent = ent;
	}
	return ret;
}

static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
					unsigned long addr, unsigned long end,
					struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	pte_t *pte;
	spinlock_t *ptl;

	split_huge_page_pmd(walk->mm, pmd);

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE)
		if (is_target_pte_for_mc(vma, addr, *pte, NULL))
			mc.precharge++;	/* increment precharge temporarily */
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	return 0;
}

static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
	unsigned long precharge;
	struct vm_area_struct *vma;

	down_read(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		struct mm_walk mem_cgroup_count_precharge_walk = {
			.pmd_entry = mem_cgroup_count_precharge_pte_range,
			.mm = mm,
			.private = vma,
		};
		if (is_vm_hugetlb_page(vma))
			continue;
		walk_page_range(vma->vm_start, vma->vm_end,
					&mem_cgroup_count_precharge_walk);
	}
	up_read(&mm->mmap_sem);

	precharge = mc.precharge;
	mc.precharge = 0;

	return precharge;
}

static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
	unsigned long precharge = mem_cgroup_count_precharge(mm);

	VM_BUG_ON(mc.moving_task);
	mc.moving_task = current;
	return mem_cgroup_do_precharge(precharge);
}

/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
static void __mem_cgroup_clear_mc(void)
{
	struct mem_cgroup *from = mc.from;
	struct mem_cgroup *to = mc.to;

	/* we must uncharge all the leftover precharges from mc.to */
	if (mc.precharge) {
		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
		mc.precharge = 0;
	}
	/*
	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
	 * we must uncharge here.
	 */
	if (mc.moved_charge) {
		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
		mc.moved_charge = 0;
	}
	/* we must fixup refcnts and charges */
	if (mc.moved_swap) {
		/* uncharge swap account from the old cgroup */
		if (!mem_cgroup_is_root(mc.from))
			res_counter_uncharge(&mc.from->memsw,
						PAGE_SIZE * mc.moved_swap);
		__mem_cgroup_put(mc.from, mc.moved_swap);

		if (!mem_cgroup_is_root(mc.to)) {
			/*
			 * we charged both to->res and to->memsw, so we should
			 * uncharge to->res.
			 */
			res_counter_uncharge(&mc.to->res,
						PAGE_SIZE * mc.moved_swap);
		}
		/* we've already done mem_cgroup_get(mc.to) */
		mc.moved_swap = 0;
	}
	memcg_oom_recover(from);
	memcg_oom_recover(to);
	wake_up_all(&mc.waitq);
}

static void mem_cgroup_clear_mc(void)
{
	struct mem_cgroup *from = mc.from;

	/*
	 * we must clear moving_task before waking up waiters at the end of
	 * task migration.
	 */
	mc.moving_task = NULL;
	__mem_cgroup_clear_mc();
	spin_lock(&mc.lock);
	mc.from = NULL;
	mc.to = NULL;
	spin_unlock(&mc.lock);
	mem_cgroup_end_move(from);
}

static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
				struct cgroup *cgroup,
				struct task_struct *p)
{
	int ret = 0;
	struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);

	if (mem->move_charge_at_immigrate) {
		struct mm_struct *mm;
		struct mem_cgroup *from = mem_cgroup_from_task(p);

		VM_BUG_ON(from == mem);

		mm = get_task_mm(p);
		if (!mm)
			return 0;
		/* We move charges only when we move the owner of the mm */
		if (mm->owner == p) {
			VM_BUG_ON(mc.from);
			VM_BUG_ON(mc.to);
			VM_BUG_ON(mc.precharge);
			VM_BUG_ON(mc.moved_charge);
			VM_BUG_ON(mc.moved_swap);
			mem_cgroup_start_move(from);
			spin_lock(&mc.lock);
			mc.from = from;
			mc.to = mem;
			spin_unlock(&mc.lock);
			/* We set mc.moving_task later */

			ret = mem_cgroup_precharge_mc(mm);
			if (ret)
				mem_cgroup_clear_mc();
		}
		mmput(mm);
	}
	return ret;
}

static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
				struct cgroup *cgroup,
				struct task_struct *p)
{
	mem_cgroup_clear_mc();
}

static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	int ret = 0;
	struct vm_area_struct *vma = walk->private;
	pte_t *pte;
	spinlock_t *ptl;

	split_huge_page_pmd(walk->mm, pmd);
retry:
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; addr += PAGE_SIZE) {
		pte_t ptent = *(pte++);
		union mc_target target;
		int type;
		struct page *page;
		struct page_cgroup *pc;
		swp_entry_t ent;

		if (!mc.precharge)
			break;

		type = is_target_pte_for_mc(vma, addr, ptent, &target);
		switch (type) {
		case MC_TARGET_PAGE:
			page = target.page;
			if (isolate_lru_page(page))
				goto put;
			pc = lookup_page_cgroup(page);
			if (!mem_cgroup_move_account(page, 1, pc,
						     mc.from, mc.to, false)) {
				mc.precharge--;
				/* we uncharge from mc.from later. */
				mc.moved_charge++;
			}
			putback_lru_page(page);
put:			/* is_target_pte_for_mc() gets the page */
			put_page(page);
			break;
		case MC_TARGET_SWAP:
			ent = target.ent;
			if (!mem_cgroup_move_swap_account(ent,
						mc.from, mc.to, false)) {
				mc.precharge--;
				/* we fixup refcnts and charges later. */
				mc.moved_swap++;
			}
			break;
		default:
			break;
		}
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (addr != end) {
		/*
		 * We have consumed all precharges we got in can_attach().
		 * We try charge one by one, but don't do any additional
		 * charges to mc.to if we have failed in charge once in attach()
		 * phase.
		 */
		ret = mem_cgroup_do_precharge(1);
		if (!ret)
			goto retry;
	}

	return ret;
}

static void mem_cgroup_move_charge(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	lru_add_drain_all();
retry:
	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		/*
		 * Someone holding the mmap_sem may be waiting in the waitq.
		 * So we cancel all extra charges, wake up all waiters, and
		 * retry. Because we cancel precharges, we might not be able
		 * to move enough charges, but moving charge is a best-effort
		 * feature anyway, so it wouldn't be a big problem.
		 */
		__mem_cgroup_clear_mc();
		cond_resched();
		goto retry;
	}
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		int ret;
		struct mm_walk mem_cgroup_move_charge_walk = {
			.pmd_entry = mem_cgroup_move_charge_pte_range,
			.mm = mm,
			.private = vma,
		};
		if (is_vm_hugetlb_page(vma))
			continue;
		ret = walk_page_range(vma->vm_start, vma->vm_end,
						&mem_cgroup_move_charge_walk);
		if (ret)
			/*
			 * this means we have consumed all precharges and failed
			 * to do additional charge. Just abandon here.
			 */
			break;
	}
	up_read(&mm->mmap_sem);
}

static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p)
{
	struct mm_struct *mm;

	if (!mc.to)
		/* no need to move charge */
		return;

	mm = get_task_mm(p);
	if (mm) {
		mem_cgroup_move_charge(mm);
		mmput(mm);
	}
	mem_cgroup_clear_mc();
}
#else	/* !CONFIG_MMU */
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
				struct cgroup *cgroup,
				struct task_struct *p)
{
	return 0;
}
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
				struct cgroup *cgroup,
				struct task_struct *p)
{
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p)
{
}
#endif

struct cgroup_subsys mem_cgroup_subsys = {
	.name = "memory",
	.subsys_id = mem_cgroup_subsys_id,
	.create = mem_cgroup_create,
	.pre_destroy = mem_cgroup_pre_destroy,
	.destroy = mem_cgroup_destroy,
	.populate = mem_cgroup_populate,
	.can_attach = mem_cgroup_can_attach,
	.cancel_attach = mem_cgroup_cancel_attach,
	.attach = mem_cgroup_move_task,
	.early_init = 0,
	.use_id = 1,
};

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static int __init enable_swap_account(char *s)
{
	/* consider enabled if no parameter or 1 is given */
	if (!strcmp(s, "1"))
		really_do_swap_account = 1;
	else if (!strcmp(s, "0"))
		really_do_swap_account = 0;
	return 1;
}
__setup("swapaccount=", enable_swap_account);

#endif
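Because enable_swap_account() is registered through __setup(), swap accounting can be toggled from the kernel command line when CONFIG_CGROUP_MEM_RES_CTLR_SWAP is built in: booting with swapaccount=0 disables it, and swapaccount=1 enables it.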
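The move-charge handlers wired into mem_cgroup_subsys above (.can_attach, .cancel_attach, .attach) are driven entirely through the cgroup filesystem. The following is a minimal userspace sketch of one way to exercise that path; it is illustrative only, and the mount point /cgroup/memory and the group name B are assumptions about the local setup, not anything defined by this file.

	/*
	 * Illustrative sketch only (not part of memcontrol.c).
	 * Assumes the memory controller is mounted at /cgroup/memory
	 * and that a child group "B" already exists.
	 */
	#include <stdio.h>
	#include <unistd.h>

	/* write a short string to a cgroup control file; 0 on success */
	static int write_str(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fprintf(f, "%s", val);
		return fclose(f);
	}

	int main(void)
	{
		char pid[32];

		/* let group B take over anon charges of tasks migrated into it */
		if (write_str("/cgroup/memory/B/memory.move_charge_at_immigrate", "1"))
			return 1;

		/*
		 * Moving this task into B triggers mem_cgroup_can_attach()
		 * (precharge) and mem_cgroup_move_task() (the actual move).
		 */
		snprintf(pid, sizeof(pid), "%d", getpid());
		return write_str("/cgroup/memory/B/tasks", pid) ? 1 : 0;
	}

Writing 1 to memory.move_charge_at_immigrate requests that anonymous-page charges follow the mm owner when it migrates; writing 3 would also include file-page charges.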