Doug / smarc-fsl-linux-kernel

1

/* memcontrol.c - Memory Controller

1

/* memcontrol.c - Memory Controller

2

*

2

*

3

* Copyright IBM Corporation, 2007

3

* Copyright IBM Corporation, 2007

4

* Author Balbir Singh <balbir@linux.vnet.ibm.com>

4

* Author Balbir Singh <balbir@linux.vnet.ibm.com>

5

*

5

*

6

7

* Author: Pavel Emelianov <xemul@openvz.org>

7

* Author: Pavel Emelianov <xemul@openvz.org>

8

*

8

*

9

* Memory thresholds

9

* Memory thresholds

10

11

* Author: Kirill A. Shutemov

11

* Author: Kirill A. Shutemov

12

*

12

*

13

* This program is free software; you can redistribute it and/or modify

13

* This program is free software; you can redistribute it and/or modify

14

* it under the terms of the GNU General Public License as published by

14

* it under the terms of the GNU General Public License as published by

15

* the Free Software Foundation; either version 2 of the License, or

15

* the Free Software Foundation; either version 2 of the License, or

16

* (at your option) any later version.

16

* (at your option) any later version.

17

*

17

*

18

* This program is distributed in the hope that it will be useful,

18

* This program is distributed in the hope that it will be useful,

19

* but WITHOUT ANY WARRANTY; without even the implied warranty of

19

* but WITHOUT ANY WARRANTY; without even the implied warranty of

20

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

20

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

21

* GNU General Public License for more details.

21

* GNU General Public License for more details.

22

*/

22

*/

23

24

#include <linux/res_counter.h>

24

#include <linux/res_counter.h>

25

#include <linux/memcontrol.h>

25

#include <linux/memcontrol.h>

26

#include <linux/cgroup.h>

26

#include <linux/cgroup.h>

27

#include <linux/mm.h>

27

#include <linux/mm.h>

28

#include <linux/hugetlb.h>

28

#include <linux/hugetlb.h>

29

#include <linux/pagemap.h>

29

#include <linux/pagemap.h>

30

#include <linux/smp.h>

30

#include <linux/smp.h>

31

#include <linux/page-flags.h>

31

#include <linux/page-flags.h>

32

#include <linux/backing-dev.h>

32

#include <linux/backing-dev.h>

33

#include <linux/bit_spinlock.h>

33

#include <linux/bit_spinlock.h>

34

#include <linux/rcupdate.h>

34

#include <linux/rcupdate.h>

35

#include <linux/limits.h>

35

#include <linux/limits.h>

36

#include <linux/export.h>

36

#include <linux/export.h>

37

#include <linux/mutex.h>

37

#include <linux/mutex.h>

38

#include <linux/rbtree.h>

38

#include <linux/rbtree.h>

39

#include <linux/slab.h>

39

#include <linux/slab.h>

40

#include <linux/swap.h>

40

#include <linux/swap.h>

41

#include <linux/swapops.h>

41

#include <linux/swapops.h>

42

#include <linux/spinlock.h>

42

#include <linux/spinlock.h>

43

#include <linux/eventfd.h>

43

#include <linux/eventfd.h>

44

#include <linux/sort.h>

44

#include <linux/sort.h>

45

#include <linux/fs.h>

45

#include <linux/fs.h>

46

#include <linux/seq_file.h>

46

#include <linux/seq_file.h>

47

#include <linux/vmalloc.h>

47

#include <linux/vmalloc.h>

48

#include <linux/mm_inline.h>

48

#include <linux/mm_inline.h>

49

#include <linux/page_cgroup.h>

49

#include <linux/page_cgroup.h>

50

#include <linux/cpu.h>

50

#include <linux/cpu.h>

51

#include <linux/oom.h>

51

#include <linux/oom.h>

52

#include "internal.h"

52

#include "internal.h"

53

#include <net/sock.h>

53

#include <net/sock.h>

54

#include <net/tcp_memcontrol.h>

54

#include <net/tcp_memcontrol.h>

55

56

#include <asm/uaccess.h>

56

#include <asm/uaccess.h>

57

58

#include <trace/events/vmscan.h>

58

#include <trace/events/vmscan.h>

59

60

/* File-scope state shared by the rest of the memory controller. */
struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5
static struct mem_cgroup *root_mem_cgroup __read_mostly;

63

64

#ifdef CONFIG_MEMCG_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;

/* Remembers the boot option; the default depends on CONFIG_MEMCG_SWAP_ENABLED. */
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

#else
/* Without CONFIG_MEMCG_SWAP, do_swap_account is the compile-time constant 0. */
#define do_swap_account		0
#endif

78

79

80

/*
 * Statistics for memory cgroup.
 *
 * Indexes into mem_cgroup_stat_cpu.count[]; MEM_CGROUP_STAT_NSTATS is the
 * number of counters and must stay last.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
	MEM_CGROUP_STAT_NSTATS,
};

93

94

/* Names for the statistics counters, listed in enum mem_cgroup_stat_index order. */
static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"mapped_file",
	"swap",
};

100

101

/*
 * Indexes into mem_cgroup_stat_cpu.events[]; MEM_CGROUP_EVENTS_NSTATS is the
 * number of event counters and must stay last.
 */
enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
	MEM_CGROUP_EVENTS_NSTATS,
};

108

109

/* Names for the event counters, listed in enum mem_cgroup_events_index order. */
static const char * const mem_cgroup_events_names[] = {
	"pgpgin",
	"pgpgout",
	"pgfault",
	"pgmajfault",
};

115

116

/*
 * Per memcg event counter is incremented at every pagein/pageout. With THP,
 * it will be incremented by the number of pages. This counter is used
 * to trigger some periodic events. This is straightforward and better
 * than using jiffies etc. to handle periodic memcg event.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024

131

132

struct mem_cgroup_stat_cpu {

132

struct mem_cgroup_stat_cpu {

133

long count[MEM_CGROUP_STAT_NSTATS];

133

long count[MEM_CGROUP_STAT_NSTATS];

134

unsigned long events[MEM_CGROUP_EVENTS_NSTATS];

134

unsigned long events[MEM_CGROUP_EVENTS_NSTATS];

135

unsigned long nr_page_events;

135

unsigned long nr_page_events;

136

unsigned long targets[MEM_CGROUP_NTARGETS];

136

unsigned long targets[MEM_CGROUP_NTARGETS];

137

};

137

};

138

139

/* Reclaim iteration state; one per priority level in mem_cgroup_per_zone. */
struct mem_cgroup_reclaim_iter {
	/* css_id of the last scanned hierarchy member */
	int position;
	/* scan generation, increased every round-trip */
	unsigned int generation;
};

145

146

/*

146

/*

147

* per-zone information in memory controller.

147

* per-zone information in memory controller.

148

*/

148

*/

149

struct mem_cgroup_per_zone {

149

struct mem_cgroup_per_zone {

150

struct lruvec lruvec;

150

struct lruvec lruvec;

151

unsigned long lru_size[NR_LRU_LISTS];

151

unsigned long lru_size[NR_LRU_LISTS];

152

153

struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

153

struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

154

155

struct rb_node tree_node; /* RB tree node */

155

struct rb_node tree_node; /* RB tree node */

156

unsigned long long usage_in_excess;/* Set to the value by which */

156

unsigned long long usage_in_excess;/* Set to the value by which */

157

/* the soft limit is exceeded*/

157

/* the soft limit is exceeded*/

158

bool on_tree;

158

bool on_tree;

159

struct mem_cgroup *memcg; /* Back pointer, we cannot */

159

struct mem_cgroup *memcg; /* Back pointer, we cannot */

160

/* use container_of */

160

/* use container_of */

161

};

161

};

162

163

struct mem_cgroup_per_node {

163

struct mem_cgroup_per_node {

164

struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];

164

struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];

165

};

165

};

166

167

struct mem_cgroup_lru_info {

167

struct mem_cgroup_lru_info {

168

struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];

168

struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];

169

};

169

};

170

171

/*

171

/*

172

* Cgroups above their limits are maintained in a RB-Tree, independent of

172

* Cgroups above their limits are maintained in a RB-Tree, independent of

173

* their hierarchy representation

173

* their hierarchy representation

174

*/

174

*/

175

176

struct mem_cgroup_tree_per_zone {

176

struct mem_cgroup_tree_per_zone {

177

struct rb_root rb_root;

177

struct rb_root rb_root;

178

spinlock_t lock;

178

spinlock_t lock;

179

};

179

};

180

181

struct mem_cgroup_tree_per_node {

181

struct mem_cgroup_tree_per_node {

182

struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];

182

struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];

183

};

183

};

184

185

struct mem_cgroup_tree {

185

struct mem_cgroup_tree {

186

struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];

186

struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];

187

};

187

};

188

189

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

189

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

190

191

struct mem_cgroup_threshold {

191

struct mem_cgroup_threshold {

192

struct eventfd_ctx *eventfd;

192

struct eventfd_ctx *eventfd;

193

u64 threshold;

193

u64 threshold;

194

};

194

};

195

196

/* For threshold */

196

/* For threshold */

197

struct mem_cgroup_threshold_ary {

197

struct mem_cgroup_threshold_ary {

198

/* An array index points to threshold just below or equal to usage. */

198

/* An array index points to threshold just below or equal to usage. */

199

int current_threshold;

199

int current_threshold;

200

/* Size of entries[] */

200

/* Size of entries[] */

201

unsigned int size;

201

unsigned int size;

202

/* Array of thresholds */

202

/* Array of thresholds */

203

struct mem_cgroup_threshold entries[0];

203

struct mem_cgroup_threshold entries[0];

204

};

204

};

205

206

/* A primary threshold array plus a pre-allocated spare. */
struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare threshold array.
	 * This is needed to make mem_cgroup_unregister_event() "never fail".
	 * It must be able to store at least primary->size - 1 entries.
	 */
	struct mem_cgroup_threshold_ary *spare;
};

216

217

/* for OOM */

217

/* for OOM */

218

struct mem_cgroup_eventfd_list {

218

struct mem_cgroup_eventfd_list {

219

struct list_head list;

219

struct list_head list;

220

struct eventfd_ctx *eventfd;

220

struct eventfd_ctx *eventfd;

221

};

221

};

222

223

/* Forward declarations; the definitions are not in this part of the file. */
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

225

226

/*

226

/*

227

* The memory controller data structure. The memory controller controls both

227

* The memory controller data structure. The memory controller controls both

228

* page cache and RSS per cgroup. We would eventually like to provide

228

* page cache and RSS per cgroup. We would eventually like to provide

229

* statistics based on the statistics developed by Rik Van Riel for clock-pro,

229

* statistics based on the statistics developed by Rik Van Riel for clock-pro,

230

* to help the administrator determine what knobs to tune.

230

* to help the administrator determine what knobs to tune.

231

*

231

*

232

* TODO: Add a water mark for the memory controller. Reclaim will begin when

232

* TODO: Add a water mark for the memory controller. Reclaim will begin when

233

* we hit the water mark. May be even add a low water mark, such that

233

* we hit the water mark. May be even add a low water mark, such that

234

* no reclaim occurs from a cgroup at it's low water mark, this is

234

* no reclaim occurs from a cgroup at it's low water mark, this is

235

* a feature that will be implemented much later in the future.

235

* a feature that will be implemented much later in the future.

236

*/

236

*/

237

struct mem_cgroup {

237

struct mem_cgroup {

238

struct cgroup_subsys_state css;

238

struct cgroup_subsys_state css;

239

/*

239

/*

240

* the counter to account for memory usage

240

* the counter to account for memory usage

241

*/

241

*/

242

struct res_counter res;

242

struct res_counter res;

243

244

union {

244

union {

245

/*

245

/*

246

* the counter to account for mem+swap usage.

246

* the counter to account for mem+swap usage.

247

*/

247

*/

248

struct res_counter memsw;

248

struct res_counter memsw;

249

250

/*

250

/*

251

* rcu_freeing is used only when freeing struct mem_cgroup,

251

* rcu_freeing is used only when freeing struct mem_cgroup,

252

* so put it into a union to avoid wasting more memory.

252

* so put it into a union to avoid wasting more memory.

253

* It must be disjoint from the css field. It could be

253

* It must be disjoint from the css field. It could be

254

* in a union with the res field, but res plays a much

254

* in a union with the res field, but res plays a much

255

* larger part in mem_cgroup life than memsw, and might

255

* larger part in mem_cgroup life than memsw, and might

256

* be of interest, even at time of free, when debugging.

256

* be of interest, even at time of free, when debugging.

257

* So share rcu_head with the less interesting memsw.

257

* So share rcu_head with the less interesting memsw.

258

*/

258

*/

259

struct rcu_head rcu_freeing;

259

struct rcu_head rcu_freeing;

260

/*

260

/*

261

* We also need some space for a worker in deferred freeing.

261

* We also need some space for a worker in deferred freeing.

262

* By the time we call it, rcu_freeing is no longer in use.

262

* By the time we call it, rcu_freeing is no longer in use.

263

*/

263

*/

264

struct work_struct work_freeing;

264

struct work_struct work_freeing;

265

};

265

};

266

267

/*

267

/*

268

* Per cgroup active and inactive list, similar to the

268

* Per cgroup active and inactive list, similar to the

269

* per zone LRU lists.

269

* per zone LRU lists.

270

*/

270

*/

271

struct mem_cgroup_lru_info info;

271

struct mem_cgroup_lru_info info;

272

int last_scanned_node;

272

int last_scanned_node;

273

#if MAX_NUMNODES > 1

273

#if MAX_NUMNODES > 1

274

nodemask_t scan_nodes;

274

nodemask_t scan_nodes;

275

atomic_t numainfo_events;

275

atomic_t numainfo_events;

276

atomic_t numainfo_updating;

276

atomic_t numainfo_updating;

277

#endif

277

#endif

278

/*

278

/*

279

* Should the accounting and control be hierarchical, per subtree?

279

* Should the accounting and control be hierarchical, per subtree?

280

*/

280

*/

281

bool use_hierarchy;

281

bool use_hierarchy;

282

283

bool oom_lock;

283

bool oom_lock;

284

atomic_t under_oom;

284

atomic_t under_oom;

285

286

atomic_t refcnt;

286

atomic_t refcnt;

287

288

int swappiness;

288

int swappiness;

289

/* OOM-Killer disable */

289

/* OOM-Killer disable */

290

int oom_kill_disable;

290

int oom_kill_disable;

291

292

/* set when res.limit == memsw.limit */

292

/* set when res.limit == memsw.limit */

293

bool memsw_is_minimum;

293

bool memsw_is_minimum;

294

295

/* protect arrays of thresholds */

295

/* protect arrays of thresholds */

296

struct mutex thresholds_lock;

296

struct mutex thresholds_lock;

297

298

/* thresholds for memory usage. RCU-protected */

298

/* thresholds for memory usage. RCU-protected */

299

struct mem_cgroup_thresholds thresholds;

299

struct mem_cgroup_thresholds thresholds;

300

301

/* thresholds for mem+swap usage. RCU-protected */

301

/* thresholds for mem+swap usage. RCU-protected */

302

struct mem_cgroup_thresholds memsw_thresholds;

302

struct mem_cgroup_thresholds memsw_thresholds;

303

304

/* For oom notifier event fd */

304

/* For oom notifier event fd */

305

struct list_head oom_notify;

305

struct list_head oom_notify;

306

307

/*

307

/*

308

* Should we move charges of a task when a task is moved into this

308

* Should we move charges of a task when a task is moved into this

309

* mem_cgroup ? And what type of charges should we move ?

309

* mem_cgroup ? And what type of charges should we move ?

310

*/

310

*/

311

unsigned long move_charge_at_immigrate;

311

unsigned long move_charge_at_immigrate;

312

/*

312

/*

313

* set > 0 if pages under this cgroup are moving to other cgroup.

313

* set > 0 if pages under this cgroup are moving to other cgroup.

314

*/

314

*/

315

atomic_t moving_account;

315

atomic_t moving_account;

316

/* taken only while moving_account > 0 */

316

/* taken only while moving_account > 0 */

317

spinlock_t move_lock;

317

spinlock_t move_lock;

318

/*

318

/*

319

* percpu counter.

319

* percpu counter.

320

*/

320

*/

321

struct mem_cgroup_stat_cpu __percpu *stat;

321

struct mem_cgroup_stat_cpu __percpu *stat;

322

/*

322

/*

323

* used when a cpu is offlined or other synchronizations

323

* used when a cpu is offlined or other synchronizations

324

* See mem_cgroup_read_stat().

324

* See mem_cgroup_read_stat().

325

*/

325

*/

326

struct mem_cgroup_stat_cpu nocpu_base;

326

struct mem_cgroup_stat_cpu nocpu_base;

327

spinlock_t pcp_counter_lock;

327

spinlock_t pcp_counter_lock;

328

329

#ifdef CONFIG_INET

329

#ifdef CONFIG_INET

330

struct tcp_memcontrol tcp_mem;

330

struct tcp_memcontrol tcp_mem;

331

#endif

331

#endif

332

};

332

};

333

334

/* Stuffs for move charges at task migration. */
/*
 * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
 * left-shifted bitmap of these types.
 */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
	MOVE_CHARGE_TYPE_FILE,	/* file page(including tmpfs) and swap of it */
	NR_MOVE_TYPE,
};

344

345

/* "mc" and its members are protected by cgroup_mutex */

345

/* "mc" and its members are protected by cgroup_mutex */

346

static struct move_charge_struct {

346

static struct move_charge_struct {

347

spinlock_t lock; /* for from, to */

347

spinlock_t lock; /* for from, to */

348

struct mem_cgroup *from;

348

struct mem_cgroup *from;

349

struct mem_cgroup *to;

349

struct mem_cgroup *to;

350

unsigned long precharge;

350

unsigned long precharge;

351

unsigned long moved_charge;

351

unsigned long moved_charge;

352

unsigned long moved_swap;

352

unsigned long moved_swap;

353

struct task_struct *moving_task; /* a task moving charges */

353

struct task_struct *moving_task; /* a task moving charges */

354

wait_queue_head_t waitq; /* a waitq for other context */

354

wait_queue_head_t waitq; /* a waitq for other context */

355

} mc = {

355

} mc = {

356

.lock = __SPIN_LOCK_UNLOCKED(mc.lock),

356

.lock = __SPIN_LOCK_UNLOCKED(mc.lock),

357

.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),

357

.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),

358

};

358

};

359

360

static bool move_anon(void)

360

static bool move_anon(void)

361

{

361

{

362

return test_bit(MOVE_CHARGE_TYPE_ANON,

362

return test_bit(MOVE_CHARGE_TYPE_ANON,

363

&mc.to->move_charge_at_immigrate);

363

&mc.to->move_charge_at_immigrate);

364

}

364

}

365

366

static bool move_file(void)

366

static bool move_file(void)

367

{

367

{

368

return test_bit(MOVE_CHARGE_TYPE_FILE,

368

return test_bit(MOVE_CHARGE_TYPE_FILE,

369

&mc.to->move_charge_at_immigrate);

369

&mc.to->move_charge_at_immigrate);

370

}

370

}

371

372

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

378

379

/* Charge kinds; NR_CHARGE_TYPE is the number of kinds and must stay last. */
enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

386

387

/*
 * for encoding cft->private value on file
 *
 * The type (_MEM, _MEMSWAP or _OOM_TYPE) is stored in bits 16 and up,
 * the attribute in the low 16 bits.
 */
#define _MEM			(0)
#define _MEMSWAP		(1)
#define _OOM_TYPE		(2)
#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)

396

397

/*
 * Reclaim flags for mem_cgroup_hierarchical_reclaim
 *
 * Each *_BIT is a bit position; the matching flag is 1 shifted by it.
 */
#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)

404

405

/* Forward declarations; the definitions are not in this part of the file. */
static void mem_cgroup_get(struct mem_cgroup *memcg);
static void mem_cgroup_put(struct mem_cgroup *memcg);

407

408

/* Writing them here to avoid exposing memcg's inner layout */

408

/* Writing them here to avoid exposing memcg's inner layout */

409

#ifdef CONFIG_MEMCG_KMEM

409

#ifdef CONFIG_MEMCG_KMEM

410

#include <net/sock.h>

410

#include <net/sock.h>

411

#include <net/ip.h>

411

#include <net/ip.h>

412

413

static bool mem_cgroup_is_root(struct mem_cgroup *memcg);

413

static bool mem_cgroup_is_root(struct mem_cgroup *memcg);

414

void sock_update_memcg(struct sock *sk)

414

void sock_update_memcg(struct sock *sk)

415

{

415

{

416

if (mem_cgroup_sockets_enabled) {

416

if (mem_cgroup_sockets_enabled) {

417

struct mem_cgroup *memcg;

417

struct mem_cgroup *memcg;

418

struct cg_proto *cg_proto;

418

struct cg_proto *cg_proto;

419

420

BUG_ON(!sk->sk_prot->proto_cgroup);

420

BUG_ON(!sk->sk_prot->proto_cgroup);

421

422

/* Socket cloning can throw us here with sk_cgrp already

422

/* Socket cloning can throw us here with sk_cgrp already

423

* filled. It won't however, necessarily happen from

423

* filled. It won't however, necessarily happen from

424

* process context. So the test for root memcg given

424

* process context. So the test for root memcg given

425

* the current task's memcg won't help us in this case.

425

* the current task's memcg won't help us in this case.

426

*

426

*

427

* Respecting the original socket's memcg is a better

427

* Respecting the original socket's memcg is a better

428

* decision in this case.

428

* decision in this case.

429

*/

429

*/

430

if (sk->sk_cgrp) {

430

if (sk->sk_cgrp) {

431

BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));

431

BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));

432

mem_cgroup_get(sk->sk_cgrp->memcg);

432

mem_cgroup_get(sk->sk_cgrp->memcg);

433

return;

433

return;

434

}

434

}

435

436

rcu_read_lock();

436

rcu_read_lock();

437

memcg = mem_cgroup_from_task(current);

437

memcg = mem_cgroup_from_task(current);

438

cg_proto = sk->sk_prot->proto_cgroup(memcg);

438

cg_proto = sk->sk_prot->proto_cgroup(memcg);

439

if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {

439

if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {

440

mem_cgroup_get(memcg);

440

mem_cgroup_get(memcg);

441

sk->sk_cgrp = cg_proto;

441

sk->sk_cgrp = cg_proto;

442

}

442

}

443

rcu_read_unlock();

443

rcu_read_unlock();

444

}

444

}

445

}

445

}

446

EXPORT_SYMBOL(sock_update_memcg);

446

EXPORT_SYMBOL(sock_update_memcg);

447

448

void sock_release_memcg(struct sock *sk)

448

void sock_release_memcg(struct sock *sk)

449

{

449

{

450

if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {

450

if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {

451

struct mem_cgroup *memcg;

451

struct mem_cgroup *memcg;

452

WARN_ON(!sk->sk_cgrp->memcg);

452

WARN_ON(!sk->sk_cgrp->memcg);

453

memcg = sk->sk_cgrp->memcg;

453

memcg = sk->sk_cgrp->memcg;

454

mem_cgroup_put(memcg);

454

mem_cgroup_put(memcg);

455

}

455

}

456

}

456

}

457

458

#ifdef CONFIG_INET

458

#ifdef CONFIG_INET

459

struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)

459

struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)

460

{

460

{

461

if (!memcg || mem_cgroup_is_root(memcg))

461

if (!memcg || mem_cgroup_is_root(memcg))

462

return NULL;

462

return NULL;

463

464

return &memcg->tcp_mem.cg_proto;

464

return &memcg->tcp_mem.cg_proto;

465

}

465

}

466

EXPORT_SYMBOL(tcp_proto_cgroup);

466

EXPORT_SYMBOL(tcp_proto_cgroup);

467

#endif /* CONFIG_INET */

467

#endif /* CONFIG_INET */

468

#endif /* CONFIG_MEMCG_KMEM */

468

#endif /* CONFIG_MEMCG_KMEM */

469

470

#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
/*
 * disarm_sock_keys - drop the socket-limit static key for a memcg
 * @memcg: memory cgroup
 *
 * Calls static_key_slow_dec() on memcg_socket_limit_enabled, but only if
 * memcg_proto_activated() reports true for @memcg's TCP cg_proto.
 */
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
	if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
		return;
	static_key_slow_dec(&memcg_socket_limit_enabled);
}
#else
/* No-op stub used unless both CONFIG_INET and CONFIG_MEMCG_KMEM are set. */
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
}
#endif

482

483

/* Forward declaration; the definition is not in this part of the file. */
static void drain_all_stock_async(struct mem_cgroup *memcg);

484

485

static struct mem_cgroup_per_zone *

485

static struct mem_cgroup_per_zone *

486

mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)

486

mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)

487

{

487

{

488

return &memcg->info.nodeinfo[nid]->zoneinfo[zid];

488

return &memcg->info.nodeinfo[nid]->zoneinfo[zid];

489

}

489

}

490

491

struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)

491

struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)

492

{

492

{

493

return &memcg->css;

493

return &memcg->css;

494

}

494

}

495

496

/*
 * page_cgroup_zoneinfo - per-zone info of a memcg for the zone holding a page
 * @memcg: memory cgroup
 * @page: page whose node and zone select the entry
 *
 * Derives the node and zone ids from @page and defers to
 * mem_cgroup_zoneinfo().
 */
static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return mem_cgroup_zoneinfo(memcg, nid, zid);
}

504

505

static struct mem_cgroup_tree_per_zone *

505

static struct mem_cgroup_tree_per_zone *

506

soft_limit_tree_node_zone(int nid, int zid)

506

soft_limit_tree_node_zone(int nid, int zid)

507

{

507

{

508

return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];

508

return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];

509

}

509

}

510

511

/*
 * Return the global soft-limit tree covering the node and zone that
 * @page belongs to.
 */
static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
	const int node_id = page_to_nid(page);
	const int zone_id = page_zonenum(page);

	/* Same lookup as the (nid, zid) variant, keyed off the page. */
	return soft_limit_tree_node_zone(node_id, zone_id);
}

519

520

static void

520

static void

521

__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,

521

__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,

522

struct mem_cgroup_per_zone *mz,

522

struct mem_cgroup_per_zone *mz,

523

struct mem_cgroup_tree_per_zone *mctz,

523

struct mem_cgroup_tree_per_zone *mctz,

524

unsigned long long new_usage_in_excess)

524

unsigned long long new_usage_in_excess)

525

{

525

{

526

struct rb_node **p = &mctz->rb_root.rb_node;

526

struct rb_node **p = &mctz->rb_root.rb_node;

527

struct rb_node *parent = NULL;

527

struct rb_node *parent = NULL;

528

struct mem_cgroup_per_zone *mz_node;

528

struct mem_cgroup_per_zone *mz_node;

529

530

if (mz->on_tree)

530

if (mz->on_tree)

531

return;

531

return;

532

533

mz->usage_in_excess = new_usage_in_excess;

533

mz->usage_in_excess = new_usage_in_excess;

534

if (!mz->usage_in_excess)

534

if (!mz->usage_in_excess)

535

return;

535

return;

536

while (*p) {

536

while (*p) {

537

parent = *p;

537

parent = *p;

538

mz_node = rb_entry(parent, struct mem_cgroup_per_zone,

538

mz_node = rb_entry(parent, struct mem_cgroup_per_zone,

539

tree_node);

539

tree_node);

540

if (mz->usage_in_excess < mz_node->usage_in_excess)

540

if (mz->usage_in_excess < mz_node->usage_in_excess)

541

p = &(*p)->rb_left;

541

p = &(*p)->rb_left;

542

/*

542

/*

543

* We can't avoid mem cgroups that are over their soft

543

* We can't avoid mem cgroups that are over their soft

544

* limit by the same amount

544

* limit by the same amount

545

*/

545

*/

546

else if (mz->usage_in_excess >= mz_node->usage_in_excess)

546

else if (mz->usage_in_excess >= mz_node->usage_in_excess)

547

p = &(*p)->rb_right;

547

p = &(*p)->rb_right;

548

}

548

}

549

rb_link_node(&mz->tree_node, parent, p);

549

rb_link_node(&mz->tree_node, parent, p);

550

rb_insert_color(&mz->tree_node, &mctz->rb_root);

550

rb_insert_color(&mz->tree_node, &mctz->rb_root);

551

mz->on_tree = true;

551

mz->on_tree = true;

552

}

552

}

553

554

static void

554

static void

555

__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,

555

__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,

556

struct mem_cgroup_per_zone *mz,

556

struct mem_cgroup_per_zone *mz,

557

struct mem_cgroup_tree_per_zone *mctz)

557

struct mem_cgroup_tree_per_zone *mctz)

558

{

558

{

559

if (!mz->on_tree)

559

if (!mz->on_tree)

560

return;

560

return;

561

rb_erase(&mz->tree_node, &mctz->rb_root);

561

rb_erase(&mz->tree_node, &mctz->rb_root);

562

mz->on_tree = false;

562

mz->on_tree = false;

563

}

563

}

564

565

static void

565

static void

566

mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,

566

mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,

567

struct mem_cgroup_per_zone *mz,

567

struct mem_cgroup_per_zone *mz,

568

struct mem_cgroup_tree_per_zone *mctz)

568

struct mem_cgroup_tree_per_zone *mctz)

569

{

569

{

570

spin_lock(&mctz->lock);

570

spin_lock(&mctz->lock);

571

__mem_cgroup_remove_exceeded(memcg, mz, mctz);

571

__mem_cgroup_remove_exceeded(memcg, mz, mctz);

572

spin_unlock(&mctz->lock);

572

spin_unlock(&mctz->lock);

573

}

573

}

574

575

576

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)

576

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)

577

{

577

{

578

unsigned long long excess;

578

unsigned long long excess;

579

struct mem_cgroup_per_zone *mz;

579

struct mem_cgroup_per_zone *mz;

580

struct mem_cgroup_tree_per_zone *mctz;

580

struct mem_cgroup_tree_per_zone *mctz;

581

int nid = page_to_nid(page);

581

int nid = page_to_nid(page);

582

int zid = page_zonenum(page);

582

int zid = page_zonenum(page);

583

mctz = soft_limit_tree_from_page(page);

583

mctz = soft_limit_tree_from_page(page);

584

585

/*

585

/*

586

* Necessary to update all ancestors when hierarchy is used.

586

* Necessary to update all ancestors when hierarchy is used.

587

* because their event counter is not touched.

587

* because their event counter is not touched.

588

*/

588

*/

589

for (; memcg; memcg = parent_mem_cgroup(memcg)) {

589

for (; memcg; memcg = parent_mem_cgroup(memcg)) {

590

mz = mem_cgroup_zoneinfo(memcg, nid, zid);

590

mz = mem_cgroup_zoneinfo(memcg, nid, zid);

591

excess = res_counter_soft_limit_excess(&memcg->res);

591

excess = res_counter_soft_limit_excess(&memcg->res);

592

/*

592

/*

593

* We have to update the tree if mz is on RB-tree or

593

* We have to update the tree if mz is on RB-tree or

594

* mem is over its softlimit.

594

* mem is over its softlimit.

595

*/

595

*/

596

if (excess || mz->on_tree) {

596

if (excess || mz->on_tree) {

597

spin_lock(&mctz->lock);

597

spin_lock(&mctz->lock);

598

/* if on-tree, remove it */

598

/* if on-tree, remove it */

599

if (mz->on_tree)

599

if (mz->on_tree)

600

__mem_cgroup_remove_exceeded(memcg, mz, mctz);

600

__mem_cgroup_remove_exceeded(memcg, mz, mctz);

601

/*

601

/*

602

* Insert again. mz->usage_in_excess will be updated.

602

* Insert again. mz->usage_in_excess will be updated.

603

* If excess is 0, no tree ops.

603

* If excess is 0, no tree ops.

604

*/

604

*/

605

__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);

605

__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);

606

spin_unlock(&mctz->lock);

606

spin_unlock(&mctz->lock);

607

}

607

}

608

}

608

}

609

}

609

}

610

611

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)

611

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)

612

{

612

{

613

int node, zone;

613

int node, zone;

614

struct mem_cgroup_per_zone *mz;

614

struct mem_cgroup_per_zone *mz;

615

struct mem_cgroup_tree_per_zone *mctz;

615

struct mem_cgroup_tree_per_zone *mctz;

616

617

for_each_node(node) {

617

for_each_node(node) {

618

for (zone = 0; zone < MAX_NR_ZONES; zone++) {

618

for (zone = 0; zone < MAX_NR_ZONES; zone++) {

619

mz = mem_cgroup_zoneinfo(memcg, node, zone);

619

mz = mem_cgroup_zoneinfo(memcg, node, zone);

620

mctz = soft_limit_tree_node_zone(node, zone);

620

mctz = soft_limit_tree_node_zone(node, zone);

621

mem_cgroup_remove_exceeded(memcg, mz, mctz);

621

mem_cgroup_remove_exceeded(memcg, mz, mctz);

622

}

622

}

623

}

623

}

624

}

624

}

625

626

static struct mem_cgroup_per_zone *

626

static struct mem_cgroup_per_zone *

627

__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)

627

__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)

628

{

628

{

629

struct rb_node *rightmost = NULL;

629

struct rb_node *rightmost = NULL;

630

struct mem_cgroup_per_zone *mz;

630

struct mem_cgroup_per_zone *mz;

631

632

retry:

632

retry:

633

mz = NULL;

633

mz = NULL;

634

rightmost = rb_last(&mctz->rb_root);

634

rightmost = rb_last(&mctz->rb_root);

635

if (!rightmost)

635

if (!rightmost)

636

goto done; /* Nothing to reclaim from */

636

goto done; /* Nothing to reclaim from */

637

638

mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);

638

mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);

639

/*

639

/*

640

* Remove the node now but someone else can add it back,

640

* Remove the node now but someone else can add it back,

641

* we will to add it back at the end of reclaim to its correct

641

* we will to add it back at the end of reclaim to its correct

642

* position in the tree.

642

* position in the tree.

643

*/

643

*/

644

__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);

644

__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);

645

if (!res_counter_soft_limit_excess(&mz->memcg->res) ||

645

if (!res_counter_soft_limit_excess(&mz->memcg->res) ||

646

!css_tryget(&mz->memcg->css))

646

!css_tryget(&mz->memcg->css))

647

goto retry;

647

goto retry;

648

done:

648

done:

649

return mz;

649

return mz;

650

}

650

}

651

652

static struct mem_cgroup_per_zone *

652

static struct mem_cgroup_per_zone *

653

mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)

653

mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)

654

{

654

{

655

struct mem_cgroup_per_zone *mz;

655

struct mem_cgroup_per_zone *mz;

656

657

spin_lock(&mctz->lock);

657

spin_lock(&mctz->lock);

658

mz = __mem_cgroup_largest_soft_limit_node(mctz);

658

mz = __mem_cgroup_largest_soft_limit_node(mctz);

659

spin_unlock(&mctz->lock);

659

spin_unlock(&mctz->lock);

660

return mz;

660

return mz;

661

}

661

}

662

663

/*

663

/*

664

* Implementation Note: reading percpu statistics for memcg.

664

* Implementation Note: reading percpu statistics for memcg.

665

*

665

*

666

* Both of vmstat[] and percpu_counter has threshold and do periodic

666

* Both of vmstat[] and percpu_counter has threshold and do periodic

667

* synchronization to implement "quick" read. There are trade-off between

667

* synchronization to implement "quick" read. There are trade-off between

668

* reading cost and precision of value. Then, we may have a chance to implement

668

* reading cost and precision of value. Then, we may have a chance to implement

669

* a periodic synchronizion of counter in memcg's counter.

669

* a periodic synchronizion of counter in memcg's counter.

670

*

670

*

671

* But this _read() function is used for user interface now. The user accounts

671

* But this _read() function is used for user interface now. The user accounts

672

* memory usage by memory cgroup and he _always_ requires exact value because

672

* memory usage by memory cgroup and he _always_ requires exact value because

673

* he accounts memory. Even if we provide quick-and-fuzzy read, we always

673

* he accounts memory. Even if we provide quick-and-fuzzy read, we always

674

* have to visit all online cpus and make sum. So, for now, unnecessary

674

* have to visit all online cpus and make sum. So, for now, unnecessary

675

* synchronization is not implemented. (just implemented for cpu hotplug)

675

* synchronization is not implemented. (just implemented for cpu hotplug)

676

*

676

*

677

* If there are kernel internal actions which can make use of some not-exact

677

* If there are kernel internal actions which can make use of some not-exact

678

* value, and reading all cpu value can be performance bottleneck in some

678

* value, and reading all cpu value can be performance bottleneck in some

679

* common workload, threashold and synchonization as vmstat[] should be

679

* common workload, threashold and synchonization as vmstat[] should be

680

* implemented.

680

* implemented.

681

*/

681

*/

682

static long mem_cgroup_read_stat(struct mem_cgroup *memcg,

682

static long mem_cgroup_read_stat(struct mem_cgroup *memcg,

683

enum mem_cgroup_stat_index idx)

683

enum mem_cgroup_stat_index idx)

684

{

684

{

685

long val = 0;

685

long val = 0;

686

int cpu;

686

int cpu;

687

688

get_online_cpus();

688

get_online_cpus();

689

for_each_online_cpu(cpu)

689

for_each_online_cpu(cpu)

690

val += per_cpu(memcg->stat->count[idx], cpu);

690

val += per_cpu(memcg->stat->count[idx], cpu);

691

#ifdef CONFIG_HOTPLUG_CPU

691

#ifdef CONFIG_HOTPLUG_CPU

692

spin_lock(&memcg->pcp_counter_lock);

692

spin_lock(&memcg->pcp_counter_lock);

693

val += memcg->nocpu_base.count[idx];

693

val += memcg->nocpu_base.count[idx];

694

spin_unlock(&memcg->pcp_counter_lock);

694

spin_unlock(&memcg->pcp_counter_lock);

695

#endif

695

#endif

696

put_online_cpus();

696

put_online_cpus();

697

return val;

697

return val;

698

}

698

}

699

700

static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,

700

static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,

701

bool charge)

701

bool charge)

702

{

702

{

703

int val = (charge) ? 1 : -1;

703

int val = (charge) ? 1 : -1;

704

this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);

704

this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);

705

}

705

}

706

707

static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,

707

static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,

708

enum mem_cgroup_events_index idx)

708

enum mem_cgroup_events_index idx)

709

{

709

{

710

unsigned long val = 0;

710

unsigned long val = 0;

711

int cpu;

711

int cpu;

712

713

for_each_online_cpu(cpu)

713

for_each_online_cpu(cpu)

714

val += per_cpu(memcg->stat->events[idx], cpu);

714

val += per_cpu(memcg->stat->events[idx], cpu);

715

#ifdef CONFIG_HOTPLUG_CPU

715

#ifdef CONFIG_HOTPLUG_CPU

716

spin_lock(&memcg->pcp_counter_lock);

716

spin_lock(&memcg->pcp_counter_lock);

717

val += memcg->nocpu_base.events[idx];

717

val += memcg->nocpu_base.events[idx];

718

spin_unlock(&memcg->pcp_counter_lock);

718

spin_unlock(&memcg->pcp_counter_lock);

719

#endif

719

#endif

720

return val;

720

return val;

721

}

721

}

722

723

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,

723

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,

724

bool anon, int nr_pages)

724

bool anon, int nr_pages)

725

{

725

{

726

preempt_disable();

726

preempt_disable();

727

728

/*

728

/*

729

* Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is

729

* Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is

730

* counted as CACHE even if it's on ANON LRU.

730

* counted as CACHE even if it's on ANON LRU.

731

*/

731

*/

732

if (anon)

732

if (anon)

733

__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],

733

__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],

734

nr_pages);

734

nr_pages);

735

else

735

else

736

__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],

736

__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],

737

nr_pages);

737

nr_pages);

738

739

/* pagein of a big page is an event. So, ignore page size */

739

/* pagein of a big page is an event. So, ignore page size */

740

if (nr_pages > 0)

740

if (nr_pages > 0)

741

__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);

741

__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);

742

else {

742

else {

743

__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);

743

__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);

744

nr_pages = -nr_pages; /* for event */

744

nr_pages = -nr_pages; /* for event */

745

}

745

}

746

747

__this_cpu_add(memcg->stat->nr_page_events, nr_pages);

747

__this_cpu_add(memcg->stat->nr_page_events, nr_pages);

748

749

preempt_enable();

749

preempt_enable();

750

}

750

}

751

752

unsigned long

752

unsigned long

753

mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)

753

mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)

754

{

754

{

755

struct mem_cgroup_per_zone *mz;

755

struct mem_cgroup_per_zone *mz;

756

757

mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);

757

mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);

758

return mz->lru_size[lru];

758

return mz->lru_size[lru];

759

}

759

}

760

761

static unsigned long

761

static unsigned long

762

mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,

762

mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,

763

unsigned int lru_mask)

763

unsigned int lru_mask)

764

{

764

{

765

struct mem_cgroup_per_zone *mz;

765

struct mem_cgroup_per_zone *mz;

766

enum lru_list lru;

766

enum lru_list lru;

767

unsigned long ret = 0;

767

unsigned long ret = 0;

768

769

mz = mem_cgroup_zoneinfo(memcg, nid, zid);

769

mz = mem_cgroup_zoneinfo(memcg, nid, zid);

770

771

for_each_lru(lru) {

771

for_each_lru(lru) {

772

if (BIT(lru) & lru_mask)

772

if (BIT(lru) & lru_mask)

773

ret += mz->lru_size[lru];

773

ret += mz->lru_size[lru];

774

}

774

}

775

return ret;

775

return ret;

776

}

776

}

777

778

static unsigned long

778

static unsigned long

779

mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,

779

mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,

780

int nid, unsigned int lru_mask)

780

int nid, unsigned int lru_mask)

781

{

781

{

782

u64 total = 0;

782

u64 total = 0;

783

int zid;

783

int zid;

784

785

for (zid = 0; zid < MAX_NR_ZONES; zid++)

785

for (zid = 0; zid < MAX_NR_ZONES; zid++)

786

total += mem_cgroup_zone_nr_lru_pages(memcg,

786

total += mem_cgroup_zone_nr_lru_pages(memcg,

787

nid, zid, lru_mask);

787

nid, zid, lru_mask);

788

789

return total;

789

return total;

790

}

790

}

791

792

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,

792

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,

793

unsigned int lru_mask)

793

unsigned int lru_mask)

794

{

794

{

795

int nid;

795

int nid;

796

u64 total = 0;

796

u64 total = 0;

797

798

for_each_node_state(nid, N_HIGH_MEMORY)

798

for_each_node_state(nid, N_HIGH_MEMORY)

799

total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);

799

total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);

800

return total;

800

return total;

801

}

801

}

802

803

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,

803

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,

804

enum mem_cgroup_events_target target)

804

enum mem_cgroup_events_target target)

805

{

805

{

806

unsigned long val, next;

806

unsigned long val, next;

807

808

val = __this_cpu_read(memcg->stat->nr_page_events);

808

val = __this_cpu_read(memcg->stat->nr_page_events);

809

next = __this_cpu_read(memcg->stat->targets[target]);

809

next = __this_cpu_read(memcg->stat->targets[target]);

810

/* from time_after() in jiffies.h */

810

/* from time_after() in jiffies.h */

811

if ((long)next - (long)val < 0) {

811

if ((long)next - (long)val < 0) {

812

switch (target) {

812

switch (target) {

813

case MEM_CGROUP_TARGET_THRESH:

813

case MEM_CGROUP_TARGET_THRESH:

814

next = val + THRESHOLDS_EVENTS_TARGET;

814

next = val + THRESHOLDS_EVENTS_TARGET;

815

break;

815

break;

816

case MEM_CGROUP_TARGET_SOFTLIMIT:

816

case MEM_CGROUP_TARGET_SOFTLIMIT:

817

next = val + SOFTLIMIT_EVENTS_TARGET;

817

next = val + SOFTLIMIT_EVENTS_TARGET;

818

break;

818

break;

819

case MEM_CGROUP_TARGET_NUMAINFO:

819

case MEM_CGROUP_TARGET_NUMAINFO:

820

next = val + NUMAINFO_EVENTS_TARGET;

820

next = val + NUMAINFO_EVENTS_TARGET;

821

break;

821

break;

822

default:

822

default:

823

break;

823

break;

824

}

824

}

825

__this_cpu_write(memcg->stat->targets[target], next);

825

__this_cpu_write(memcg->stat->targets[target], next);

826

return true;

826

return true;

827

}

827

}

828

return false;

828

return false;

829

}

829

}

830

831

/*

831

/*

832

* Check events in order.

832

* Check events in order.

833

*

833

*

834

*/

834

*/

835

static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)

835

static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)

836

{

836

{

837

preempt_disable();

837

preempt_disable();

838

/* threshold event is triggered in finer grain than soft limit */

838

/* threshold event is triggered in finer grain than soft limit */

839

if (unlikely(mem_cgroup_event_ratelimit(memcg,

839

if (unlikely(mem_cgroup_event_ratelimit(memcg,

840

MEM_CGROUP_TARGET_THRESH))) {

840

MEM_CGROUP_TARGET_THRESH))) {

841

bool do_softlimit;

841

bool do_softlimit;

842

bool do_numainfo __maybe_unused;

842

bool do_numainfo __maybe_unused;

843

844

do_softlimit = mem_cgroup_event_ratelimit(memcg,

844

do_softlimit = mem_cgroup_event_ratelimit(memcg,

845

MEM_CGROUP_TARGET_SOFTLIMIT);

845

MEM_CGROUP_TARGET_SOFTLIMIT);

846

#if MAX_NUMNODES > 1

846

#if MAX_NUMNODES > 1

847

do_numainfo = mem_cgroup_event_ratelimit(memcg,

847

do_numainfo = mem_cgroup_event_ratelimit(memcg,

848

MEM_CGROUP_TARGET_NUMAINFO);

848

MEM_CGROUP_TARGET_NUMAINFO);

849

#endif

849

#endif

850

preempt_enable();

850

preempt_enable();

851

852

mem_cgroup_threshold(memcg);

852

mem_cgroup_threshold(memcg);

853

if (unlikely(do_softlimit))

853

if (unlikely(do_softlimit))

854

mem_cgroup_update_tree(memcg, page);

854

mem_cgroup_update_tree(memcg, page);

855

#if MAX_NUMNODES > 1

855

#if MAX_NUMNODES > 1

856

if (unlikely(do_numainfo))

856

if (unlikely(do_numainfo))

857

atomic_inc(&memcg->numainfo_events);

857

atomic_inc(&memcg->numainfo_events);

858

#endif

858

#endif

859

} else

859

} else

860

preempt_enable();

860

preempt_enable();

861

}

861

}

862

863

struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)

863

struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)

864

{

864

{

865

return container_of(cgroup_subsys_state(cont,

865

return container_of(cgroup_subsys_state(cont,

866

mem_cgroup_subsys_id), struct mem_cgroup,

866

mem_cgroup_subsys_id), struct mem_cgroup,

867

css);

867

css);

868

}

868

}

869

870

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)

870

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)

871

{

871

{

872

/*

872

/*

873

* mm_update_next_owner() may clear mm->owner to NULL

873

* mm_update_next_owner() may clear mm->owner to NULL

874

* if it races with swapoff, page migration, etc.

874

* if it races with swapoff, page migration, etc.

875

* So this can be called with p == NULL.

875

* So this can be called with p == NULL.

876

*/

876

*/

877

if (unlikely(!p))

877

if (unlikely(!p))

878

return NULL;

878

return NULL;

879

880

return container_of(task_subsys_state(p, mem_cgroup_subsys_id),

880

return container_of(task_subsys_state(p, mem_cgroup_subsys_id),

881

struct mem_cgroup, css);

881

struct mem_cgroup, css);

882

}

882

}

883

884

struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)

884

struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)

885

{

885

{

886

struct mem_cgroup *memcg = NULL;

886

struct mem_cgroup *memcg = NULL;

887

888

if (!mm)

888

if (!mm)

889

return NULL;

889

return NULL;

890

/*

890

/*

891

* Because we have no locks, mm->owner's may be being moved to other

891

* Because we have no locks, mm->owner's may be being moved to other

892

* cgroup. We use css_tryget() here even if this looks

892

* cgroup. We use css_tryget() here even if this looks

893

* pessimistic (rather than adding locks here).

893

* pessimistic (rather than adding locks here).

894

*/

894

*/

895

rcu_read_lock();

895

rcu_read_lock();

896

do {

896

do {

897

memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));

897

memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));

898

if (unlikely(!memcg))

898

if (unlikely(!memcg))

899

break;

899

break;

900

} while (!css_tryget(&memcg->css));

900

} while (!css_tryget(&memcg->css));

901

rcu_read_unlock();

901

rcu_read_unlock();

902

return memcg;

902

return memcg;

903

}

903

}

904

905

/**

905

/**

906

* mem_cgroup_iter - iterate over memory cgroup hierarchy

906

* mem_cgroup_iter - iterate over memory cgroup hierarchy

907

* @root: hierarchy root

907

* @root: hierarchy root

908

* @prev: previously returned memcg, NULL on first invocation

908

* @prev: previously returned memcg, NULL on first invocation

909

* @reclaim: cookie for shared reclaim walks, NULL for full walks

909

* @reclaim: cookie for shared reclaim walks, NULL for full walks

910

*

910

*

911

* Returns references to children of the hierarchy below @root, or

911

* Returns references to children of the hierarchy below @root, or

912

* @root itself, or %NULL after a full round-trip.

912

* @root itself, or %NULL after a full round-trip.

913

*

913

*

914

* Caller must pass the return value in @prev on subsequent

914

* Caller must pass the return value in @prev on subsequent

915

* invocations for reference counting, or use mem_cgroup_iter_break()

915

* invocations for reference counting, or use mem_cgroup_iter_break()

916

* to cancel a hierarchy walk before the round-trip is complete.

916

* to cancel a hierarchy walk before the round-trip is complete.

917

*

917

*

918

* Reclaimers can specify a zone and a priority level in @reclaim to

918

* Reclaimers can specify a zone and a priority level in @reclaim to

919

* divide up the memcgs in the hierarchy among all concurrent

919

* divide up the memcgs in the hierarchy among all concurrent

920

* reclaimers operating on the same zone and priority.

920

* reclaimers operating on the same zone and priority.

921

*/

921

*/

922

struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,

922

struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,

923

struct mem_cgroup *prev,

923

struct mem_cgroup *prev,

924

struct mem_cgroup_reclaim_cookie *reclaim)

924

struct mem_cgroup_reclaim_cookie *reclaim)

925

{

925

{

926

struct mem_cgroup *memcg = NULL;

926

struct mem_cgroup *memcg = NULL;

927

int id = 0;

927

int id = 0;

928

929

if (mem_cgroup_disabled())

929

if (mem_cgroup_disabled())

930

return NULL;

930

return NULL;

931

932

if (!root)

932

if (!root)

933

root = root_mem_cgroup;

933

root = root_mem_cgroup;

934

935

if (prev && !reclaim)

935

if (prev && !reclaim)

936

id = css_id(&prev->css);

936

id = css_id(&prev->css);

937

938

if (prev && prev != root)

938

if (prev && prev != root)

939

css_put(&prev->css);

939

css_put(&prev->css);

940

941

if (!root->use_hierarchy && root != root_mem_cgroup) {

941

if (!root->use_hierarchy && root != root_mem_cgroup) {

942

if (prev)

942

if (prev)

943

return NULL;

943

return NULL;

944

return root;

944

return root;

945

}

945

}

946

947

while (!memcg) {

947

while (!memcg) {

948

struct mem_cgroup_reclaim_iter *uninitialized_var(iter);

948

struct mem_cgroup_reclaim_iter *uninitialized_var(iter);

949

struct cgroup_subsys_state *css;

949

struct cgroup_subsys_state *css;

950

951

if (reclaim) {

951

if (reclaim) {

952

int nid = zone_to_nid(reclaim->zone);

952

int nid = zone_to_nid(reclaim->zone);

953

int zid = zone_idx(reclaim->zone);

953

int zid = zone_idx(reclaim->zone);

954

struct mem_cgroup_per_zone *mz;

954

struct mem_cgroup_per_zone *mz;

955

956

mz = mem_cgroup_zoneinfo(root, nid, zid);

956

mz = mem_cgroup_zoneinfo(root, nid, zid);

957

iter = &mz->reclaim_iter[reclaim->priority];

957

iter = &mz->reclaim_iter[reclaim->priority];

958

if (prev && reclaim->generation != iter->generation)

958

if (prev && reclaim->generation != iter->generation)

959

return NULL;

959

return NULL;

960

id = iter->position;

960

id = iter->position;

961

}

961

}

962

963

rcu_read_lock();

963

rcu_read_lock();

964

css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);

964

css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);

965

if (css) {

965

if (css) {

966

if (css == &root->css || css_tryget(css))

966

if (css == &root->css || css_tryget(css))

967

memcg = container_of(css,

967

memcg = container_of(css,

968

struct mem_cgroup, css);

968

struct mem_cgroup, css);

969

} else

969

} else

970

id = 0;

970

id = 0;

971

rcu_read_unlock();

971

rcu_read_unlock();

972

973

if (reclaim) {

973

if (reclaim) {

974

iter->position = id;

974

iter->position = id;

975

if (!css)

975

if (!css)

976

iter->generation++;

976

iter->generation++;

977

else if (!prev && memcg)

977

else if (!prev && memcg)

978

reclaim->generation = iter->generation;

978

reclaim->generation = iter->generation;

979

}

979

}

980

981

if (prev && !css)

981

if (prev && !css)

982

return NULL;

982

return NULL;

983

}

983

}

984

return memcg;

984

return memcg;

985

}

985

}

986

987

/**

987

/**

988

* mem_cgroup_iter_break - abort a hierarchy walk prematurely

988

* mem_cgroup_iter_break - abort a hierarchy walk prematurely

989

* @root: hierarchy root

989

* @root: hierarchy root

990

* @prev: last visited hierarchy member as returned by mem_cgroup_iter()

990

* @prev: last visited hierarchy member as returned by mem_cgroup_iter()

991

*/

991

*/

992

void mem_cgroup_iter_break(struct mem_cgroup *root,

992

void mem_cgroup_iter_break(struct mem_cgroup *root,

993

struct mem_cgroup *prev)

993

struct mem_cgroup *prev)

994

{

994

{

995

if (!root)

995

if (!root)

996

root = root_mem_cgroup;

996

root = root_mem_cgroup;

997

if (prev && prev != root)

997

if (prev && prev != root)

998

css_put(&prev->css);

998

css_put(&prev->css);

999

}

999

}

1000

1001

/*
 * Iteration constructs for visiting all cgroups (under a tree). If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 *
 * for_each_mem_cgroup_tree() walks the hierarchy below @root;
 * for_each_mem_cgroup() passes a NULL root to mem_cgroup_iter().
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

1015

1016

static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)

1016

static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)

1017

{

1017

{

1018

return (memcg == root_mem_cgroup);

1018

return (memcg == root_mem_cgroup);

1019

}

1019

}

1020

1021

void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)

1021

void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)

1022

{

1022

{

1023

struct mem_cgroup *memcg;

1023

struct mem_cgroup *memcg;

1024

1025

if (!mm)

1025

if (!mm)

1026

return;

1026

return;

1027

1028

rcu_read_lock();

1028

rcu_read_lock();

1029

memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));

1029

memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));

1030

if (unlikely(!memcg))

1030

if (unlikely(!memcg))

1031

goto out;

1031

goto out;

1032

1033

switch (idx) {

1033

switch (idx) {

1034

case PGFAULT:

1034

case PGFAULT:

1035

this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);

1035

this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);

1036

break;

1036

break;

1037

case PGMAJFAULT:

1037

case PGMAJFAULT:

1038

this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);

1038

this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);

1039

break;

1039

break;

1040

default:

1040

default:

1041

BUG();

1041

BUG();

1042

}

1042

}

1043

out:

1043

out:

1044

rcu_read_unlock();

1044

rcu_read_unlock();

1045

}

1045

}

1046

EXPORT_SYMBOL(mem_cgroup_count_vm_event);

1046

EXPORT_SYMBOL(mem_cgroup_count_vm_event);

1047

1048

/**

1048

/**

1049

* mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg

1049

* mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg

1050

* @zone: zone of the wanted lruvec

1050

* @zone: zone of the wanted lruvec

1051

* @memcg: memcg of the wanted lruvec

1051

* @memcg: memcg of the wanted lruvec

1052

*

1052

*

1053

* Returns the lru list vector holding pages for the given @zone and

1053

* Returns the lru list vector holding pages for the given @zone and

1054

* @mem. This can be the global zone lruvec, if the memory controller

1054

* @mem. This can be the global zone lruvec, if the memory controller

1055

* is disabled.

1055

* is disabled.

1056

*/

1056

*/

1057

struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,

1057

struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,

1058

struct mem_cgroup *memcg)

1058

struct mem_cgroup *memcg)

1059

{

1059

{

1060

struct mem_cgroup_per_zone *mz;

1060

struct mem_cgroup_per_zone *mz;

1061

1062

if (mem_cgroup_disabled())

1062

if (mem_cgroup_disabled())

1063

return &zone->lruvec;

1063

return &zone->lruvec;

1064

1065

mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));

1065

mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));

1066

return &mz->lruvec;

1066

return &mz->lruvec;

1067

}

1067

}

1068

1069

/*
 * The following LRU functions may be used without PCG_LOCK.
 * They are called by the global LRU routines independently from memcg.
 * What we have to take care of here is the validity of pc->mem_cgroup.
 *
 * Changes to pc->mem_cgroup happen when
 * 1. charge
 * 2. moving account
 * In the typical case, "charge" is done before add-to-lru. The exception
 * is SwapCache, which is added to the LRU before charge.
 * If the PCG_USED bit is not set, page_cgroup is not added to this private LRU.
 * When moving account, the page is not on the LRU. It's isolated.
 */

1082

1083

/**

1083

/**

1084

* mem_cgroup_page_lruvec - return lruvec for adding an lru page

1084

* mem_cgroup_page_lruvec - return lruvec for adding an lru page

1085

* @page: the page

1085

* @page: the page

1086

* @zone: zone of the page

1086

* @zone: zone of the page

1087

*/

1087

*/

1088

struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)

1088

struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)

1089

{

1089

{

1090

struct mem_cgroup_per_zone *mz;

1090

struct mem_cgroup_per_zone *mz;

1091

struct mem_cgroup *memcg;

1091

struct mem_cgroup *memcg;

1092

struct page_cgroup *pc;

1092

struct page_cgroup *pc;

1093

1094

if (mem_cgroup_disabled())

1094

if (mem_cgroup_disabled())

1095

return &zone->lruvec;

1095

return &zone->lruvec;

1096

1097

pc = lookup_page_cgroup(page);

1097

pc = lookup_page_cgroup(page);

1098

memcg = pc->mem_cgroup;

1098

memcg = pc->mem_cgroup;

1099

1100

/*

1100

/*

1101

* Surreptitiously switch any uncharged offlist page to root:

1101

* Surreptitiously switch any uncharged offlist page to root:

1102

* an uncharged page off lru does nothing to secure

1102

* an uncharged page off lru does nothing to secure

1103

* its former mem_cgroup from sudden removal.

1103

* its former mem_cgroup from sudden removal.

1104

*

1104

*

1105

* Our caller holds lru_lock, and PageCgroupUsed is updated

1105

* Our caller holds lru_lock, and PageCgroupUsed is updated

1106

* under page_cgroup lock: between them, they make all uses

1106

* under page_cgroup lock: between them, they make all uses

1107

* of pc->mem_cgroup safe.

1107

* of pc->mem_cgroup safe.

1108

*/

1108

*/

1109

if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)

1109

if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)

1110

pc->mem_cgroup = memcg = root_mem_cgroup;

1110

pc->mem_cgroup = memcg = root_mem_cgroup;

1111

1112

mz = page_cgroup_zoneinfo(memcg, page);

1112

mz = page_cgroup_zoneinfo(memcg, page);

1113

return &mz->lruvec;

1113

return &mz->lruvec;

1114

}

1114

}

1115

1116

/**

1116

/**

1117

* mem_cgroup_update_lru_size - account for adding or removing an lru page

1117

* mem_cgroup_update_lru_size - account for adding or removing an lru page

1118

* @lruvec: mem_cgroup per zone lru vector

1118

* @lruvec: mem_cgroup per zone lru vector

1119

* @lru: index of lru list the page is sitting on

1119

* @lru: index of lru list the page is sitting on

1120

* @nr_pages: positive when adding or negative when removing

1120

* @nr_pages: positive when adding or negative when removing

1121

*

1121

*

1122

* This function must be called when a page is added to or removed from an

1122

* This function must be called when a page is added to or removed from an

1123

* lru list.

1123

* lru list.

1124

*/

1124

*/

1125

void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,

1125

void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,

1126

int nr_pages)

1126

int nr_pages)

1127

{

1127

{

1128

struct mem_cgroup_per_zone *mz;

1128

struct mem_cgroup_per_zone *mz;

1129

unsigned long *lru_size;

1129

unsigned long *lru_size;

1130

1131

if (mem_cgroup_disabled())

1131

if (mem_cgroup_disabled())

1132

return;

1132

return;

1133

1134

mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);

1134

mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);

1135

lru_size = mz->lru_size + lru;

1135

lru_size = mz->lru_size + lru;

1136

*lru_size += nr_pages;

1136

*lru_size += nr_pages;

1137

VM_BUG_ON((long)(*lru_size) < 0);

1137

VM_BUG_ON((long)(*lru_size) < 0);

1138

}

1138

}

1139

1140

/*

1140

/*

1141

* Checks whether given mem is same or in the root_mem_cgroup's

1141

* Checks whether given mem is same or in the root_mem_cgroup's

1142

* hierarchy subtree

1142

* hierarchy subtree

1143

*/

1143

*/

1144

bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,

1144

bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,

1145

struct mem_cgroup *memcg)

1145

struct mem_cgroup *memcg)

1146

{

1146

{

1147

if (root_memcg == memcg)

1147

if (root_memcg == memcg)

1148

return true;

1148

return true;

1149

if (!root_memcg->use_hierarchy || !memcg)

1149

if (!root_memcg->use_hierarchy || !memcg)

1150

return false;

1150

return false;

1151

return css_is_ancestor(&memcg->css, &root_memcg->css);

1151

return css_is_ancestor(&memcg->css, &root_memcg->css);

1152

}

1152

}

1153

1154

/*
 * Wrapper around __mem_cgroup_same_or_subtree() that performs the check
 * inside an RCU read-side critical section.
 */
static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
				       struct mem_cgroup *memcg)
{
	bool in_subtree;

	rcu_read_lock();
	in_subtree = __mem_cgroup_same_or_subtree(root_memcg, memcg);
	rcu_read_unlock();

	return in_subtree;
}

1164

1165

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)

1165

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)

1166

{

1166

{

1167

int ret;

1167

int ret;

1168

struct mem_cgroup *curr = NULL;

1168

struct mem_cgroup *curr = NULL;

1169

struct task_struct *p;

1169

struct task_struct *p;

1170

1171

p = find_lock_task_mm(task);

1171

p = find_lock_task_mm(task);

1172

if (p) {

1172

if (p) {

1173

curr = try_get_mem_cgroup_from_mm(p->mm);

1173

curr = try_get_mem_cgroup_from_mm(p->mm);

1174

task_unlock(p);

1174

task_unlock(p);

1175

} else {

1175

} else {

1176

/*

1176

/*

1177

* All threads may have already detached their mm's, but the oom

1177

* All threads may have already detached their mm's, but the oom

1178

* killer still needs to detect if they have already been oom

1178

* killer still needs to detect if they have already been oom

1179

* killed to prevent needlessly killing additional tasks.

1179

* killed to prevent needlessly killing additional tasks.

1180

*/

1180

*/

1181

task_lock(task);

1181

task_lock(task);

1182

curr = mem_cgroup_from_task(task);

1182

curr = mem_cgroup_from_task(task);

1183

if (curr)

1183

if (curr)

1184

css_get(&curr->css);

1184

css_get(&curr->css);

1185

task_unlock(task);

1185

task_unlock(task);

1186

}

1186

}

1187

if (!curr)

1187

if (!curr)

1188

return 0;

1188

return 0;

1189

/*

1189

/*

1190

* We should check use_hierarchy of "memcg" not "curr". Because checking

1190

* We should check use_hierarchy of "memcg" not "curr". Because checking

1191

* use_hierarchy of "curr" here make this function true if hierarchy is

1191

* use_hierarchy of "curr" here make this function true if hierarchy is

1192

* enabled in "curr" and "curr" is a child of "memcg" in *cgroup*

1192

* enabled in "curr" and "curr" is a child of "memcg" in *cgroup*

1193

* hierarchy(even if use_hierarchy is disabled in "memcg").

1193

* hierarchy(even if use_hierarchy is disabled in "memcg").

1194

*/

1194

*/

1195

ret = mem_cgroup_same_or_subtree(memcg, curr);

1195

ret = mem_cgroup_same_or_subtree(memcg, curr);

1196

css_put(&curr->css);

1196

css_put(&curr->css);

1197

return ret;

1197

return ret;

1198

}

1198

}

1199

1200

int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)

1200

int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)

1201

{

1201

{

1202

unsigned long inactive_ratio;

1202

unsigned long inactive_ratio;

1203

unsigned long inactive;

1203

unsigned long inactive;

1204

unsigned long active;

1204

unsigned long active;

1205

unsigned long gb;

1205

unsigned long gb;

1206

1207

inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);

1207

inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);

1208

active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);

1208

active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);

1209

1210

gb = (inactive + active) >> (30 - PAGE_SHIFT);

1210

gb = (inactive + active) >> (30 - PAGE_SHIFT);

1211

if (gb)

1211

if (gb)

1212

inactive_ratio = int_sqrt(10 * gb);

1212

inactive_ratio = int_sqrt(10 * gb);

1213

else

1213

else

1214

inactive_ratio = 1;

1214

inactive_ratio = 1;

1215

1216

return inactive * inactive_ratio < active;

1216

return inactive * inactive_ratio < active;

1217

}

1217

}

1218

1219

int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)

1219

int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)

1220

{

1220

{

1221

unsigned long active;

1221

unsigned long active;

1222

unsigned long inactive;

1222

unsigned long inactive;

1223

1224

inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);

1224

inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);

1225

active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);

1225

active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);

1226

1227

return (active > inactive);

1227

return (active > inactive);

1228

}

1228

}

1229

1230

/*
 * Map a res_counter pointer back to the struct mem_cgroup that embeds it
 * as field @member.
 */
#define mem_cgroup_from_res_counter(counter, member)	\
	container_of(counter, struct mem_cgroup, member)

1232

1233

/**

1233

/**

1234

* mem_cgroup_margin - calculate chargeable space of a memory cgroup

1234

* mem_cgroup_margin - calculate chargeable space of a memory cgroup

1235

* @memcg: the memory cgroup

1235

* @memcg: the memory cgroup

1236

*

1236

*

1237

* Returns the maximum amount of memory @mem can be charged with, in

1237

* Returns the maximum amount of memory @mem can be charged with, in

1238

* pages.

1238

* pages.

1239

*/

1239

*/

1240

static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)

1240

static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)

1241

{

1241

{

1242

unsigned long long margin;

1242

unsigned long long margin;

1243

1244

margin = res_counter_margin(&memcg->res);

1244

margin = res_counter_margin(&memcg->res);

1245

if (do_swap_account)

1245

if (do_swap_account)

1246

margin = min(margin, res_counter_margin(&memcg->memsw));

1246

margin = min(margin, res_counter_margin(&memcg->memsw));

1247

return margin >> PAGE_SHIFT;

1247

return margin >> PAGE_SHIFT;

1248

}

1248

}

1249

1250

int mem_cgroup_swappiness(struct mem_cgroup *memcg)

1250

int mem_cgroup_swappiness(struct mem_cgroup *memcg)

1251

{

1251

{

1252

struct cgroup *cgrp = memcg->css.cgroup;

1252

struct cgroup *cgrp = memcg->css.cgroup;

1253

1254

/* root ? */

1254

/* root ? */

1255

if (cgrp->parent == NULL)

1255

if (cgrp->parent == NULL)

1256

return vm_swappiness;

1256

return vm_swappiness;

1257

1258

return memcg->swappiness;

1258

return memcg->swappiness;

1259

}

1259

}

1260

1261

/*
 * memcg->moving_account is used for checking possibility that some thread is
 * calling move_account(). When a thread on CPU-A starts moving pages under
 * a memcg, other threads should check memcg->moving_account under
 * rcu_read_lock(), like this:
 *
 *         CPU-A                                    CPU-B
 *                                              rcu_read_lock()
 *         memcg->moving_account+1              if (memcg->moving_account)
 *                                                   take heavy locks.
 *         synchronize_rcu()                    update something.
 *                                              rcu_read_unlock()
 *         start move here.
 */

1275

1276

/* for quick checking without looking up memcg */

1276

/* for quick checking without looking up memcg */

1277

atomic_t memcg_moving __read_mostly;

1277

atomic_t memcg_moving __read_mostly;

1278

1279

static void mem_cgroup_start_move(struct mem_cgroup *memcg)

1279

static void mem_cgroup_start_move(struct mem_cgroup *memcg)

1280

{

1280

{

1281

atomic_inc(&memcg_moving);

1281

atomic_inc(&memcg_moving);

1282

atomic_inc(&memcg->moving_account);

1282

atomic_inc(&memcg->moving_account);

1283

synchronize_rcu();

1283

synchronize_rcu();

1284

}

1284

}

1285

1286

static void mem_cgroup_end_move(struct mem_cgroup *memcg)

1286

static void mem_cgroup_end_move(struct mem_cgroup *memcg)

1287

{

1287

{

1288

/*

1288

/*

1289

* Now, mem_cgroup_clear_mc() may call this function with NULL.

1289

* Now, mem_cgroup_clear_mc() may call this function with NULL.

1290

* We check NULL in callee rather than caller.

1290

* We check NULL in callee rather than caller.

1291

*/

1291

*/

1292

if (memcg) {

1292

if (memcg) {

1293

atomic_dec(&memcg_moving);

1293

atomic_dec(&memcg_moving);

1294

atomic_dec(&memcg->moving_account);

1294

atomic_dec(&memcg->moving_account);

1295

}

1295

}

1296

}

1296

}

1297

1298

/*
 * 2 routines for checking whether "mem" is under move_account() or not.
 *
 * mem_cgroup_stolen() -  checks whether a cgroup is mc.from or not. This
 *			  is used for avoiding races in accounting.  If true,
 *			  pc->mem_cgroup may be overwritten.
 *
 * mem_cgroup_under_move() - checks whether a cgroup is mc.from or mc.to or
 *			  under hierarchy of moving cgroups. This is for
 *			  waiting at high-memory pressure caused by "move".
 */

1309

1310

static bool mem_cgroup_stolen(struct mem_cgroup *memcg)

1310

static bool mem_cgroup_stolen(struct mem_cgroup *memcg)

1311

{

1311

{

1312

VM_BUG_ON(!rcu_read_lock_held());

1312

VM_BUG_ON(!rcu_read_lock_held());

1313

return atomic_read(&memcg->moving_account) > 0;

1313

return atomic_read(&memcg->moving_account) > 0;

1314

}

1314

}

1315

1316

static bool mem_cgroup_under_move(struct mem_cgroup *memcg)

1316

static bool mem_cgroup_under_move(struct mem_cgroup *memcg)

1317

{

1317

{

1318

struct mem_cgroup *from;

1318

struct mem_cgroup *from;

1319

struct mem_cgroup *to;

1319

struct mem_cgroup *to;

1320

bool ret = false;

1320

bool ret = false;

1321

/*

1321

/*

1322

* Unlike task_move routines, we access mc.to, mc.from not under

1322

* Unlike task_move routines, we access mc.to, mc.from not under

1323

* mutual exclusion by cgroup_mutex. Here, we take spinlock instead.

1323

* mutual exclusion by cgroup_mutex. Here, we take spinlock instead.

1324

*/

1324

*/

1325

spin_lock(&mc.lock);

1325

spin_lock(&mc.lock);

1326

from = mc.from;

1326

from = mc.from;

1327

to = mc.to;

1327

to = mc.to;

1328

if (!from)

1328

if (!from)

1329

goto unlock;

1329

goto unlock;

1330

1331

ret = mem_cgroup_same_or_subtree(memcg, from)

1331

ret = mem_cgroup_same_or_subtree(memcg, from)

1332

|| mem_cgroup_same_or_subtree(memcg, to);

1332

|| mem_cgroup_same_or_subtree(memcg, to);

1333

unlock:

1333

unlock:

1334

spin_unlock(&mc.lock);

1334

spin_unlock(&mc.lock);

1335

return ret;

1335

return ret;

1336

}

1336

}

1337

1338

static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)

1338

static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)

1339

{

1339

{

1340

if (mc.moving_task && current != mc.moving_task) {

1340

if (mc.moving_task && current != mc.moving_task) {

1341

if (mem_cgroup_under_move(memcg)) {

1341

if (mem_cgroup_under_move(memcg)) {

1342

DEFINE_WAIT(wait);

1342

DEFINE_WAIT(wait);

1343

prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);

1343

prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);

1344

/* moving charge context might have finished. */

1344

/* moving charge context might have finished. */

1345

if (mc.moving_task)

1345

if (mc.moving_task)

1346

schedule();

1346

schedule();

1347

finish_wait(&mc.waitq, &wait);

1347

finish_wait(&mc.waitq, &wait);

1348

return true;

1348

return true;

1349

}

1349

}

1350

}

1350

}

1351

return false;

1351

return false;

1352

}

1352

}

1353

1354

/*

1354

/*

1355

* Take this lock when

1355

* Take this lock when

1356

* - a code tries to modify page's memcg while it's USED.

1356

* - a code tries to modify page's memcg while it's USED.

1357

* - a code tries to modify page state accounting in a memcg.

1357

* - a code tries to modify page state accounting in a memcg.

1358

* see mem_cgroup_stolen(), too.

1358

* see mem_cgroup_stolen(), too.

1359

*/

1359

*/

1360

static void move_lock_mem_cgroup(struct mem_cgroup *memcg,

1360

static void move_lock_mem_cgroup(struct mem_cgroup *memcg,

1361

unsigned long *flags)

1361

unsigned long *flags)

1362

{

1362

{

1363

spin_lock_irqsave(&memcg->move_lock, *flags);

1363

spin_lock_irqsave(&memcg->move_lock, *flags);

1364

}

1364

}

1365

1366

static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,

1366

static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,

1367

unsigned long *flags)

1367

unsigned long *flags)

1368

{

1368

{

1369

spin_unlock_irqrestore(&memcg->move_lock, *flags);

1369

spin_unlock_irqrestore(&memcg->move_lock, *flags);

1370

}

1370

}

1371

1372

/**

1372

/**

1373

* mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.

1373

* mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.

1374

* @memcg: The memory cgroup that went over limit

1374

* @memcg: The memory cgroup that went over limit

1375

* @p: Task that is going to be killed

1375

* @p: Task that is going to be killed

1376

*

1376

*

1377

* NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is

1377

* NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is

1378

* enabled

1378

* enabled

1379

*/

1379

*/

1380

void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)

1380

void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)

1381

{

1381

{

1382

struct cgroup *task_cgrp;

1382

struct cgroup *task_cgrp;

1383

struct cgroup *mem_cgrp;

1383

struct cgroup *mem_cgrp;

1384

/*

1384

/*

1385

* Need a buffer in BSS, can't rely on allocations. The code relies

1385

* Need a buffer in BSS, can't rely on allocations. The code relies

1386

* on the assumption that OOM is serialized for memory controller.

1386

* on the assumption that OOM is serialized for memory controller.

1387

* If this assumption is broken, revisit this code.

1387

* If this assumption is broken, revisit this code.

1388

*/

1388

*/

1389

static char memcg_name[PATH_MAX];

1389

static char memcg_name[PATH_MAX];

1390

int ret;

1390

int ret;

1391

1392

if (!memcg || !p)

1392

if (!memcg || !p)

1393

return;

1393

return;

1394

1395

rcu_read_lock();

1395

rcu_read_lock();

1396

1397

mem_cgrp = memcg->css.cgroup;

1397

mem_cgrp = memcg->css.cgroup;

1398

task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);

1398

task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);

1399

1400

ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);

1400

ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);

1401

if (ret < 0) {

1401

if (ret < 0) {

1402

/*

1402

/*

1403

* Unfortunately, we are unable to convert to a useful name

1403

* Unfortunately, we are unable to convert to a useful name

1404

* But we'll still print out the usage information

1404

* But we'll still print out the usage information

1405

*/

1405

*/

1406

rcu_read_unlock();

1406

rcu_read_unlock();

1407

goto done;

1407

goto done;

1408

}

1408

}

1409

rcu_read_unlock();

1409

rcu_read_unlock();

1410

1411

printk(KERN_INFO "Task in %s killed", memcg_name);

1411

printk(KERN_INFO "Task in %s killed", memcg_name);

1412

1413

rcu_read_lock();

1413

rcu_read_lock();

1414

ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);

1414

ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);

1415

if (ret < 0) {

1415

if (ret < 0) {

1416

rcu_read_unlock();

1416

rcu_read_unlock();

1417

goto done;

1417

goto done;

1418

}

1418

}

1419

rcu_read_unlock();

1419

rcu_read_unlock();

1420

1421

/*

1421

/*

1422

* Continues from above, so we don't need an KERN_ level

1422

* Continues from above, so we don't need an KERN_ level

1423

*/

1423

*/

1424

printk(KERN_CONT " as a result of limit of %s\n", memcg_name);

1424

printk(KERN_CONT " as a result of limit of %s\n", memcg_name);

1425

done:

1425

done:

1426

1427

printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",

1427

printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",

1428

res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,

1428

res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,

1429

res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,

1429

res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,

1430

res_counter_read_u64(&memcg->res, RES_FAILCNT));

1430

res_counter_read_u64(&memcg->res, RES_FAILCNT));

1431

printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "

1431

printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "

1432

"failcnt %llu\n",

1432

"failcnt %llu\n",

1433

res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,

1433

res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,

1434

res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,

1434

res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,

1435

res_counter_read_u64(&memcg->memsw, RES_FAILCNT));

1435

res_counter_read_u64(&memcg->memsw, RES_FAILCNT));

1436

}

1436

}

1437

1438

/*

1438

/*

1439

* This function returns the number of memcg under hierarchy tree. Returns

1439

* This function returns the number of memcg under hierarchy tree. Returns

1440

* 1(self count) if no children.

1440

* 1(self count) if no children.

1441

*/

1441

*/

1442

static int mem_cgroup_count_children(struct mem_cgroup *memcg)

1442

static int mem_cgroup_count_children(struct mem_cgroup *memcg)

1443

{

1443

{

1444

int num = 0;

1444

int num = 0;

1445

struct mem_cgroup *iter;

1445

struct mem_cgroup *iter;

1446

1447

for_each_mem_cgroup_tree(iter, memcg)

1447

for_each_mem_cgroup_tree(iter, memcg)

1448

num++;

1448

num++;

1449

return num;

1449

return num;

1450

}

1450

}

1451

1452

/*

1452

/*

1453

* Return the memory (and swap, if configured) limit for a memcg.

1453

* Return the memory (and swap, if configured) limit for a memcg.

1454

*/

1454

*/

1455

static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)

1455

static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)

1456

{

1456

{

1457

u64 limit;

1457

u64 limit;

1458

u64 memsw;

1458

u64 memsw;

1459

1460

limit = res_counter_read_u64(&memcg->res, RES_LIMIT);

1460

limit = res_counter_read_u64(&memcg->res, RES_LIMIT);

1461

limit += total_swap_pages << PAGE_SHIFT;

1461

limit += total_swap_pages << PAGE_SHIFT;

1462

1463

memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

1463

memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

1464

/*

1464

/*

1465

* If memsw is finite and limits the amount of swap space available

1465

* If memsw is finite and limits the amount of swap space available

1466

* to this memcg, return that limit.

1466

* to this memcg, return that limit.

1467

*/

1467

*/

1468

return min(limit, memsw);

1468

return min(limit, memsw);

1469

}

1469

}

1470

1471

/*
 * Pick and kill a task in @memcg's hierarchy after the memcg ran out of
 * memory.
 *
 * @memcg:    memory cgroup that hit its limit
 * @gfp_mask: forwarded to check_panic_on_oom() and oom_kill_process()
 * @order:    forwarded to check_panic_on_oom() and oom_kill_process()
 *
 * Every task of every memcg under @memcg is scanned.  A task reported as
 * OOM_SCAN_SELECT wins outright (its score becomes ULONG_MAX); otherwise
 * the task with the highest oom_badness() score is kept.  A reference is
 * held on the current candidate and dropped when it is replaced or when
 * the scan is aborted.
 */
void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
			      int order)
{
	struct task_struct *victim = NULL;
	unsigned long victim_points = 0;
	unsigned long totalpages;
	unsigned int points = 0;
	struct mem_cgroup *pos;
	u64 limit_pages;

	/*
	 * If current has a pending SIGKILL, then automatically select it.  The
	 * goal is to allow it to allocate so that it may quickly exit and free
	 * its memory.
	 */
	if (fatal_signal_pending(current)) {
		set_thread_flag(TIF_MEMDIE);
		return;
	}

	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);

	/* Use at least one page so the division below is never by zero. */
	limit_pages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT;
	totalpages = limit_pages ? limit_pages : 1;

	for_each_mem_cgroup_tree(pos, memcg) {
		struct cgroup *cgrp = pos->css.cgroup;
		struct cgroup_iter task_it;
		struct task_struct *tsk;

		cgroup_iter_start(cgrp, &task_it);
		while ((tsk = cgroup_iter_next(cgrp, &task_it))) {
			switch (oom_scan_process_thread(tsk, totalpages, NULL,
							false)) {
			case OOM_SCAN_SELECT:
				if (victim)
					put_task_struct(victim);
				victim = tsk;
				victim_points = ULONG_MAX;
				get_task_struct(victim);
				/* fall through */
			case OOM_SCAN_CONTINUE:
				continue;
			case OOM_SCAN_ABORT:
				/* Leaving both walks early: release each one. */
				cgroup_iter_end(cgrp, &task_it);
				mem_cgroup_iter_break(memcg, pos);
				if (victim)
					put_task_struct(victim);
				return;
			case OOM_SCAN_OK:
				break;
			}

			points = oom_badness(tsk, memcg, NULL, totalpages);
			if (points > victim_points) {
				if (victim)
					put_task_struct(victim);
				victim = tsk;
				victim_points = points;
				get_task_struct(victim);
			}
		}
		cgroup_iter_end(cgrp, &task_it);
	}

	if (!victim)
		return;

	points = victim_points * 1000 / totalpages;
	oom_kill_process(victim, gfp_mask, order, points, totalpages, memcg,
			 NULL, "Memory cgroup out of memory");
}

1537

1538

static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,

1538

static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,

1539

gfp_t gfp_mask,

1539

gfp_t gfp_mask,

1540

unsigned long flags)

1540

unsigned long flags)

1541

{

1541

{

1542

unsigned long total = 0;

1542

unsigned long total = 0;

1543

bool noswap = false;

1543

bool noswap = false;

1544

int loop;

1544

int loop;

1545

1546

if (flags & MEM_CGROUP_RECLAIM_NOSWAP)

1546

if (flags & MEM_CGROUP_RECLAIM_NOSWAP)

1547

noswap = true;

1547

noswap = true;

1548

if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)

1548

if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)

1549

noswap = true;

1549

noswap = true;

1550

1551

for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {

1551

for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {

1552

if (loop)

1552

if (loop)

1553

drain_all_stock_async(memcg);

1553

drain_all_stock_async(memcg);

1554

total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);

1554

total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);

1555

/*

1555

/*

1556

* Allow limit shrinkers, which are triggered directly

1556

* Allow limit shrinkers, which are triggered directly

1557

* by userspace, to catch signals and stop reclaim

1557

* by userspace, to catch signals and stop reclaim

1558

* after minimal progress, regardless of the margin.

1558

* after minimal progress, regardless of the margin.

1559

*/

1559

*/

1560

if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))

1560

if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))

1561

break;

1561

break;

1562

if (mem_cgroup_margin(memcg))

1562

if (mem_cgroup_margin(memcg))

1563

break;

1563

break;

1564

/*

1564

/*

1565

* If nothing was reclaimed after two attempts, there

1565

* If nothing was reclaimed after two attempts, there

1566

* may be no reclaimable pages in this hierarchy.

1566

* may be no reclaimable pages in this hierarchy.

1567

*/

1567

*/

1568

if (loop && !total)

1568

if (loop && !total)

1569

break;

1569

break;

1570

}

1570

}

1571

return total;

1571

return total;

1572

}

1572

}

1573

1574

/**

1574

/**

1575

* test_mem_cgroup_node_reclaimable

1575

* test_mem_cgroup_node_reclaimable

1576

* @memcg: the target memcg

1576

* @memcg: the target memcg

1577

* @nid: the node ID to be checked.

1577

* @nid: the node ID to be checked.

1578

* @noswap : specify true here if the user wants flle only information.

1578

* @noswap : specify true here if the user wants flle only information.

1579

*

1579

*

1580

* This function returns whether the specified memcg contains any

1580

* This function returns whether the specified memcg contains any

1581

* reclaimable pages on a node. Returns true if there are any reclaimable

1581

* reclaimable pages on a node. Returns true if there are any reclaimable

1582

* pages in the node.

1582

* pages in the node.

1583

*/

1583

*/

1584

static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,

1584

static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,

1585

int nid, bool noswap)

1585

int nid, bool noswap)

1586

{

1586

{

1587

if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))

1587

if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))

1588

return true;

1588

return true;

1589

if (noswap || !total_swap_pages)

1589

if (noswap || !total_swap_pages)

1590

return false;

1590

return false;

1591

if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))

1591

if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))

1592

return true;

1592

return true;

1593

return false;

1593

return false;

1594

1595

}

1595

}

1596

#if MAX_NUMNODES > 1

1596

#if MAX_NUMNODES > 1

1597

1598

/*

1598

/*

1599

* Always updating the nodemask is not very good - even if we have an empty

1599

* Always updating the nodemask is not very good - even if we have an empty

1600

* list or the wrong list here, we can start from some node and traverse all

1600

* list or the wrong list here, we can start from some node and traverse all

1601

* nodes based on the zonelist. So update the list loosely once per 10 secs.

1601

* nodes based on the zonelist. So update the list loosely once per 10 secs.

1602

*

1602

*

1603

*/

1603

*/

1604

static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)

1604

static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)

1605

{

1605

{

1606

int nid;

1606

int nid;

1607

/*

1607

/*

1608

* numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET

1608

* numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET

1609

* pagein/pageout changes since the last update.

1609

* pagein/pageout changes since the last update.

1610

*/

1610

*/

1611

if (!atomic_read(&memcg->numainfo_events))

1611

if (!atomic_read(&memcg->numainfo_events))

1612

return;

1612

return;

1613

if (atomic_inc_return(&memcg->numainfo_updating) > 1)

1613

if (atomic_inc_return(&memcg->numainfo_updating) > 1)

1614

return;

1614

return;

1615

1616

/* make a nodemask where this memcg uses memory from */

1616

/* make a nodemask where this memcg uses memory from */

1617

memcg->scan_nodes = node_states[N_HIGH_MEMORY];

1617

memcg->scan_nodes = node_states[N_HIGH_MEMORY];

1618

1619

for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {

1619

for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {

1620

1621

if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))

1621

if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))

1622

node_clear(nid, memcg->scan_nodes);

1622

node_clear(nid, memcg->scan_nodes);

1623

}

1623

}

1624

1625

atomic_set(&memcg->numainfo_events, 0);

1625

atomic_set(&memcg->numainfo_events, 0);

1626

atomic_set(&memcg->numainfo_updating, 0);

1626

atomic_set(&memcg->numainfo_updating, 0);

1627

}

1627

}

1628

1629

/*

1629

/*

1630

* Selecting a node where we start reclaim from. Because what we need is just

1630

* Selecting a node where we start reclaim from. Because what we need is just

1631

* reducing usage counter, start from anywhere is O,K. Considering

1631

* reducing usage counter, start from anywhere is O,K. Considering

1632

* memory reclaim from current node, there are pros. and cons.

1632

* memory reclaim from current node, there are pros. and cons.

1633

*

1633

*

1634

* Freeing memory from current node means freeing memory from a node which

1634

* Freeing memory from current node means freeing memory from a node which

1635

* we'll use or we've used. So, it may make LRU bad. And if several threads

1635

* we'll use or we've used. So, it may make LRU bad. And if several threads

1636

* hit limits, it will see a contention on a node. But freeing from remote

1636

* hit limits, it will see a contention on a node. But freeing from remote

1637

* node means more costs for memory reclaim because of memory latency.

1637

* node means more costs for memory reclaim because of memory latency.

1638

*

1638

*

1639

* Now, we use round-robin. Better algorithm is welcomed.

1639

* Now, we use round-robin. Better algorithm is welcomed.

1640

*/

1640

*/

1641

int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)

1641

int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)

1642

{

1642

{

1643

int node;

1643

int node;

1644

1645

mem_cgroup_may_update_nodemask(memcg);

1645

mem_cgroup_may_update_nodemask(memcg);

1646

node = memcg->last_scanned_node;

1646

node = memcg->last_scanned_node;

1647

1648

node = next_node(node, memcg->scan_nodes);

1648

node = next_node(node, memcg->scan_nodes);

1649

if (node == MAX_NUMNODES)

1649

if (node == MAX_NUMNODES)

1650

node = first_node(memcg->scan_nodes);

1650

node = first_node(memcg->scan_nodes);

1651

/*

1651

/*

1652

* We call this when we hit limit, not when pages are added to LRU.

1652

* We call this when we hit limit, not when pages are added to LRU.

1653

* No LRU may hold pages because all pages are UNEVICTABLE or

1653

* No LRU may hold pages because all pages are UNEVICTABLE or

1654

* memcg is too small and all pages are not on LRU. In that case,

1654

* memcg is too small and all pages are not on LRU. In that case,

1655

* we use curret node.

1655

* we use curret node.

1656

*/

1656

*/

1657

if (unlikely(node == MAX_NUMNODES))

1657

if (unlikely(node == MAX_NUMNODES))

1658

node = numa_node_id();

1658

node = numa_node_id();

1659

1660

memcg->last_scanned_node = node;

1660

memcg->last_scanned_node = node;

1661

return node;

1661

return node;

1662

}

1662

}

1663

1664

/*

1664

/*

1665

* Check all nodes whether it contains reclaimable pages or not.

1665

* Check all nodes whether it contains reclaimable pages or not.

1666

* For quick scan, we make use of scan_nodes. This will allow us to skip

1666

* For quick scan, we make use of scan_nodes. This will allow us to skip

1667

* unused nodes. But scan_nodes is lazily updated and may not cotain

1667

* unused nodes. But scan_nodes is lazily updated and may not cotain

1668

* enough new information. We need to do double check.

1668

* enough new information. We need to do double check.

1669

*/

1669

*/

1670

static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)

1670

static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)

1671

{

1671

{

1672

int nid;

1672

int nid;

1673

1674

/*

1674

/*

1675

* quick check...making use of scan_node.

1675

* quick check...making use of scan_node.

1676

* We can skip unused nodes.

1676

* We can skip unused nodes.

1677

*/

1677

*/

1678

if (!nodes_empty(memcg->scan_nodes)) {

1678

if (!nodes_empty(memcg->scan_nodes)) {

1679

for (nid = first_node(memcg->scan_nodes);

1679

for (nid = first_node(memcg->scan_nodes);

1680

nid < MAX_NUMNODES;

1680

nid < MAX_NUMNODES;

1681

nid = next_node(nid, memcg->scan_nodes)) {

1681

nid = next_node(nid, memcg->scan_nodes)) {

1682

1683

if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))

1683

if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))

1684

return true;

1684

return true;

1685

}

1685

}

1686

}

1686

}

1687

/*

1687

/*

1688

* Check rest of nodes.

1688

* Check rest of nodes.

1689

*/

1689

*/

1690

for_each_node_state(nid, N_HIGH_MEMORY) {

1690

for_each_node_state(nid, N_HIGH_MEMORY) {

1691

if (node_isset(nid, memcg->scan_nodes))

1691

if (node_isset(nid, memcg->scan_nodes))

1692

continue;

1692

continue;

1693

if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))

1693

if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))

1694

return true;

1694

return true;

1695

}

1695

}

1696

return false;

1696

return false;

1697

}

1697

}

1698

1699

#else

1699

#else

1700

/*
 * Variant built when MAX_NUMNODES is not greater than 1: the victim node is
 * always node 0 and @memcg is not consulted.
 */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	return 0;
}

1704

1705

/*
 * Variant built when MAX_NUMNODES is not greater than 1: only node 0 has to
 * be checked for reclaimable pages of @memcg.
 */
static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
}

1709

#endif

1709

#endif

1710

1711

static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,

1711

static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,

1712

struct zone *zone,

1712

struct zone *zone,

1713

gfp_t gfp_mask,

1713

gfp_t gfp_mask,

1714

unsigned long *total_scanned)

1714

unsigned long *total_scanned)

1715

{

1715

{

1716

struct mem_cgroup *victim = NULL;

1716

struct mem_cgroup *victim = NULL;

1717

int total = 0;

1717

int total = 0;

1718

int loop = 0;

1718

int loop = 0;

1719

unsigned long excess;

1719

unsigned long excess;

1720

unsigned long nr_scanned;

1720

unsigned long nr_scanned;

1721

struct mem_cgroup_reclaim_cookie reclaim = {

1721

struct mem_cgroup_reclaim_cookie reclaim = {

1722

.zone = zone,

1722

.zone = zone,

1723

.priority = 0,

1723

.priority = 0,

1724

};

1724

};

1725

1726

excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;

1726

excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;

1727

1728

while (1) {

1728

while (1) {

1729

victim = mem_cgroup_iter(root_memcg, victim, &reclaim);

1729

victim = mem_cgroup_iter(root_memcg, victim, &reclaim);

1730

if (!victim) {

1730

if (!victim) {

1731

loop++;

1731

loop++;

1732

if (loop >= 2) {

1732

if (loop >= 2) {

1733

/*

1733

/*

1734

* If we have not been able to reclaim

1734

* If we have not been able to reclaim

1735

* anything, it might because there are

1735

* anything, it might because there are

1736

* no reclaimable pages under this hierarchy

1736

* no reclaimable pages under this hierarchy

1737

*/

1737

*/

1738

if (!total)

1738

if (!total)

1739

break;

1739

break;

1740

/*

1740

/*

1741

* We want to do more targeted reclaim.

1741

* We want to do more targeted reclaim.

1742

* excess >> 2 is not to excessive so as to

1742

* excess >> 2 is not to excessive so as to

1743

* reclaim too much, nor too less that we keep

1743

* reclaim too much, nor too less that we keep

1744

* coming back to reclaim from this cgroup

1744

* coming back to reclaim from this cgroup

1745

*/

1745

*/

1746

if (total >= (excess >> 2) ||

1746

if (total >= (excess >> 2) ||

1747

(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))

1747

(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))

1748

break;

1748

break;

1749

}

1749

}

1750

continue;

1750

continue;

1751

}

1751

}

1752

if (!mem_cgroup_reclaimable(victim, false))

1752

if (!mem_cgroup_reclaimable(victim, false))

1753

continue;

1753

continue;

1754

total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,

1754

total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,

1755

zone, &nr_scanned);

1755

zone, &nr_scanned);

1756

*total_scanned += nr_scanned;

1756

*total_scanned += nr_scanned;

1757

if (!res_counter_soft_limit_excess(&root_memcg->res))

1757

if (!res_counter_soft_limit_excess(&root_memcg->res))

1758

break;

1758

break;

1759

}

1759

}

1760

mem_cgroup_iter_break(root_memcg, victim);

1760

mem_cgroup_iter_break(root_memcg, victim);

1761

return total;

1761

return total;

1762

}

1762

}

1763

1764

/*

1764

/*

1765

* Check OOM-Killer is already running under our hierarchy.

1765

* Check OOM-Killer is already running under our hierarchy.

1766

* If someone is running, return false.

1766

* If someone is running, return false.

1767

* Has to be called with memcg_oom_lock

1767

* Has to be called with memcg_oom_lock

1768

*/

1768

*/

1769

static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)

1769

static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)

1770

{

1770

{

1771

struct mem_cgroup *iter, *failed = NULL;

1771

struct mem_cgroup *iter, *failed = NULL;

1772

1773

for_each_mem_cgroup_tree(iter, memcg) {

1773

for_each_mem_cgroup_tree(iter, memcg) {

1774

if (iter->oom_lock) {

1774

if (iter->oom_lock) {

1775

/*

1775

/*

1776

* this subtree of our hierarchy is already locked

1776

* this subtree of our hierarchy is already locked

1777

* so we cannot give a lock.

1777

* so we cannot give a lock.

1778

*/

1778

*/

1779

failed = iter;

1779

failed = iter;

1780

mem_cgroup_iter_break(memcg, iter);

1780

mem_cgroup_iter_break(memcg, iter);

1781

break;

1781

break;

1782

} else

1782

} else

1783

iter->oom_lock = true;

1783

iter->oom_lock = true;

1784

}

1784

}

1785

1786

if (!failed)

1786

if (!failed)

1787

return true;

1787

return true;

1788

1789

/*

1789

/*

1790

* OK, we failed to lock the whole subtree so we have to clean up

1790

* OK, we failed to lock the whole subtree so we have to clean up

1791

* what we set up to the failing subtree

1791

* what we set up to the failing subtree

1792

*/

1792

*/

1793

for_each_mem_cgroup_tree(iter, memcg) {

1793

for_each_mem_cgroup_tree(iter, memcg) {

1794

if (iter == failed) {

1794

if (iter == failed) {

1795

mem_cgroup_iter_break(memcg, iter);

1795

mem_cgroup_iter_break(memcg, iter);

1796

break;

1796

break;

1797

}

1797

}

1798

iter->oom_lock = false;

1798

iter->oom_lock = false;

1799

}

1799

}

1800

return false;

1800

return false;

1801

}

1801

}

1802

1803

/*
 * Clear oom_lock on every memcg in the hierarchy under @memcg.
 * Always returns 0.
 * Has to be called with memcg_oom_lock held.
 */
static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		iter->oom_lock = false;
	return 0;
}

1814

1815

/*
 * Increment the under_oom counter of every memcg in the hierarchy under
 * @memcg. Paired with mem_cgroup_unmark_under_oom().
 */
static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		atomic_inc(&iter->under_oom);
}

1822

1823

/*
 * Decrement the under_oom counter of every memcg in the hierarchy under
 * @memcg, leaving counters that are already 0 untouched.
 */
static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	/*
	 * When a new child is created while the hierarchy is under oom,
	 * mem_cgroup_oom_lock() may not be called. We have to use
	 * atomic_add_unless() here so that such a child's counter does not
	 * drop below 0.
	 */
	for_each_mem_cgroup_tree(iter, memcg)
		atomic_add_unless(&iter->under_oom, -1, 0);
}

1835

1836

/* Held around mem_cgroup_oom_lock()/mem_cgroup_oom_unlock() calls. */
static DEFINE_SPINLOCK(memcg_oom_lock);
/* Tasks waiting in mem_cgroup_handle_oom() are queued here. */
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

1838

1839

/*
 * Wait queue entry used on memcg_oom_waitq, tagged with the memcg the waiter
 * is interested in so that memcg_oom_wake_function() can filter wakeups.
 */
struct oom_wait_info {
	struct mem_cgroup *memcg;	/* memcg the waiting task hit OOM in */
	wait_queue_t wait;		/* entry queued on memcg_oom_waitq */
};

1843

1844

/*
 * Wake callback for entries on memcg_oom_waitq. @arg is the memcg passed to
 * __wake_up(); the waiter is woken only if its memcg and that memcg are the
 * same or one lies in the other's subtree. Returns 0 when the waiter is
 * skipped, otherwise the result of autoremove_wake_function().
 */
static int memcg_oom_wake_function(wait_queue_t *wait,
	unsigned mode, int sync, void *arg)
{
	struct oom_wait_info *info =
		container_of(wait, struct oom_wait_info, wait);
	struct mem_cgroup *waiter_memcg = info->memcg;
	struct mem_cgroup *waker_memcg = (struct mem_cgroup *)arg;
	bool related;

	/*
	 * Both oom_wait_info->memcg and the waking memcg are stable under
	 * us, so the hierarchy checks need no extra RCU care here.
	 */
	related = mem_cgroup_same_or_subtree(waiter_memcg, waker_memcg) ||
		  mem_cgroup_same_or_subtree(waker_memcg, waiter_memcg);
	if (!related)
		return 0;

	return autoremove_wake_function(wait, mode, sync, arg);
}

1863

1864

/*
 * Wake the waiters on memcg_oom_waitq that are related to @memcg; the
 * filtering itself is done by memcg_oom_wake_function().
 */
static void memcg_wakeup_oom(struct mem_cgroup *memcg)
{
	/* for filtering, pass "memcg" as argument. */
	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
}

1869

1870

/*
 * Wake OOM waiters of @memcg, but only if @memcg is non-NULL and its
 * under_oom counter is non-zero.
 */
static void memcg_oom_recover(struct mem_cgroup *memcg)
{
	if (memcg && atomic_read(&memcg->under_oom))
		memcg_wakeup_oom(memcg);
}

1875

1876

/*
 * Try to call the OOM killer for @memcg, or sleep until another task that
 * holds the OOM lock is done.
 *
 * @memcg: the memcg that hit OOM
 * @mask:  gfp mask forwarded to mem_cgroup_out_of_memory()
 * @order: allocation order forwarded to mem_cgroup_out_of_memory()
 *
 * Returns false if we should exit the memory-reclaim loop (the current task
 * has TIF_MEMDIE set or a fatal signal pending), true otherwise.
 */
static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
				  int order)
{
	struct oom_wait_info owait;
	bool locked, need_to_kill;

	/* Hand-initialised wait entry so the custom wake filter is used. */
	owait.memcg = memcg;
	owait.wait.flags = 0;
	owait.wait.func = memcg_oom_wake_function;
	owait.wait.private = current;
	INIT_LIST_HEAD(&owait.wait.task_list);
	need_to_kill = true;
	mem_cgroup_mark_under_oom(memcg);

	/* At first, try to OOM lock hierarchy under memcg.*/
	spin_lock(&memcg_oom_lock);
	locked = mem_cgroup_oom_lock(memcg);
	/*
	 * Even if signal_pending(), we can't quit charge() loop without
	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
	 * under OOM is always welcomed, use TASK_KILLABLE here.
	 */
	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
	/* Kill only if we own the lock and killing is not disabled. */
	if (!locked || memcg->oom_kill_disable)
		need_to_kill = false;
	if (locked)
		mem_cgroup_oom_notify(memcg);
	spin_unlock(&memcg_oom_lock);

	if (need_to_kill) {
		/* We are the killer: leave the wait queue before killing. */
		finish_wait(&memcg_oom_waitq, &owait.wait);
		mem_cgroup_out_of_memory(memcg, mask, order);
	} else {
		/* Sleep until woken through memcg_oom_waitq or killed. */
		schedule();
		finish_wait(&memcg_oom_waitq, &owait.wait);
	}
	spin_lock(&memcg_oom_lock);
	if (locked)
		mem_cgroup_oom_unlock(memcg);
	/* The wakeup is issued whether or not we held the lock. */
	memcg_wakeup_oom(memcg);
	spin_unlock(&memcg_oom_lock);

	mem_cgroup_unmark_under_oom(memcg);

	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
		return false;
	/* Give chance to dying process */
	schedule_timeout_uninterruptible(1);
	return true;
}

1929

1930

/*
 * Currently used to update mapped file statistics, but the routine can be
 * generalized to update other statistics as well.
 *
 * Notes: Race condition
 *
 * We usually use lock_page_cgroup() for accessing page_cgroup members, but
 * it tends to be costly. Considering the following conditions, we don't
 * need to do so _always_.
 *
 * Considering "charge", lock_page_cgroup() is not required because all
 * file-stat operations happen after a page is attached to the radix-tree.
 * There is no race with "charge".
 *
 * Considering "uncharge", we know that memcg intentionally doesn't clear
 * pc->mem_cgroup at "uncharge". So we always see a valid pc->mem_cgroup even
 * if there is a race with "uncharge". The statistics themselves are properly
 * handled by flags.
 *
 * Considering "move", this is the only case where we see a race. To keep the
 * race window small, we first check whether the memcg is under account
 * moving (mem_cgroup_stolen()) and take the move lock only if it is.
 *
 * @page:   page whose statistics are about to be updated
 * @locked: set to true only when the move lock has been taken; otherwise
 *          left untouched
 * @flags:  storage handed to move_lock_mem_cgroup()
 */

void __mem_cgroup_begin_update_page_stat(struct page *page,
				bool *locked, unsigned long *flags)
{
	struct mem_cgroup *memcg;
	struct page_cgroup *pc;

	pc = lookup_page_cgroup(page);
again:
	memcg = pc->mem_cgroup;
	/* Nothing to protect if the page is not charged to a memcg. */
	if (unlikely(!memcg || !PageCgroupUsed(pc)))
		return;
	/*
	 * If this memory cgroup is not under account moving, we don't
	 * need to take move_lock_mem_cgroup(). Because we already hold
	 * rcu_read_lock(), any calls to move_account will be delayed until
	 * rcu_read_unlock() if mem_cgroup_stolen() == true.
	 */
	if (!mem_cgroup_stolen(memcg))
		return;

	move_lock_mem_cgroup(memcg, flags);
	/*
	 * Re-check under the lock: if the page changed owner or became
	 * unused meanwhile, we locked the wrong memcg, so drop it and retry.
	 */
	if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
		move_unlock_mem_cgroup(memcg, flags);
		goto again;
	}
	*locked = true;
}

1981

1982

/*
 * Release the move lock of the memcg that @page is charged to.
 * NOTE(review): presumably only called when
 * __mem_cgroup_begin_update_page_stat() set *locked - confirm against the
 * callers.
 *
 * @page:  page whose statistics update has finished
 * @flags: storage previously handed to move_lock_mem_cgroup()
 */
void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
{
	struct page_cgroup *pc = lookup_page_cgroup(page);

	/*
	 * It's guaranteed that pc->mem_cgroup never changes while
	 * lock is held because a routine modifies pc->mem_cgroup
	 * should take move_lock_mem_cgroup().
	 */
	move_unlock_mem_cgroup(pc->mem_cgroup, flags);
}

1993

1994

void mem_cgroup_update_page_stat(struct page *page,

1994

void mem_cgroup_update_page_stat(struct page *page,

1995

enum mem_cgroup_page_stat_item idx, int val)

1995

enum mem_cgroup_page_stat_item idx, int val)

1996

{

1996

{

1997

struct mem_cgroup *memcg;

1997

struct mem_cgroup *memcg;

1998

struct page_cgroup *pc = lookup_page_cgroup(page);

1998

struct page_cgroup *pc = lookup_page_cgroup(page);

1999

unsigned long uninitialized_var(flags);

1999

unsigned long uninitialized_var(flags);

2000

2001

if (mem_cgroup_disabled())

2001

if (mem_cgroup_disabled())

2002

return;

2002

return;

2003

2004

memcg = pc->mem_cgroup;

2004

memcg = pc->mem_cgroup;

2005

if (unlikely(!memcg || !PageCgroupUsed(pc)))

2005

if (unlikely(!memcg || !PageCgroupUsed(pc)))

2006

return;

2006

return;

2007

2008

switch (idx) {

2008

switch (idx) {

2009

case MEMCG_NR_FILE_MAPPED:

2009

case MEMCG_NR_FILE_MAPPED:

2010

idx = MEM_CGROUP_STAT_FILE_MAPPED;

2010

idx = MEM_CGROUP_STAT_FILE_MAPPED;

2011

break;

2011

break;

2012

default:

2012

default:

2013

BUG();

2013

BUG();

2014

}

2014

}

2015

2016

this_cpu_add(memcg->stat->count[idx], val);

2016

this_cpu_add(memcg->stat->count[idx], val);

2017

}

2017

}

2018

2019

/*
 * Size of the first charge trial. "32" comes from vmscan.c's magic value.
 * TODO: it may be necessary to use bigger numbers on big machines.
 */
#define CHARGE_BATCH 32U
/* Per-cpu stock of pre-charged pages for a single memcg. */
struct memcg_stock_pcp {
	struct mem_cgroup *cached; /* this is never the root cgroup */
	unsigned int nr_pages; /* pages currently stocked for @cached */
	struct work_struct work; /* NOTE(review): presumably the drain work item - confirm */
	unsigned long flags;
/* NOTE(review): looks like a bit number within @flags - confirm at the users */
#define FLUSHING_CACHED_CHARGE 0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);

2033

2034

/*

2034

/*

2035

* Try to consume stocked charge on this cpu. If success, one page is consumed

2035

* Try to consume stocked charge on this cpu. If success, one page is consumed

2036

* from local stock and true is returned. If the stock is 0 or charges from a

2036

* from local stock and true is returned. If the stock is 0 or charges from a

2037

* cgroup which is not current target, returns false. This stock will be

2037

* cgroup which is not current target, returns false. This stock will be

2038

* refilled.

2038

* refilled.

2039

*/

2039

*/

2040

static bool consume_stock(struct mem_cgroup *memcg)

2040

static bool consume_stock(struct mem_cgroup *memcg)

2041

{

2041

{

2042

struct memcg_stock_pcp *stock;

2042

struct memcg_stock_pcp *stock;

2043

bool ret = true;

2043

bool ret = true;

2044

2045

stock = &get_cpu_var(memcg_stock);

2045

stock = &get_cpu_var(memcg_stock);

2046

if (memcg == stock->cached && stock->nr_pages)

2046

if (memcg == stock->cached && stock->nr_pages)

2047

stock->nr_pages--;

2047

stock->nr_pages--;

2048

else /* need to call res_counter_charge */

2048

else /* need to call res_counter_charge */

2049

ret = false;

2049

ret = false;

2050

put_cpu_var(memcg_stock);

2050

put_cpu_var(memcg_stock);

2051

return ret;

2051

return ret;

2052

}

2052

}

2053

2054

/*

2054

/*

2055

* Returns stocks cached in percpu to res_counter and reset cached information.

2055

* Returns stocks cached in percpu to res_counter and reset cached information.

2056

*/

2056

*/

2057

static void drain_stock(struct memcg_stock_pcp *stock)

2057

static void drain_stock(struct memcg_stock_pcp *stock)

2058

{

2058

{

2059

struct mem_cgroup *old = stock->cached;

2059

struct mem_cgroup *old = stock->cached;

2060

2061

if (stock->nr_pages) {

2061

if (stock->nr_pages) {

2062

unsigned long bytes = stock->nr_pages * PAGE_SIZE;

2062

unsigned long bytes = stock->nr_pages * PAGE_SIZE;

2063

2064

res_counter_uncharge(&old->res, bytes);

2064

res_counter_uncharge(&old->res, bytes);

2065

if (do_swap_account)

2065

if (do_swap_account)

2066

res_counter_uncharge(&old->memsw, bytes);

2066

res_counter_uncharge(&old->memsw, bytes);

2067

stock->nr_pages = 0;

2067

stock->nr_pages = 0;

2068

}

2068

}

2069

stock->cached = NULL;

2069

stock->cached = NULL;

2070

}

2070

}

2071

2072

/*

2072

/*

2073

* This must be called under preempt disabled or must be called by

2073

* This must be called under preempt disabled or must be called by

2074

* a thread which is pinned to local cpu.

2074

* a thread which is pinned to local cpu.

2075

*/

2075

*/

2076

static void drain_local_stock(struct work_struct *dummy)

2076

static void drain_local_stock(struct work_struct *dummy)

2077

{

2077

{

2078

struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);

2078

struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);

2079

drain_stock(stock);

2079

drain_stock(stock);

2080

clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);

2080

clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);

2081

}

2081

}

2082

2083

/*

2083

/*

2084

* Cache charges(val) which is from res_counter, to local per_cpu area.

2084

* Cache charges(val) which is from res_counter, to local per_cpu area.

2085

* This will be consumed by consume_stock() function, later.

2085

* This will be consumed by consume_stock() function, later.

2086

*/

2086

*/

2087

static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)

2087

static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)

2088

{

2088

{

2089

struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);

2089

struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);

2090

2091

if (stock->cached != memcg) { /* reset if necessary */

2091

if (stock->cached != memcg) { /* reset if necessary */

2092

drain_stock(stock);

2092

drain_stock(stock);

2093

stock->cached = memcg;

2093

stock->cached = memcg;

2094

}

2094

}

2095

stock->nr_pages += nr_pages;

2095

stock->nr_pages += nr_pages;

2096

put_cpu_var(memcg_stock);

2096

put_cpu_var(memcg_stock);

2097

}

2097

}

2098

2099

/*

2099

/*

2100

* Drains all per-CPU charge caches for given root_memcg resp. subtree

2100

* Drains all per-CPU charge caches for given root_memcg resp. subtree

2101

* of the hierarchy under it. sync flag says whether we should block

2101

* of the hierarchy under it. sync flag says whether we should block

2102

* until the work is done.

2102

* until the work is done.

2103

*/

2103

*/

2104

static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)

2104

static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)

2105

{

2105

{

2106

int cpu, curcpu;

2106

int cpu, curcpu;

2107

2108

/* Notify other cpus that system-wide "drain" is running */

2108

/* Notify other cpus that system-wide "drain" is running */

2109

get_online_cpus();

2109

get_online_cpus();

2110

curcpu = get_cpu();

2110

curcpu = get_cpu();

2111

for_each_online_cpu(cpu) {

2111

for_each_online_cpu(cpu) {

2112

struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);

2112

struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);

2113

struct mem_cgroup *memcg;

2113

struct mem_cgroup *memcg;

2114

2115

memcg = stock->cached;

2115

memcg = stock->cached;

2116

if (!memcg || !stock->nr_pages)

2116

if (!memcg || !stock->nr_pages)

2117

continue;

2117

continue;

2118

if (!mem_cgroup_same_or_subtree(root_memcg, memcg))

2118

if (!mem_cgroup_same_or_subtree(root_memcg, memcg))

2119

continue;

2119

continue;

2120

if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {

2120

if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {

2121

if (cpu == curcpu)

2121

if (cpu == curcpu)

2122

drain_local_stock(&stock->work);

2122

drain_local_stock(&stock->work);

2123

else

2123

else

2124

schedule_work_on(cpu, &stock->work);

2124

schedule_work_on(cpu, &stock->work);

2125

}

2125

}

2126

}

2126

}

2127

put_cpu();

2127

put_cpu();

2128

2129

if (!sync)

2129

if (!sync)

2130

goto out;

2130

goto out;

2131

2132

for_each_online_cpu(cpu) {

2132

for_each_online_cpu(cpu) {

2133

struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);

2133

struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);

2134

if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))

2134

if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))

2135

flush_work(&stock->work);

2135

flush_work(&stock->work);

2136

}

2136

}

2137

out:

2137

out:

2138

put_online_cpus();

2138

put_online_cpus();

2139

}

2139

}

2140

2141

/*

2141

/*

2142

* Tries to drain stocked charges in other cpus. This function is asynchronous

2142

* Tries to drain stocked charges in other cpus. This function is asynchronous

2143

* and just put a work per cpu for draining localy on each cpu. Caller can

2143

* and just put a work per cpu for draining localy on each cpu. Caller can

2144

* expects some charges will be back to res_counter later but cannot wait for

2144

* expects some charges will be back to res_counter later but cannot wait for

2145

* it.

2145

* it.

2146

*/

2146

*/

2147

static void drain_all_stock_async(struct mem_cgroup *root_memcg)

2147

static void drain_all_stock_async(struct mem_cgroup *root_memcg)

2148

{

2148

{

2149

/*

2149

/*

2150

* If someone calls draining, avoid adding more kworker runs.

2150

* If someone calls draining, avoid adding more kworker runs.

2151

*/

2151

*/

2152

if (!mutex_trylock(&percpu_charge_mutex))

2152

if (!mutex_trylock(&percpu_charge_mutex))

2153

return;

2153

return;

2154

drain_all_stock(root_memcg, false);

2154

drain_all_stock(root_memcg, false);

2155

mutex_unlock(&percpu_charge_mutex);

2155

mutex_unlock(&percpu_charge_mutex);

2156

}

2156

}

2157

2158

/* This is a synchronous drain interface. */

2158

/* This is a synchronous drain interface. */

2159

static void drain_all_stock_sync(struct mem_cgroup *root_memcg)

2159

static void drain_all_stock_sync(struct mem_cgroup *root_memcg)

2160

{

2160

{

2161

/* called when force_empty is called */

2161

/* called when force_empty is called */

2162

mutex_lock(&percpu_charge_mutex);

2162

mutex_lock(&percpu_charge_mutex);

2163

drain_all_stock(root_memcg, true);

2163

drain_all_stock(root_memcg, true);

2164

mutex_unlock(&percpu_charge_mutex);

2164

mutex_unlock(&percpu_charge_mutex);

2165

}

2165

}

2166

2167

/*

2167

/*

2168

* This function drains percpu counter value from DEAD cpu and

2168

* This function drains percpu counter value from DEAD cpu and

2169

* move it to local cpu. Note that this function can be preempted.

2169

* move it to local cpu. Note that this function can be preempted.

2170

*/

2170

*/

2171

static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)

2171

static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)

2172

{

2172

{

2173

int i;

2173

int i;

2174

2175

spin_lock(&memcg->pcp_counter_lock);

2175

spin_lock(&memcg->pcp_counter_lock);

2176

for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {

2176

for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {

2177

long x = per_cpu(memcg->stat->count[i], cpu);

2177

long x = per_cpu(memcg->stat->count[i], cpu);

2178

2179

per_cpu(memcg->stat->count[i], cpu) = 0;

2179

per_cpu(memcg->stat->count[i], cpu) = 0;

2180

memcg->nocpu_base.count[i] += x;

2180

memcg->nocpu_base.count[i] += x;

2181

}

2181

}

2182

for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {

2182

for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {

2183

unsigned long x = per_cpu(memcg->stat->events[i], cpu);

2183

unsigned long x = per_cpu(memcg->stat->events[i], cpu);

2184

2185

per_cpu(memcg->stat->events[i], cpu) = 0;

2185

per_cpu(memcg->stat->events[i], cpu) = 0;

2186

memcg->nocpu_base.events[i] += x;

2186

memcg->nocpu_base.events[i] += x;

2187

}

2187

}

2188

spin_unlock(&memcg->pcp_counter_lock);

2188

spin_unlock(&memcg->pcp_counter_lock);

2189

}

2189

}

2190

2191

/*
 * CPU hotplug notifier callback.
 *
 * Only CPU_DEAD and CPU_DEAD_FROZEN trigger any work: the dead cpu's
 * per-cpu counters are folded into every memcg and its charge stock is
 * drained.  Every other action is ignored.  Always returns NOTIFY_OK.
 */
static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
					unsigned long action,
					void *hcpu)
{
	int dead_cpu = (unsigned long)hcpu;
	struct mem_cgroup *memcg;

	switch (action) {
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		break;
	default:
		/* includes CPU_ONLINE: nothing to do */
		return NOTIFY_OK;
	}

	for_each_mem_cgroup(memcg)
		mem_cgroup_drain_pcp_counter(memcg, dead_cpu);

	drain_stock(&per_cpu(memcg_stock, dead_cpu));

	return NOTIFY_OK;
}

2212

2213

2214

/*
 * Result codes of mem_cgroup_do_charge(); see __mem_cgroup_try_charge()
 * for how each one is handled.
 */
enum {
	CHARGE_OK = 0,		/* the charge succeeded */
	CHARGE_RETRY = 1,	/* try again; retrying is not a bad sign */
	CHARGE_NOMEM = 2,	/* nothing more can be done, return -ENOMEM */
	CHARGE_WOULDBLOCK = 3,	/* __GFP_WAIT not set and not enough resource */
	CHARGE_OOM_DIE = 4,	/* current was killed because of OOM */
};

2222

2223

/*
 * Make one attempt to charge @nr_pages to @memcg, reclaiming if allowed.
 *
 * @memcg:     cgroup to charge
 * @gfp_mask:  allocation flags; reclaim is only attempted with __GFP_WAIT
 * @nr_pages:  number of pages to charge
 * @oom_check: whether the OOM handler may be invoked on failure
 *
 * Returns one of the CHARGE_* codes.
 */
static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
				unsigned int nr_pages, bool oom_check)
{
	unsigned long csize = nr_pages * PAGE_SIZE;
	unsigned long reclaim_flags = 0;
	struct mem_cgroup *mem_over_limit;
	struct res_counter *fail_res;
	int progress;

	if (unlikely(res_counter_charge(&memcg->res, csize, &fail_res))) {
		/* the memory counter refused the charge */
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
	} else {
		if (!do_swap_account)
			return CHARGE_OK;
		if (likely(!res_counter_charge(&memcg->memsw, csize, &fail_res)))
			return CHARGE_OK;

		/* mem+swap counter refused: undo the memory charge */
		res_counter_uncharge(&memcg->res, csize);
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
		reclaim_flags |= MEM_CGROUP_RECLAIM_NOSWAP;
	}

	/*
	 * nr_pages is a huge page (HPAGE_PMD_NR), a batch of regular pages
	 * (CHARGE_BATCH), or a single regular page (1).
	 *
	 * Batching is optional, so never reclaim on its behalf; have the
	 * caller retry with a single page instead.
	 */
	if (nr_pages == CHARGE_BATCH)
		return CHARGE_RETRY;

	if (!(gfp_mask & __GFP_WAIT))
		return CHARGE_WOULDBLOCK;

	progress = mem_cgroup_reclaim(mem_over_limit, gfp_mask, reclaim_flags);
	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
		return CHARGE_RETRY;

	/*
	 * The limit is still exceeded, but reclaim may have freed some
	 * pages, so retry the charge before killing the task.
	 *
	 * This is done for regular pages only: a huge page is unlikely to
	 * fit this close to the limit, and failure falls back to regular
	 * pages anyway.
	 */
	if (nr_pages == 1 && progress)
		return CHARGE_RETRY;

	/*
	 * Charges can be counted twice while a task is being moved, so if a
	 * move is in progress wait for it to finish and retry.
	 */
	if (mem_cgroup_wait_acct_move(mem_over_limit))
		return CHARGE_RETRY;

	/* The caller does not want the oom-killer at all: give up now. */
	if (!oom_check)
		return CHARGE_NOMEM;

	/* check OOM */
	return mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)) ?
		CHARGE_RETRY : CHARGE_OOM_DIE;
}

2290

2291

/*

2291

/*

2292

* __mem_cgroup_try_charge() does

2292

* __mem_cgroup_try_charge() does

2293

* 1. detect memcg to be charged against from passed *mm and *ptr,

2293

* 1. detect memcg to be charged against from passed *mm and *ptr,

2294

* 2. update res_counter

2294

* 2. update res_counter

2295

* 3. call memory reclaim if necessary.

2295

* 3. call memory reclaim if necessary.

2296

*

2296

*

2297

* In some special case, if the task is fatal, fatal_signal_pending() or

2297

* In some special case, if the task is fatal, fatal_signal_pending() or

2298

* has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup

2298

* has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup

2299

* to *ptr. There are two reasons for this. 1: fatal threads should quit as soon

2299

* to *ptr. There are two reasons for this. 1: fatal threads should quit as soon

2300

* as possible without any hazards. 2: all pages should have a valid

2300

* as possible without any hazards. 2: all pages should have a valid

2301

* pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg

2301

* pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg

2302

* pointer, that is treated as a charge to root_mem_cgroup.

2302

* pointer, that is treated as a charge to root_mem_cgroup.

2303

*

2303

*

2304

* So __mem_cgroup_try_charge() will return

2304

* So __mem_cgroup_try_charge() will return

2305

* 0 ... on success, filling *ptr with a valid memcg pointer.

2305

* 0 ... on success, filling *ptr with a valid memcg pointer.

2306

* -ENOMEM ... charge failure because of resource limits.

2306

* -ENOMEM ... charge failure because of resource limits.

2307

* -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup.

2307

* -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup.

2308

*

2308

*

2309

* Unlike the exported interface, an "oom" parameter is added. if oom==true,

2309

* Unlike the exported interface, an "oom" parameter is added. if oom==true,

2310

* the oom-killer can be invoked.

2310

* the oom-killer can be invoked.

2311

*/

2311

*/

2312

static int __mem_cgroup_try_charge(struct mm_struct *mm,

2312

static int __mem_cgroup_try_charge(struct mm_struct *mm,

2313

gfp_t gfp_mask,

2313

gfp_t gfp_mask,

2314

unsigned int nr_pages,

2314

unsigned int nr_pages,

2315

struct mem_cgroup **ptr,

2315

struct mem_cgroup **ptr,

2316

bool oom)

2316

bool oom)

2317

{

2317

{

2318

unsigned int batch = max(CHARGE_BATCH, nr_pages);

2318

unsigned int batch = max(CHARGE_BATCH, nr_pages);

2319

int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;

2319

int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;

2320

struct mem_cgroup *memcg = NULL;

2320

struct mem_cgroup *memcg = NULL;

2321

int ret;

2321

int ret;

2322

2323

/*

2323

/*

2324

* Unlike gloval-vm's OOM-kill, we're not in memory shortage

2324

* Unlike gloval-vm's OOM-kill, we're not in memory shortage

2325

* in system level. So, allow to go ahead dying process in addition to

2325

* in system level. So, allow to go ahead dying process in addition to

2326

* MEMDIE process.

2326

* MEMDIE process.

2327

*/

2327

*/

2328

if (unlikely(test_thread_flag(TIF_MEMDIE)

2328

if (unlikely(test_thread_flag(TIF_MEMDIE)

2329

|| fatal_signal_pending(current)))

2329

|| fatal_signal_pending(current)))

2330

goto bypass;

2330

goto bypass;

2331

2332

/*

2332

/*

2333

* We always charge the cgroup the mm_struct belongs to.

2333

* We always charge the cgroup the mm_struct belongs to.

2334

* The mm_struct's mem_cgroup changes on task migration if the

2334

* The mm_struct's mem_cgroup changes on task migration if the

2335

* thread group leader migrates. It's possible that mm is not

2335

* thread group leader migrates. It's possible that mm is not

2336

* set, if so charge the root memcg (happens for pagecache usage).

2336

* set, if so charge the root memcg (happens for pagecache usage).

2337

*/

2337

*/

2338

if (!*ptr && !mm)

2338

if (!*ptr && !mm)

2339

*ptr = root_mem_cgroup;

2339

*ptr = root_mem_cgroup;

2340

again:

2340

again:

2341

if (*ptr) { /* css should be a valid one */

2341

if (*ptr) { /* css should be a valid one */

2342

memcg = *ptr;

2342

memcg = *ptr;

2343

VM_BUG_ON(css_is_removed(&memcg->css));

2343

VM_BUG_ON(css_is_removed(&memcg->css));

2344

if (mem_cgroup_is_root(memcg))

2344

if (mem_cgroup_is_root(memcg))

2345

goto done;

2345

goto done;

2346

if (nr_pages == 1 && consume_stock(memcg))

2346

if (nr_pages == 1 && consume_stock(memcg))

2347

goto done;

2347

goto done;

2348

css_get(&memcg->css);

2348

css_get(&memcg->css);

2349

} else {

2349

} else {

2350

struct task_struct *p;

2350

struct task_struct *p;

2351

2352

rcu_read_lock();

2352

rcu_read_lock();

2353

p = rcu_dereference(mm->owner);

2353

p = rcu_dereference(mm->owner);

2354

/*

2354

/*

2355

* Because we don't have task_lock(), "p" can exit.

2355

* Because we don't have task_lock(), "p" can exit.

2356

* In that case, "memcg" can point to root or p can be NULL with

2356

* In that case, "memcg" can point to root or p can be NULL with

2357

* race with swapoff. Then, we have small risk of mis-accouning.

2357

* race with swapoff. Then, we have small risk of mis-accouning.

2358

* But such kind of mis-account by race always happens because

2358

* But such kind of mis-account by race always happens because

2359

* we don't have cgroup_mutex(). It's overkill and we allo that

2359

* we don't have cgroup_mutex(). It's overkill and we allo that

2360

* small race, here.

2360

* small race, here.

2361

* (*) swapoff at el will charge against mm-struct not against

2361

* (*) swapoff at el will charge against mm-struct not against

2362

* task-struct. So, mm->owner can be NULL.

2362

* task-struct. So, mm->owner can be NULL.

2363

*/

2363

*/

2364

memcg = mem_cgroup_from_task(p);

2364

memcg = mem_cgroup_from_task(p);

2365

if (!memcg)

2365

if (!memcg)

2366

memcg = root_mem_cgroup;

2366

memcg = root_mem_cgroup;

2367

if (mem_cgroup_is_root(memcg)) {

2367

if (mem_cgroup_is_root(memcg)) {

2368

rcu_read_unlock();

2368

rcu_read_unlock();

2369

goto done;

2369

goto done;

2370

}

2370

}

2371

if (nr_pages == 1 && consume_stock(memcg)) {

2371

if (nr_pages == 1 && consume_stock(memcg)) {

2372

/*

2372

/*

2373

* It seems dagerous to access memcg without css_get().

2373

* It seems dagerous to access memcg without css_get().

2374

* But considering how consume_stok works, it's not

2374

* But considering how consume_stok works, it's not

2375

* necessary. If consume_stock success, some charges

2375

* necessary. If consume_stock success, some charges

2376

* from this memcg are cached on this cpu. So, we

2376

* from this memcg are cached on this cpu. So, we

2377

* don't need to call css_get()/css_tryget() before

2377

* don't need to call css_get()/css_tryget() before

2378

* calling consume_stock().

2378

* calling consume_stock().

2379

*/

2379

*/

2380

rcu_read_unlock();

2380

rcu_read_unlock();

2381

goto done;

2381

goto done;

2382

}

2382

}

2383

/* after here, we may be blocked. we need to get refcnt */

2383

/* after here, we may be blocked. we need to get refcnt */

2384

if (!css_tryget(&memcg->css)) {

2384

if (!css_tryget(&memcg->css)) {

2385

rcu_read_unlock();

2385

rcu_read_unlock();

2386

goto again;

2386

goto again;

2387

}

2387

}

2388

rcu_read_unlock();

2388

rcu_read_unlock();

2389

}

2389

}

2390

2391

do {

2391

do {

2392

bool oom_check;

2392

bool oom_check;

2393

2394

/* If killed, bypass charge */

2394

/* If killed, bypass charge */

2395

if (fatal_signal_pending(current)) {

2395

if (fatal_signal_pending(current)) {

2396

css_put(&memcg->css);

2396

css_put(&memcg->css);

2397

goto bypass;

2397

goto bypass;

2398

}

2398

}

2399

2400

oom_check = false;

2400

oom_check = false;

2401

if (oom && !nr_oom_retries) {

2401

if (oom && !nr_oom_retries) {

2402

oom_check = true;

2402

oom_check = true;

2403

nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;

2403

nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;

2404

}

2404

}

2405

2406

ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);

2406

ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);

2407

switch (ret) {

2407

switch (ret) {

2408

case CHARGE_OK:

2408

case CHARGE_OK:

2409

break;

2409

break;

2410

case CHARGE_RETRY: /* not in OOM situation but retry */

2410

case CHARGE_RETRY: /* not in OOM situation but retry */

2411

batch = nr_pages;

2411

batch = nr_pages;

2412

css_put(&memcg->css);

2412

css_put(&memcg->css);

2413

memcg = NULL;

2413

memcg = NULL;

2414

goto again;

2414

goto again;

2415

case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */

2415

case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */

2416

css_put(&memcg->css);

2416

css_put(&memcg->css);

2417

goto nomem;

2417

goto nomem;

2418

case CHARGE_NOMEM: /* OOM routine works */

2418

case CHARGE_NOMEM: /* OOM routine works */

2419

if (!oom) {

2419

if (!oom) {

2420

css_put(&memcg->css);

2420

css_put(&memcg->css);

2421

goto nomem;

2421

goto nomem;

2422

}

2422

}

2423

/* If oom, we never return -ENOMEM */

2423

/* If oom, we never return -ENOMEM */

2424

nr_oom_retries--;

2424

nr_oom_retries--;

2425

break;

2425

break;

2426

case CHARGE_OOM_DIE: /* Killed by OOM Killer */

2426

case CHARGE_OOM_DIE: /* Killed by OOM Killer */

2427

css_put(&memcg->css);

2427

css_put(&memcg->css);

2428

goto bypass;

2428

goto bypass;

2429

}

2429

}

2430

} while (ret != CHARGE_OK);

2430

} while (ret != CHARGE_OK);

2431

2432

if (batch > nr_pages)

2432

if (batch > nr_pages)

2433

refill_stock(memcg, batch - nr_pages);

2433

refill_stock(memcg, batch - nr_pages);

2434

css_put(&memcg->css);

2434

css_put(&memcg->css);

2435

done:

2435

done:

2436

*ptr = memcg;

2436

*ptr = memcg;

2437

return 0;

2437

return 0;

2438

nomem:

2438

nomem:

2439

*ptr = NULL;

2439

*ptr = NULL;

2440

return -ENOMEM;

2440

return -ENOMEM;

2441

bypass:

2441

bypass:

2442

*ptr = root_mem_cgroup;

2442

*ptr = root_mem_cgroup;

2443

return -EINTR;

2443

return -EINTR;

2444

}

2444

}

2445

2446

/*

2446

/*

2447

* Somemtimes we have to undo a charge we got by try_charge().

2447

* Somemtimes we have to undo a charge we got by try_charge().

2448

* This function is for that and do uncharge, put css's refcnt.

2448

* This function is for that and do uncharge, put css's refcnt.

2449

* gotten by try_charge().

2449

* gotten by try_charge().

2450

*/

2450

*/

2451

static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,

2451

static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,

2452

unsigned int nr_pages)

2452

unsigned int nr_pages)

2453

{

2453

{

2454

if (!mem_cgroup_is_root(memcg)) {

2454

if (!mem_cgroup_is_root(memcg)) {

2455

unsigned long bytes = nr_pages * PAGE_SIZE;

2455

unsigned long bytes = nr_pages * PAGE_SIZE;

2456

2457

res_counter_uncharge(&memcg->res, bytes);

2457

res_counter_uncharge(&memcg->res, bytes);

2458

if (do_swap_account)

2458

if (do_swap_account)

2459

res_counter_uncharge(&memcg->memsw, bytes);

2459

res_counter_uncharge(&memcg->memsw, bytes);

2460

}

2460

}

2461

}

2461

}

2462

2463

/*

2463

/*

2464

* Cancel chrages in this cgroup....doesn't propagate to parent cgroup.

2464

* Cancel chrages in this cgroup....doesn't propagate to parent cgroup.

2465

* This is useful when moving usage to parent cgroup.

2465

* This is useful when moving usage to parent cgroup.

2466

*/

2466

*/

2467

static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,

2467

static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,

2468

unsigned int nr_pages)

2468

unsigned int nr_pages)

2469

{

2469

{

2470

unsigned long bytes = nr_pages * PAGE_SIZE;

2470

unsigned long bytes = nr_pages * PAGE_SIZE;

2471

2472

if (mem_cgroup_is_root(memcg))

2472

if (mem_cgroup_is_root(memcg))

2473

return;

2473

return;

2474

2475

res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);

2475

res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);

2476

if (do_swap_account)

2476

if (do_swap_account)

2477

res_counter_uncharge_until(&memcg->memsw,

2477

res_counter_uncharge_until(&memcg->memsw,

2478

memcg->memsw.parent, bytes);

2478

memcg->memsw.parent, bytes);

2479

}

2479

}

2480

2481

/*

2481

/*

2482

* A helper function to get mem_cgroup from ID. must be called under

2482

* A helper function to get mem_cgroup from ID. must be called under

2483

* rcu_read_lock(). The caller must check css_is_removed() or some if

2483

* rcu_read_lock(). The caller must check css_is_removed() or some if

2484

* it's concern. (dropping refcnt from swap can be called against removed

2484

* it's concern. (dropping refcnt from swap can be called against removed

2485

* memcg.)

2485

* memcg.)

2486

*/

2486

*/

2487

static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)

2487

static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)

2488

{

2488

{

2489

struct cgroup_subsys_state *css;

2489

struct cgroup_subsys_state *css;

2490

2491

/* ID 0 is unused ID */

2491

/* ID 0 is unused ID */

2492

if (!id)

2492

if (!id)

2493

return NULL;

2493

return NULL;

2494

css = css_lookup(&mem_cgroup_subsys, id);

2494

css = css_lookup(&mem_cgroup_subsys, id);

2495

if (!css)

2495

if (!css)

2496

return NULL;

2496

return NULL;

2497

return container_of(css, struct mem_cgroup, css);

2497

return container_of(css, struct mem_cgroup, css);

2498

}

2498

}

2499

2500

struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)

2500

struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)

2501

{

2501

{

2502

struct mem_cgroup *memcg = NULL;

2502

struct mem_cgroup *memcg = NULL;

2503

struct page_cgroup *pc;

2503

struct page_cgroup *pc;

2504

unsigned short id;

2504

unsigned short id;

2505

swp_entry_t ent;

2505

swp_entry_t ent;

2506

2507

VM_BUG_ON(!PageLocked(page));

2507

VM_BUG_ON(!PageLocked(page));

2508

2509

pc = lookup_page_cgroup(page);

2509

pc = lookup_page_cgroup(page);

2510

lock_page_cgroup(pc);

2510

lock_page_cgroup(pc);

2511

if (PageCgroupUsed(pc)) {

2511

if (PageCgroupUsed(pc)) {

2512

memcg = pc->mem_cgroup;

2512

memcg = pc->mem_cgroup;

2513

if (memcg && !css_tryget(&memcg->css))

2513

if (memcg && !css_tryget(&memcg->css))

2514

memcg = NULL;

2514

memcg = NULL;

2515

} else if (PageSwapCache(page)) {

2515

} else if (PageSwapCache(page)) {

2516

ent.val = page_private(page);

2516

ent.val = page_private(page);

2517

id = lookup_swap_cgroup_id(ent);

2517

id = lookup_swap_cgroup_id(ent);

2518

rcu_read_lock();

2518

rcu_read_lock();

2519

memcg = mem_cgroup_lookup(id);

2519

memcg = mem_cgroup_lookup(id);

2520

if (memcg && !css_tryget(&memcg->css))

2520

if (memcg && !css_tryget(&memcg->css))

2521

memcg = NULL;

2521

memcg = NULL;

2522

rcu_read_unlock();

2522

rcu_read_unlock();

2523

}

2523

}

2524

unlock_page_cgroup(pc);

2524

unlock_page_cgroup(pc);

2525

return memcg;

2525

return memcg;

2526

}

2526

}

2527

2528

static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,

2528

static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,

2529

struct page *page,

2529

struct page *page,

2530

unsigned int nr_pages,

2530

unsigned int nr_pages,

2531

enum charge_type ctype,

2531

enum charge_type ctype,

2532

bool lrucare)

2532

bool lrucare)

2533

{

2533

{

2534

struct page_cgroup *pc = lookup_page_cgroup(page);

2534

struct page_cgroup *pc = lookup_page_cgroup(page);

2535

struct zone *uninitialized_var(zone);

2535

struct zone *uninitialized_var(zone);

2536

struct lruvec *lruvec;

2536

struct lruvec *lruvec;

2537

bool was_on_lru = false;

2537

bool was_on_lru = false;

2538

bool anon;

2538

bool anon;

2539

2540

lock_page_cgroup(pc);

2540

lock_page_cgroup(pc);

2541

VM_BUG_ON(PageCgroupUsed(pc));

2541

VM_BUG_ON(PageCgroupUsed(pc));

2542

/*

2542

/*

2543

* we don't need page_cgroup_lock about tail pages, becase they are not

2543

* we don't need page_cgroup_lock about tail pages, becase they are not

2544

* accessed by any other context at this point.

2544

* accessed by any other context at this point.

2545

*/

2545

*/

2546

2547

/*

2547

/*

2548

* In some cases, SwapCache and FUSE(splice_buf->radixtree), the page

2548

* In some cases, SwapCache and FUSE(splice_buf->radixtree), the page

2549

* may already be on some other mem_cgroup's LRU. Take care of it.

2549

* may already be on some other mem_cgroup's LRU. Take care of it.

2550

*/

2550

*/

2551

if (lrucare) {

2551

if (lrucare) {

2552

zone = page_zone(page);

2552

zone = page_zone(page);

2553

spin_lock_irq(&zone->lru_lock);

2553

spin_lock_irq(&zone->lru_lock);

2554

if (PageLRU(page)) {

2554

if (PageLRU(page)) {

2555

lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);

2555

lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);

2556

ClearPageLRU(page);

2556

ClearPageLRU(page);

2557

del_page_from_lru_list(page, lruvec, page_lru(page));

2557

del_page_from_lru_list(page, lruvec, page_lru(page));

2558

was_on_lru = true;

2558

was_on_lru = true;

2559

}

2559

}

2560

}

2560

}

2561

2562

pc->mem_cgroup = memcg;

2562

pc->mem_cgroup = memcg;

2563

/*

2563

/*

2564

* We access a page_cgroup asynchronously without lock_page_cgroup().

2564

* We access a page_cgroup asynchronously without lock_page_cgroup().

2565

* Especially when a page_cgroup is taken from a page, pc->mem_cgroup

2565

* Especially when a page_cgroup is taken from a page, pc->mem_cgroup

2566

* is accessed after testing USED bit. To make pc->mem_cgroup visible

2566

* is accessed after testing USED bit. To make pc->mem_cgroup visible

2567

* before USED bit, we need memory barrier here.

2567

* before USED bit, we need memory barrier here.

2568

* See mem_cgroup_add_lru_list(), etc.

2568

* See mem_cgroup_add_lru_list(), etc.

2569

*/

2569

*/

2570

smp_wmb();

2570

smp_wmb();

2571

SetPageCgroupUsed(pc);

2571

SetPageCgroupUsed(pc);

2572

2573

if (lrucare) {

2573

if (lrucare) {

2574

if (was_on_lru) {

2574

if (was_on_lru) {

2575

lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);

2575

lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);

2576

VM_BUG_ON(PageLRU(page));

2576

VM_BUG_ON(PageLRU(page));

2577

SetPageLRU(page);

2577

SetPageLRU(page);

2578

add_page_to_lru_list(page, lruvec, page_lru(page));

2578

add_page_to_lru_list(page, lruvec, page_lru(page));

2579

}

2579

}

2580

spin_unlock_irq(&zone->lru_lock);

2580

spin_unlock_irq(&zone->lru_lock);

2581

}

2581

}

2582

2583

if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)

2583

if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)

2584

anon = true;

2584

anon = true;

2585

else

2585

else

2586

anon = false;

2586

anon = false;

2587

2588

mem_cgroup_charge_statistics(memcg, anon, nr_pages);

2588

mem_cgroup_charge_statistics(memcg, anon, nr_pages);

2589

unlock_page_cgroup(pc);

2589

unlock_page_cgroup(pc);

2590

2591

/*

2591

/*

2592

* "charge_statistics" updated event counter. Then, check it.

2592

* "charge_statistics" updated event counter. Then, check it.

2593

* Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.

2593

* Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.

2594

* if they exceeds softlimit.

2594

* if they exceeds softlimit.

2595

*/

2595

*/

2596

memcg_check_events(memcg, page);

2596

memcg_check_events(memcg, page);

2597

}

2597

}

2598

2599

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

/* page_cgroup flag bits that are not propagated from head to tail pages. */
#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
/*
 * mem_cgroup_split_huge_fixup - give the tail pages of a huge page being
 * split the same memcg ownership as the head page.
 * @head: head page of the huge page
 *
 * Tail pages are not marked "used" while they belong to a huge page, so the
 * head's owner and flags (minus PCGF_NOCOPY_AT_SPLIT) are copied to each of
 * them.  This runs under zone->lru_lock, 'splitting on pmd' and
 * compound_lock: charge/uncharge cannot happen and move_account() is done
 * under compound_lock(), so no races have to be handled here.
 */
void mem_cgroup_split_huge_fixup(struct page *head)
{
	struct page_cgroup *head_pc = lookup_page_cgroup(head);
	struct page_cgroup *tail_pc;

	if (mem_cgroup_disabled())
		return;

	/* head_pc[1] .. head_pc[HPAGE_PMD_NR - 1] are the tail entries. */
	for (tail_pc = head_pc + 1; tail_pc < head_pc + HPAGE_PMD_NR; tail_pc++) {
		tail_pc->mem_cgroup = head_pc->mem_cgroup;
		smp_wmb();/* see __commit_charge() */
		tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
	}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

2624

2625

/**

2625

/**

2626

* mem_cgroup_move_account - move account of the page

2626

* mem_cgroup_move_account - move account of the page

2627

* @page: the page

2627

* @page: the page

2628

* @nr_pages: number of regular pages (>1 for huge pages)

2628

* @nr_pages: number of regular pages (>1 for huge pages)

2629

* @pc: page_cgroup of the page.

2629

* @pc: page_cgroup of the page.

2630

* @from: mem_cgroup which the page is moved from.

2630

* @from: mem_cgroup which the page is moved from.

2631

* @to: mem_cgroup which the page is moved to. @from != @to.

2631

* @to: mem_cgroup which the page is moved to. @from != @to.

2632

*

2632

*

2633

* The caller must confirm following.

2633

* The caller must confirm following.

2634

* - page is not on LRU (isolate_page() is useful.)

2634

* - page is not on LRU (isolate_page() is useful.)

2635

* - compound_lock is held when nr_pages > 1

2635

* - compound_lock is held when nr_pages > 1

2636

*

2636

*

2637

* This function doesn't do "charge" to new cgroup and doesn't do "uncharge"

2637

* This function doesn't do "charge" to new cgroup and doesn't do "uncharge"

2638

* from old cgroup.

2638

* from old cgroup.

2639

*/

2639

*/

2640

static int mem_cgroup_move_account(struct page *page,

2640

static int mem_cgroup_move_account(struct page *page,

2641

unsigned int nr_pages,

2641

unsigned int nr_pages,

2642

struct page_cgroup *pc,

2642

struct page_cgroup *pc,

2643

struct mem_cgroup *from,

2643

struct mem_cgroup *from,

2644

struct mem_cgroup *to)

2644

struct mem_cgroup *to)

2645

{

2645

{

2646

unsigned long flags;

2646

unsigned long flags;

2647

int ret;

2647

int ret;

2648

bool anon = PageAnon(page);

2648

bool anon = PageAnon(page);

2649

2650

VM_BUG_ON(from == to);

2650

VM_BUG_ON(from == to);

2651

VM_BUG_ON(PageLRU(page));

2651

VM_BUG_ON(PageLRU(page));

2652

/*

2652

/*

2653

* The page is isolated from LRU. So, collapse function

2653

* The page is isolated from LRU. So, collapse function

2654

* will not handle this page. But page splitting can happen.

2654

* will not handle this page. But page splitting can happen.

2655

* Do this check under compound_page_lock(). The caller should

2655

* Do this check under compound_page_lock(). The caller should

2656

* hold it.

2656

* hold it.

2657

*/

2657

*/

2658

ret = -EBUSY;

2658

ret = -EBUSY;

2659

if (nr_pages > 1 && !PageTransHuge(page))

2659

if (nr_pages > 1 && !PageTransHuge(page))

2660

goto out;

2660

goto out;

2661

2662

lock_page_cgroup(pc);

2662

lock_page_cgroup(pc);

2663

2664

ret = -EINVAL;

2664

ret = -EINVAL;

2665

if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)

2665

if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)

2666

goto unlock;

2666

goto unlock;

2667

2668

move_lock_mem_cgroup(from, &flags);

2668

move_lock_mem_cgroup(from, &flags);

2669

2670

if (!anon && page_mapped(page)) {

2670

if (!anon && page_mapped(page)) {

2671

/* Update mapped_file data for mem_cgroup */

2671

/* Update mapped_file data for mem_cgroup */

2672

preempt_disable();

2672

preempt_disable();

2673

__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);

2673

__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);

2674

__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);

2674

__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);

2675

preempt_enable();

2675

preempt_enable();

2676

}

2676

}

2677

mem_cgroup_charge_statistics(from, anon, -nr_pages);

2677

mem_cgroup_charge_statistics(from, anon, -nr_pages);

2678

2679

/* caller should have done css_get */

2679

/* caller should have done css_get */

2680

pc->mem_cgroup = to;

2680

pc->mem_cgroup = to;

2681

mem_cgroup_charge_statistics(to, anon, nr_pages);

2681

mem_cgroup_charge_statistics(to, anon, nr_pages);

2682

/*

2682

/*

2683

* We charges against "to" which may not have any tasks. Then, "to"

2683

* We charges against "to" which may not have any tasks. Then, "to"

2684

* can be under rmdir(). But in current implementation, caller of

2684

* can be under rmdir(). But in current implementation, caller of

2685

* this function is just force_empty() and move charge, so it's

2685

* this function is just force_empty() and move charge, so it's

2686

* guaranteed that "to" is never removed. So, we don't check rmdir

2686

* guaranteed that "to" is never removed. So, we don't check rmdir

2687

* status here.

2687

* status here.

2688

*/

2688

*/

2689

move_unlock_mem_cgroup(from, &flags);

2689

move_unlock_mem_cgroup(from, &flags);

2690

ret = 0;

2690

ret = 0;

2691

unlock:

2691

unlock:

2692

unlock_page_cgroup(pc);

2692

unlock_page_cgroup(pc);

2693

/*

2693

/*

2694

* check events

2694

* check events

2695

*/

2695

*/

2696

memcg_check_events(to, page);

2696

memcg_check_events(to, page);

2697

memcg_check_events(from, page);

2697

memcg_check_events(from, page);

2698

out:

2698

out:

2699

return ret;

2699

return ret;

2700

}

2700

}

2701

2702

/*

2702

/*

2703

* move charges to its parent.

2703

* move charges to its parent.

2704

*/

2704

*/

2705

2706

static int mem_cgroup_move_parent(struct page *page,

2706

static int mem_cgroup_move_parent(struct page *page,

2707

struct page_cgroup *pc,

2707

struct page_cgroup *pc,

2708

struct mem_cgroup *child)

2708

struct mem_cgroup *child)

2709

{

2709

{

2710

struct mem_cgroup *parent;

2710

struct mem_cgroup *parent;

2711

unsigned int nr_pages;

2711

unsigned int nr_pages;

2712

unsigned long uninitialized_var(flags);

2712

unsigned long uninitialized_var(flags);

2713

int ret;

2713

int ret;

2714

2715

/* Is ROOT ? */

2715

/* Is ROOT ? */

2716

if (mem_cgroup_is_root(child))

2716

if (mem_cgroup_is_root(child))

2717

return -EINVAL;

2717

return -EINVAL;

2718

2719

ret = -EBUSY;

2719

ret = -EBUSY;

2720

if (!get_page_unless_zero(page))

2720

if (!get_page_unless_zero(page))

2721

goto out;

2721

goto out;

2722

if (isolate_lru_page(page))

2722

if (isolate_lru_page(page))

2723

goto put;

2723

goto put;

2724

2725

nr_pages = hpage_nr_pages(page);

2725

nr_pages = hpage_nr_pages(page);

2726

2727

parent = parent_mem_cgroup(child);

2727

parent = parent_mem_cgroup(child);

2728

/*

2728

/*

2729

* If no parent, move charges to root cgroup.

2729

* If no parent, move charges to root cgroup.

2730

*/

2730

*/

2731

if (!parent)

2731

if (!parent)

2732

parent = root_mem_cgroup;

2732

parent = root_mem_cgroup;

2733

2734

if (nr_pages > 1)

2734

if (nr_pages > 1)

2735

flags = compound_lock_irqsave(page);

2735

flags = compound_lock_irqsave(page);

2736

2737

ret = mem_cgroup_move_account(page, nr_pages,

2737

ret = mem_cgroup_move_account(page, nr_pages,

2738

pc, child, parent);

2738

pc, child, parent);

2739

if (!ret)

2739

if (!ret)

2740

__mem_cgroup_cancel_local_charge(child, nr_pages);

2740

__mem_cgroup_cancel_local_charge(child, nr_pages);

2741

2742

if (nr_pages > 1)

2742

if (nr_pages > 1)

2743

compound_unlock_irqrestore(page, flags);

2743

compound_unlock_irqrestore(page, flags);

2744

putback_lru_page(page);

2744

putback_lru_page(page);

2745

put:

2745

put:

2746

put_page(page);

2746

put_page(page);

2747

out:

2747

out:

2748

return ret;

2748

return ret;

2749

}

2749

}

2750

2751

/*

2751

/*

2752

* Charge the memory controller for page usage.

2752

* Charge the memory controller for page usage.

2753

* Return

2753

* Return

2754

* 0 if the charge was successful

2754

* 0 if the charge was successful

2755

* < 0 if the cgroup is over its limit

2755

* < 0 if the cgroup is over its limit

2756

*/

2756

*/

2757

static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,

2757

static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,

2758

gfp_t gfp_mask, enum charge_type ctype)

2758

gfp_t gfp_mask, enum charge_type ctype)

2759

{

2759

{

2760

struct mem_cgroup *memcg = NULL;

2760

struct mem_cgroup *memcg = NULL;

2761

unsigned int nr_pages = 1;

2761

unsigned int nr_pages = 1;

2762

bool oom = true;

2762

bool oom = true;

2763

int ret;

2763

int ret;

2764

2765

if (PageTransHuge(page)) {

2765

if (PageTransHuge(page)) {

2766

nr_pages <<= compound_order(page);

2766

nr_pages <<= compound_order(page);

2767

VM_BUG_ON(!PageTransHuge(page));

2767

VM_BUG_ON(!PageTransHuge(page));

2768

/*

2768

/*

2769

* Never OOM-kill a process for a huge page. The

2769

* Never OOM-kill a process for a huge page. The

2770

* fault handler will fall back to regular pages.

2770

* fault handler will fall back to regular pages.

2771

*/

2771

*/

2772

oom = false;

2772

oom = false;

2773

}

2773

}

2774

2775

ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);

2775

ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);

2776

if (ret == -ENOMEM)

2776

if (ret == -ENOMEM)

2777

return ret;

2777

return ret;

2778

__mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);

2778

__mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);

2779

return 0;

2779

return 0;

2780

}

2780

}

2781

2782

/*
 * mem_cgroup_newpage_charge - charge a page as anonymous memory.
 * @page:     the page; it must not be mapped yet, and if it already has a
 *            mapping it must be an anon one (both asserted below)
 * @mm:       mm whose cgroup is charged; must not be NULL
 * @gfp_mask: allocation mask passed on to the charge path
 *
 * Returns 0 without charging when the memory controller is disabled,
 * otherwise the result of mem_cgroup_charge_common() with
 * MEM_CGROUP_CHARGE_TYPE_ANON.
 */
int mem_cgroup_newpage_charge(struct page *page,
			      struct mm_struct *mm, gfp_t gfp_mask)
{
	const enum charge_type anon_type = MEM_CGROUP_CHARGE_TYPE_ANON;

	if (mem_cgroup_disabled())
		return 0;

	VM_BUG_ON(page_mapped(page));
	VM_BUG_ON(page->mapping && !PageAnon(page));
	VM_BUG_ON(!mm);

	return mem_cgroup_charge_common(page, mm, gfp_mask, anon_type);
}

2793

2794

/*

2794

/*

2795

* While swap-in, try_charge -> commit or cancel, the page is locked.

2795

* While swap-in, try_charge -> commit or cancel, the page is locked.

2796

* And when try_charge() successfully returns, one refcnt to memcg without

2796

* And when try_charge() successfully returns, one refcnt to memcg without

2797

* struct page_cgroup is acquired. This refcnt will be consumed by

2797

* struct page_cgroup is acquired. This refcnt will be consumed by

2798

* "commit()" or removed by "cancel()"

2798

* "commit()" or removed by "cancel()"

2799

*/

2799

*/

2800

static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,

2800

static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,

2801

struct page *page,

2801

struct page *page,

2802

gfp_t mask,

2802

gfp_t mask,

2803

struct mem_cgroup **memcgp)

2803

struct mem_cgroup **memcgp)

2804

{

2804

{

2805

struct mem_cgroup *memcg;

2805

struct mem_cgroup *memcg;

2806

struct page_cgroup *pc;

2806

struct page_cgroup *pc;

2807

int ret;

2807

int ret;

2808

2809

pc = lookup_page_cgroup(page);

2809

pc = lookup_page_cgroup(page);

2810

/*

2810

/*

2811

* Every swap fault against a single page tries to charge the

2811

* Every swap fault against a single page tries to charge the

2812

* page, bail as early as possible. shmem_unuse() encounters

2812

* page, bail as early as possible. shmem_unuse() encounters

2813

* already charged pages, too. The USED bit is protected by

2813

* already charged pages, too. The USED bit is protected by

2814

* the page lock, which serializes swap cache removal, which

2814

* the page lock, which serializes swap cache removal, which

2815

* in turn serializes uncharging.

2815

* in turn serializes uncharging.

2816

*/

2816

*/

2817

if (PageCgroupUsed(pc))

2817

if (PageCgroupUsed(pc))

2818

return 0;

2818

return 0;

2819

if (!do_swap_account)

2819

if (!do_swap_account)

2820

goto charge_cur_mm;

2820

goto charge_cur_mm;

2821

/*

2822

* A racing thread's fault, or swapoff, may have already updated

2823

* the pte, and even removed page from swap cache: in those cases

2824

* do_swap_page()'s pte_same() test will fail; but there's also a

2825

* KSM case which does need to charge the page.

2826

*/

2827

if (!PageSwapCache(page))

2828

goto charge_cur_mm;

2829

memcg = try_get_mem_cgroup_from_page(page);

2821

memcg = try_get_mem_cgroup_from_page(page);

2830

if (!memcg)

2822

if (!memcg)

2831

goto charge_cur_mm;

2823

goto charge_cur_mm;

2832

*memcgp = memcg;

2824

*memcgp = memcg;

2833

ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);

2825

ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);

2834

css_put(&memcg->css);

2826

css_put(&memcg->css);

2835

if (ret == -EINTR)

2827

if (ret == -EINTR)

2836

ret = 0;

2828

ret = 0;

2837

return ret;

2829

return ret;

2838

charge_cur_mm:

2830

charge_cur_mm:

2839

ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);

2831

ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);

2840

if (ret == -EINTR)

2832

if (ret == -EINTR)

2841

ret = 0;

2833

ret = 0;

2842

return ret;

2834

return ret;

2843

}

2835

}

2844

2836

2845

int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,

2837

int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,

2846

gfp_t gfp_mask, struct mem_cgroup **memcgp)

2838

gfp_t gfp_mask, struct mem_cgroup **memcgp)

2847

{

2839

{

2848

*memcgp = NULL;

2840

*memcgp = NULL;

2849

if (mem_cgroup_disabled())

2841

if (mem_cgroup_disabled())

2850

return 0;

2842

return 0;

2843

/*

2844

* A racing thread's fault, or swapoff, may have already

2845

* updated the pte, and even removed page from swap cache: in

2846

* those cases unuse_pte()'s pte_same() test will fail; but

2847

* there's also a KSM case which does need to charge the page.

2848

*/

2849

if (!PageSwapCache(page)) {

2850

int ret;

2851

2852

ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);

2853

if (ret == -EINTR)

2854

ret = 0;

2855

return ret;

2856

}

2851

return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);

2857

return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);

2852

}

2858

}

2853

2859

2854

/*
 * mem_cgroup_cancel_charge_swapin - give back a single-page swap-in charge
 * that was reserved but will not be committed.
 * @memcg: cgroup the charge was reserved against; may be NULL
 *
 * Does nothing when the memory controller is disabled or @memcg is NULL.
 */
void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
{
	if (mem_cgroup_disabled() || !memcg)
		return;

	__mem_cgroup_cancel_charge(memcg, 1);
}

2862

2868

2863

static void

2869

static void

2864

__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,

2870

__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,

2865

enum charge_type ctype)

2871

enum charge_type ctype)

2866

{

2872

{

2867

if (mem_cgroup_disabled())

2873

if (mem_cgroup_disabled())

2868

return;

2874

return;

2869

if (!memcg)

2875

if (!memcg)

2870

return;

2876

return;

2871

cgroup_exclude_rmdir(&memcg->css);

2877

cgroup_exclude_rmdir(&memcg->css);

2872

2878

2873

__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);

2879

__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);

2874

/*

2880

/*

2875

* Now swap is on-memory. This means this page may be

2881

* Now swap is on-memory. This means this page may be

2876

* counted both as mem and swap....double count.

2882

* counted both as mem and swap....double count.

2877

* Fix it by uncharging from memsw. Basically, this SwapCache is stable

2883

* Fix it by uncharging from memsw. Basically, this SwapCache is stable

2878

* under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()

2884

* under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()

2879

* may call delete_from_swap_cache() before reach here.

2885

* may call delete_from_swap_cache() before reach here.

2880

*/

2886

*/

2881

if (do_swap_account && PageSwapCache(page)) {

2887

if (do_swap_account && PageSwapCache(page)) {

2882

swp_entry_t ent = {.val = page_private(page)};

2888

swp_entry_t ent = {.val = page_private(page)};

2883

mem_cgroup_uncharge_swap(ent);

2889

mem_cgroup_uncharge_swap(ent);

2884

}

2890

}

2885

/*

2891

/*

2886

* At swapin, we may charge account against cgroup which has no tasks.

2892

* At swapin, we may charge account against cgroup which has no tasks.

2887

* So, rmdir()->pre_destroy() can be called while we do this charge.

2893

* So, rmdir()->pre_destroy() can be called while we do this charge.

2888

* In that case, we need to call pre_destroy() again. check it here.

2894

* In that case, we need to call pre_destroy() again. check it here.

2889

*/

2895

*/

2890

cgroup_release_and_wakeup_rmdir(&memcg->css);

2896

cgroup_release_and_wakeup_rmdir(&memcg->css);

2891

}

2897

}

2892

2898

2893

void mem_cgroup_commit_charge_swapin(struct page *page,

2899

void mem_cgroup_commit_charge_swapin(struct page *page,

2894

struct mem_cgroup *memcg)

2900

struct mem_cgroup *memcg)

2895

{

2901

{

2896

__mem_cgroup_commit_charge_swapin(page, memcg,

2902

__mem_cgroup_commit_charge_swapin(page, memcg,

2897

MEM_CGROUP_CHARGE_TYPE_ANON);

2903

MEM_CGROUP_CHARGE_TYPE_ANON);

2898

}

2904

}

2899

2905

2900

int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,

2906

int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,

2901

gfp_t gfp_mask)

2907

gfp_t gfp_mask)

2902

{

2908

{

2903

struct mem_cgroup *memcg = NULL;

2909

struct mem_cgroup *memcg = NULL;

2904

enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;

2910

enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;

2905

int ret;

2911

int ret;

2906

2912

2907

if (mem_cgroup_disabled())

2913

if (mem_cgroup_disabled())

2908

return 0;

2914

return 0;

2909

if (PageCompound(page))

2915

if (PageCompound(page))

2910

return 0;

2916

return 0;

2911

2917

2912

if (!PageSwapCache(page))

2918

if (!PageSwapCache(page))

2913

ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);

2919

ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);

2914

else { /* page is swapcache/shmem */

2920

else { /* page is swapcache/shmem */

2915

ret = __mem_cgroup_try_charge_swapin(mm, page,

2921

ret = __mem_cgroup_try_charge_swapin(mm, page,

2916

gfp_mask, &memcg);

2922

gfp_mask, &memcg);

2917

if (!ret)

2923

if (!ret)

2918

__mem_cgroup_commit_charge_swapin(page, memcg, type);

2924

__mem_cgroup_commit_charge_swapin(page, memcg, type);

2919

}

2925

}

2920

return ret;

2926

return ret;

2921

}

2927

}

2922

2928

2923

static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,

2929

static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,

2924

unsigned int nr_pages,

2930

unsigned int nr_pages,

2925

const enum charge_type ctype)

2931

const enum charge_type ctype)

2926

{

2932

{

2927

struct memcg_batch_info *batch = NULL;

2933

struct memcg_batch_info *batch = NULL;

2928

bool uncharge_memsw = true;

2934

bool uncharge_memsw = true;

2929

2935

2930

/* If swapout, usage of swap doesn't decrease */

2936

/* If swapout, usage of swap doesn't decrease */

2931

if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)

2937

if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)

2932

uncharge_memsw = false;

2938

uncharge_memsw = false;

2933

2939

2934

batch = &current->memcg_batch;

2940

batch = &current->memcg_batch;

2935

/*

2941

/*

2936

* In usual, we do css_get() when we remember memcg pointer.

2942

* In usual, we do css_get() when we remember memcg pointer.

2937

* But in this case, we keep res->usage until end of a series of

2943

* But in this case, we keep res->usage until end of a series of

2938

* uncharges. Then, it's ok to ignore memcg's refcnt.

2944

* uncharges. Then, it's ok to ignore memcg's refcnt.

2939

*/

2945

*/

2940

if (!batch->memcg)

2946

if (!batch->memcg)

2941

batch->memcg = memcg;

2947

batch->memcg = memcg;

2942

/*

2948

/*

2943

* do_batch > 0 when unmapping pages or inode invalidate/truncate.

2949

* do_batch > 0 when unmapping pages or inode invalidate/truncate.

2944

* In those cases, all pages freed continuously can be expected to be in

2950

* In those cases, all pages freed continuously can be expected to be in

2945

* the same cgroup and we have chance to coalesce uncharges.

2951

* the same cgroup and we have chance to coalesce uncharges.

2946

* But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)

2952

* But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)

2947

* because we want to do uncharge as soon as possible.

2953

* because we want to do uncharge as soon as possible.

2948

*/

2954

*/

2949

2955

2950

if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))

2956

if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))

2951

goto direct_uncharge;

2957

goto direct_uncharge;

2952

2958

2953

if (nr_pages > 1)

2959

if (nr_pages > 1)

2954

goto direct_uncharge;

2960

goto direct_uncharge;

2955

2961

2956

/*

2962

/*

2957

* In typical case, batch->memcg == mem. This means we can

2963

* In typical case, batch->memcg == mem. This means we can

2958

* merge a series of uncharges to an uncharge of res_counter.

2964

* merge a series of uncharges to an uncharge of res_counter.

2959

* If not, we uncharge res_counter ony by one.

2965

* If not, we uncharge res_counter ony by one.

2960

*/

2966

*/

2961

if (batch->memcg != memcg)

2967

if (batch->memcg != memcg)

2962

goto direct_uncharge;

2968

goto direct_uncharge;

2963

/* remember freed charge and uncharge it later */

2969

/* remember freed charge and uncharge it later */

2964

batch->nr_pages++;

2970

batch->nr_pages++;

2965

if (uncharge_memsw)

2971

if (uncharge_memsw)

2966

batch->memsw_nr_pages++;

2972

batch->memsw_nr_pages++;

2967

return;

2973

return;

2968

direct_uncharge:

2974

direct_uncharge:

2969

res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);

2975

res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);

2970

if (uncharge_memsw)

2976

if (uncharge_memsw)

2971

res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);

2977

res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);

2972

if (unlikely(batch->memcg != memcg))

2978

if (unlikely(batch->memcg != memcg))

2973

memcg_oom_recover(memcg);

2979

memcg_oom_recover(memcg);

2974

}

2980

}

2975

2981

2976

/*

2982

/*

2977

* uncharge if !page_mapped(page)

2983

* uncharge if !page_mapped(page)

2978

*/

2984

*/

2979

static struct mem_cgroup *

2985

static struct mem_cgroup *

2980

__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,

2986

__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,

2981

bool end_migration)

2987

bool end_migration)

2982

{

2988

{

2983

struct mem_cgroup *memcg = NULL;

2989

struct mem_cgroup *memcg = NULL;

2984

unsigned int nr_pages = 1;

2990

unsigned int nr_pages = 1;

2985

struct page_cgroup *pc;

2991

struct page_cgroup *pc;

2986

bool anon;

2992

bool anon;

2987

2993

2988

if (mem_cgroup_disabled())

2994

if (mem_cgroup_disabled())

2989

return NULL;

2995

return NULL;

2990

2996

2991

VM_BUG_ON(PageSwapCache(page));

2997

VM_BUG_ON(PageSwapCache(page));

2992

2998

2993

if (PageTransHuge(page)) {

2999

if (PageTransHuge(page)) {

2994

nr_pages <<= compound_order(page);

3000

nr_pages <<= compound_order(page);

2995

VM_BUG_ON(!PageTransHuge(page));

3001

VM_BUG_ON(!PageTransHuge(page));

2996

}

3002

}

2997

/*

3003

/*

2998

* Check if our page_cgroup is valid

3004

* Check if our page_cgroup is valid

2999

*/

3005

*/

3000

pc = lookup_page_cgroup(page);

3006

pc = lookup_page_cgroup(page);

3001

if (unlikely(!PageCgroupUsed(pc)))

3007

if (unlikely(!PageCgroupUsed(pc)))

3002

return NULL;

3008

return NULL;

3003

3009

3004

lock_page_cgroup(pc);

3010

lock_page_cgroup(pc);

3005

3011

3006

memcg = pc->mem_cgroup;

3012

memcg = pc->mem_cgroup;

3007

3013

3008

if (!PageCgroupUsed(pc))

3014

if (!PageCgroupUsed(pc))

3009

goto unlock_out;

3015

goto unlock_out;

3010

3016

3011

anon = PageAnon(page);

3017

anon = PageAnon(page);

3012

3018

3013

switch (ctype) {

3019

switch (ctype) {

3014

case MEM_CGROUP_CHARGE_TYPE_ANON:

3020

case MEM_CGROUP_CHARGE_TYPE_ANON:

3015

/*

3021

/*

3016

* Generally PageAnon tells if it's the anon statistics to be

3022

* Generally PageAnon tells if it's the anon statistics to be

3017

* updated; but sometimes e.g. mem_cgroup_uncharge_page() is

3023

* updated; but sometimes e.g. mem_cgroup_uncharge_page() is

3018

* used before page reached the stage of being marked PageAnon.

3024

* used before page reached the stage of being marked PageAnon.

3019

*/

3025

*/

3020

anon = true;

3026

anon = true;

3021

/* fallthrough */

3027

/* fallthrough */

3022

case MEM_CGROUP_CHARGE_TYPE_DROP:

3028

case MEM_CGROUP_CHARGE_TYPE_DROP:

3023

/* See mem_cgroup_prepare_migration() */

3029

/* See mem_cgroup_prepare_migration() */

3024

if (page_mapped(page))

3030

if (page_mapped(page))

3025

goto unlock_out;

3031

goto unlock_out;

3026

/*

3032

/*

3027

* Pages under migration may not be uncharged. But

3033

* Pages under migration may not be uncharged. But

3028

* end_migration() /must/ be the one uncharging the

3034

* end_migration() /must/ be the one uncharging the

3029

* unused post-migration page and so it has to call

3035

* unused post-migration page and so it has to call

3030

* here with the migration bit still set. See the

3036

* here with the migration bit still set. See the

3031

* res_counter handling below.

3037

* res_counter handling below.

3032

*/

3038

*/

3033

if (!end_migration && PageCgroupMigration(pc))

3039

if (!end_migration && PageCgroupMigration(pc))

3034

goto unlock_out;

3040

goto unlock_out;

3035

break;

3041

break;

3036

case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:

3042

case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:

3037

if (!PageAnon(page)) { /* Shared memory */

3043

if (!PageAnon(page)) { /* Shared memory */

3038

if (page->mapping && !page_is_file_cache(page))

3044

if (page->mapping && !page_is_file_cache(page))

3039

goto unlock_out;

3045

goto unlock_out;

3040

} else if (page_mapped(page)) /* Anon */

3046

} else if (page_mapped(page)) /* Anon */

3041

goto unlock_out;

3047

goto unlock_out;

3042

break;

3048

break;

3043

default:

3049

default:

3044

break;

3050

break;

3045

}

3051

}

3046

3052

3047

mem_cgroup_charge_statistics(memcg, anon, -nr_pages);

3053

mem_cgroup_charge_statistics(memcg, anon, -nr_pages);

3048

3054

3049

ClearPageCgroupUsed(pc);

3055

ClearPageCgroupUsed(pc);

3050

/*

3056

/*

3051

* pc->mem_cgroup is not cleared here. It will be accessed when it's

3057

* pc->mem_cgroup is not cleared here. It will be accessed when it's

3052

* freed from LRU. This is safe because uncharged page is expected not

3058

* freed from LRU. This is safe because uncharged page is expected not

3053

* to be reused (freed soon). Exception is SwapCache, it's handled by

3059

* to be reused (freed soon). Exception is SwapCache, it's handled by

3054

* special functions.

3060

* special functions.

3055

*/

3061

*/

3056

3062

3057

unlock_page_cgroup(pc);

3063

unlock_page_cgroup(pc);

3058

/*

3064

/*

3059

* even after unlock, we have memcg->res.usage here and this memcg

3065

* even after unlock, we have memcg->res.usage here and this memcg

3060

* will never be freed.

3066

* will never be freed.

3061

*/

3067

*/

3062

memcg_check_events(memcg, page);

3068

memcg_check_events(memcg, page);

3063

if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {

3069

if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {

3064

mem_cgroup_swap_statistics(memcg, true);

3070

mem_cgroup_swap_statistics(memcg, true);

3065

mem_cgroup_get(memcg);

3071

mem_cgroup_get(memcg);

3066

}

3072

}

3067

/*

3073

/*

3068

* Migration does not charge the res_counter for the

3074

* Migration does not charge the res_counter for the

3069

* replacement page, so leave it alone when phasing out the

3075

* replacement page, so leave it alone when phasing out the

3070

* page that is unused after the migration.

3076

* page that is unused after the migration.

3071

*/

3077

*/

3072

if (!end_migration && !mem_cgroup_is_root(memcg))

3078

if (!end_migration && !mem_cgroup_is_root(memcg))

3073

mem_cgroup_do_uncharge(memcg, nr_pages, ctype);

3079

mem_cgroup_do_uncharge(memcg, nr_pages, ctype);

3074

3080

3075

return memcg;

3081

return memcg;

3076

3082

3077

unlock_out:

3083

unlock_out:

3078

unlock_page_cgroup(pc);

3084

unlock_page_cgroup(pc);

3079

return NULL;

3085

return NULL;

3080

}

3086

}

3081

3087

3082

void mem_cgroup_uncharge_page(struct page *page)

3088

void mem_cgroup_uncharge_page(struct page *page)

3083

{

3089

{

3084

/* early check. */

3090

/* early check. */

3085

if (page_mapped(page))

3091

if (page_mapped(page))

3086

return;

3092

return;

3087

VM_BUG_ON(page->mapping && !PageAnon(page));

3093

VM_BUG_ON(page->mapping && !PageAnon(page));

3088

if (PageSwapCache(page))

3094

if (PageSwapCache(page))

3089

return;

3095

return;

3090

__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);

3096

__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);

3091

}

3097

}

3092

3098

3093

void mem_cgroup_uncharge_cache_page(struct page *page)

3099

void mem_cgroup_uncharge_cache_page(struct page *page)

3094

{

3100

{

3095

VM_BUG_ON(page_mapped(page));

3101

VM_BUG_ON(page_mapped(page));

3096

VM_BUG_ON(page->mapping);

3102

VM_BUG_ON(page->mapping);

3097

__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);

3103

__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);

3098

}

3104

}

3099

3105

3100

/*

3106

/*

3101

* Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.

3107

* Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.

3102

* In that cases, pages are freed continuously and we can expect pages

3108

* In that cases, pages are freed continuously and we can expect pages

3103

* are in the same memcg. All these calls itself limits the number of

3109

* are in the same memcg. All these calls itself limits the number of

3104

* pages freed at once, then uncharge_start/end() is called properly.

3110

* pages freed at once, then uncharge_start/end() is called properly.

3105

* This may be called prural(2) times in a context,

3111

* This may be called prural(2) times in a context,

3106

*/

3112

*/

3107

3113

3108

void mem_cgroup_uncharge_start(void)

3114

void mem_cgroup_uncharge_start(void)

3109

{

3115

{

3110

current->memcg_batch.do_batch++;

3116

current->memcg_batch.do_batch++;

3111

/* We can do nest. */

3117

/* We can do nest. */

3112

if (current->memcg_batch.do_batch == 1) {

3118

if (current->memcg_batch.do_batch == 1) {

3113

current->memcg_batch.memcg = NULL;

3119

current->memcg_batch.memcg = NULL;

3114

current->memcg_batch.nr_pages = 0;

3120

current->memcg_batch.nr_pages = 0;

3115

current->memcg_batch.memsw_nr_pages = 0;

3121

current->memcg_batch.memsw_nr_pages = 0;

3116

}

3122

}

3117

}

3123

}

3118

3124

3119

void mem_cgroup_uncharge_end(void)

3125

void mem_cgroup_uncharge_end(void)

3120

{

3126

{

3121

struct memcg_batch_info *batch = &current->memcg_batch;

3127

struct memcg_batch_info *batch = &current->memcg_batch;

3122

3128

3123

if (!batch->do_batch)

3129

if (!batch->do_batch)

3124

return;

3130

return;

3125

3131

3126

batch->do_batch--;

3132

batch->do_batch--;

3127

if (batch->do_batch) /* If stacked, do nothing. */

3133

if (batch->do_batch) /* If stacked, do nothing. */

3128

return;

3134

return;

3129

3135

3130

if (!batch->memcg)

3136

if (!batch->memcg)

3131

return;

3137

return;

3132

/*

3138

/*

3133

* This "batch->memcg" is valid without any css_get/put etc...

3139

* This "batch->memcg" is valid without any css_get/put etc...

3134

* bacause we hide charges behind us.

3140

* bacause we hide charges behind us.

3135

*/

3141

*/

3136

if (batch->nr_pages)

3142

if (batch->nr_pages)

3137

res_counter_uncharge(&batch->memcg->res,

3143

res_counter_uncharge(&batch->memcg->res,

3138

batch->nr_pages * PAGE_SIZE);

3144

batch->nr_pages * PAGE_SIZE);

3139

if (batch->memsw_nr_pages)

3145

if (batch->memsw_nr_pages)

3140

res_counter_uncharge(&batch->memcg->memsw,

3146

res_counter_uncharge(&batch->memcg->memsw,

3141

batch->memsw_nr_pages * PAGE_SIZE);

3147

batch->memsw_nr_pages * PAGE_SIZE);

3142

memcg_oom_recover(batch->memcg);

3148

memcg_oom_recover(batch->memcg);

3143

/* forget this pointer (for sanity check) */

3149

/* forget this pointer (for sanity check) */

3144

batch->memcg = NULL;

3150

batch->memcg = NULL;

3145

}

3151

}

3146

3152

3147

#ifdef CONFIG_SWAP

3153

#ifdef CONFIG_SWAP

3148

/*

3154

/*

3149

* called after __delete_from_swap_cache() and drop "page" account.

3155

* called after __delete_from_swap_cache() and drop "page" account.

3150

* memcg information is recorded to swap_cgroup of "ent"

3156

* memcg information is recorded to swap_cgroup of "ent"

3151

*/

3157

*/

3152

void

3158

void

3153

mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)

3159

mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)

3154

{

3160

{

3155

struct mem_cgroup *memcg;

3161

struct mem_cgroup *memcg;

3156

int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;

3162

int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;

3157

3163

3158

if (!swapout) /* this was a swap cache but the swap is unused ! */

3164

if (!swapout) /* this was a swap cache but the swap is unused ! */

3159

ctype = MEM_CGROUP_CHARGE_TYPE_DROP;

3165

ctype = MEM_CGROUP_CHARGE_TYPE_DROP;

3160

3166

3161

memcg = __mem_cgroup_uncharge_common(page, ctype, false);

3167

memcg = __mem_cgroup_uncharge_common(page, ctype, false);

3162

3168

3163

/*

3169

/*

3164

* record memcg information, if swapout && memcg != NULL,

3170

* record memcg information, if swapout && memcg != NULL,

3165

* mem_cgroup_get() was called in uncharge().

3171

* mem_cgroup_get() was called in uncharge().

3166

*/

3172

*/

3167

if (do_swap_account && swapout && memcg)

3173

if (do_swap_account && swapout && memcg)

3168

swap_cgroup_record(ent, css_id(&memcg->css));

3174

swap_cgroup_record(ent, css_id(&memcg->css));

3169

}

3175

}

3170

#endif

3176

#endif

3171

3177

3172

#ifdef CONFIG_MEMCG_SWAP

3178

#ifdef CONFIG_MEMCG_SWAP

3173

/*

3179

/*

3174

* called from swap_entry_free(). remove record in swap_cgroup and

3180

* called from swap_entry_free(). remove record in swap_cgroup and

3175

* uncharge "memsw" account.

3181

* uncharge "memsw" account.

3176

*/

3182

*/

3177

void mem_cgroup_uncharge_swap(swp_entry_t ent)

3183

void mem_cgroup_uncharge_swap(swp_entry_t ent)

3178

{

3184

{

3179

struct mem_cgroup *memcg;

3185

struct mem_cgroup *memcg;

3180

unsigned short id;

3186

unsigned short id;

3181

3187

3182

if (!do_swap_account)

3188

if (!do_swap_account)

3183

return;

3189

return;

3184

3190

3185

id = swap_cgroup_record(ent, 0);

3191

id = swap_cgroup_record(ent, 0);

3186

rcu_read_lock();

3192

rcu_read_lock();

3187

memcg = mem_cgroup_lookup(id);

3193

memcg = mem_cgroup_lookup(id);

3188

if (memcg) {

3194

if (memcg) {

3189

/*

3195

/*

3190

* We uncharge this because swap is freed.

3196

* We uncharge this because swap is freed.

3191

* This memcg can be obsolete one. We avoid calling css_tryget

3197

* This memcg can be obsolete one. We avoid calling css_tryget

3192

*/

3198

*/

3193

if (!mem_cgroup_is_root(memcg))

3199

if (!mem_cgroup_is_root(memcg))

3194

res_counter_uncharge(&memcg->memsw, PAGE_SIZE);

3200

res_counter_uncharge(&memcg->memsw, PAGE_SIZE);

3195

mem_cgroup_swap_statistics(memcg, false);

3201

mem_cgroup_swap_statistics(memcg, false);

3196

mem_cgroup_put(memcg);

3202

mem_cgroup_put(memcg);

3197

}

3203

}

3198

rcu_read_unlock();

3204

rcu_read_unlock();

3199

}

3205

}

3200

3206

3201

/**

3207

/**

3202

* mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.

3208

* mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.

3203

* @entry: swap entry to be moved

3209

* @entry: swap entry to be moved

3204

* @from: mem_cgroup which the entry is moved from

3210

* @from: mem_cgroup which the entry is moved from

3205

* @to: mem_cgroup which the entry is moved to

3211

* @to: mem_cgroup which the entry is moved to

3206

*

3212

*

3207

* It succeeds only when the swap_cgroup's record for this entry is the same

3213

* It succeeds only when the swap_cgroup's record for this entry is the same

3208

* as the mem_cgroup's id of @from.

3214

* as the mem_cgroup's id of @from.

3209

*

3215

*

3210

* Returns 0 on success, -EINVAL on failure.

3216

* Returns 0 on success, -EINVAL on failure.

3211

*

3217

*

3212

* The caller must have charged to @to, IOW, called res_counter_charge() about

3218

* The caller must have charged to @to, IOW, called res_counter_charge() about

3213

* both res and memsw, and called css_get().

3219

* both res and memsw, and called css_get().

3214

*/

3220

*/

3215

static int mem_cgroup_move_swap_account(swp_entry_t entry,

3221

static int mem_cgroup_move_swap_account(swp_entry_t entry,

3216

struct mem_cgroup *from, struct mem_cgroup *to)

3222

struct mem_cgroup *from, struct mem_cgroup *to)

3217

{

3223

{

3218

unsigned short old_id, new_id;

3224

unsigned short old_id, new_id;

3219

3225

3220

old_id = css_id(&from->css);

3226

old_id = css_id(&from->css);

3221

new_id = css_id(&to->css);

3227

new_id = css_id(&to->css);

3222

3228

3223

if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {

3229

if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {

3224

mem_cgroup_swap_statistics(from, false);

3230

mem_cgroup_swap_statistics(from, false);

3225

mem_cgroup_swap_statistics(to, true);

3231

mem_cgroup_swap_statistics(to, true);

3226

/*

3232

/*

3227

* This function is only called from task migration context now.

3233

* This function is only called from task migration context now.

3228

* It postpones res_counter and refcount handling till the end

3234

* It postpones res_counter and refcount handling till the end

3229

* of task migration(mem_cgroup_clear_mc()) for performance

3235

* of task migration(mem_cgroup_clear_mc()) for performance

3230

* improvement. But we cannot postpone mem_cgroup_get(to)

3236

* improvement. But we cannot postpone mem_cgroup_get(to)

3231

* because if the process that has been moved to @to does

3237

* because if the process that has been moved to @to does

3232

* swap-in, the refcount of @to might be decreased to 0.

3238

* swap-in, the refcount of @to might be decreased to 0.

3233

*/

3239

*/

3234

mem_cgroup_get(to);

3240

mem_cgroup_get(to);

3235

return 0;

3241

return 0;

3236

}

3242

}

3237

return -EINVAL;

3243

return -EINVAL;

3238

}

3244

}

3239

#else

3245

#else

3240

/*
 * Stub used when CONFIG_MEMCG_SWAP is not set: no swap record can be
 * moved, so every request fails with -EINVAL.
 */
static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
				struct mem_cgroup *from, struct mem_cgroup *to)
{
	return -EINVAL;
}

3245

#endif

3251

#endif

3246

3252

3247

/*

3253

/*

3248

* Before starting migration, account PAGE_SIZE to mem_cgroup that the old

3254

* Before starting migration, account PAGE_SIZE to mem_cgroup that the old

3249

* page belongs to.

3255

* page belongs to.

3250

*/

3256

*/

3251

void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,

3257

void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,

3252

struct mem_cgroup **memcgp)

3258

struct mem_cgroup **memcgp)

3253

{

3259

{

3254

struct mem_cgroup *memcg = NULL;

3260

struct mem_cgroup *memcg = NULL;

3255

struct page_cgroup *pc;

3261

struct page_cgroup *pc;

3256

enum charge_type ctype;

3262

enum charge_type ctype;

3257

3263

3258

*memcgp = NULL;

3264

*memcgp = NULL;

3259

3265

3260

VM_BUG_ON(PageTransHuge(page));

3266

VM_BUG_ON(PageTransHuge(page));

3261

if (mem_cgroup_disabled())

3267

if (mem_cgroup_disabled())

3262

return;

3268

return;

3263

3269

3264

pc = lookup_page_cgroup(page);

3270

pc = lookup_page_cgroup(page);

3265

lock_page_cgroup(pc);

3271

lock_page_cgroup(pc);

3266

if (PageCgroupUsed(pc)) {

3272

if (PageCgroupUsed(pc)) {

3267

memcg = pc->mem_cgroup;

3273

memcg = pc->mem_cgroup;

3268

css_get(&memcg->css);

3274

css_get(&memcg->css);

3269

/*

3275

/*

3270

* At migrating an anonymous page, its mapcount goes down

3276

* At migrating an anonymous page, its mapcount goes down

3271

* to 0 and uncharge() will be called. But, even if it's fully

3277

* to 0 and uncharge() will be called. But, even if it's fully

3272

* unmapped, migration may fail and this page has to be

3278

* unmapped, migration may fail and this page has to be

3273

* charged again. We set MIGRATION flag here and delay uncharge

3279

* charged again. We set MIGRATION flag here and delay uncharge

3274

* until end_migration() is called

3280

* until end_migration() is called

3275

*

3281

*

3276

* Corner Case Thinking

3282

* Corner Case Thinking

3277

* A)

3283

* A)

3278

* When the old page was mapped as Anon and it's unmap-and-freed

3284

* When the old page was mapped as Anon and it's unmap-and-freed

3279

* while migration was ongoing.

3285

* while migration was ongoing.

3280

* If unmap finds the old page, uncharge() of it will be delayed

3286

* If unmap finds the old page, uncharge() of it will be delayed

3281

* until end_migration(). If unmap finds a new page, it's

3287

* until end_migration(). If unmap finds a new page, it's

3282

* uncharged when it make mapcount to be 1->0. If unmap code

3288

* uncharged when it make mapcount to be 1->0. If unmap code

3283

* finds swap_migration_entry, the new page will not be mapped

3289

* finds swap_migration_entry, the new page will not be mapped

3284

* and end_migration() will find it(mapcount==0).

3290

* and end_migration() will find it(mapcount==0).

3285

*

3291

*

3286

* B)

3292

* B)

3287

* When the old page was mapped but migraion fails, the kernel

3293

* When the old page was mapped but migraion fails, the kernel

3288

* remaps it. A charge for it is kept by MIGRATION flag even

3294

* remaps it. A charge for it is kept by MIGRATION flag even

3289

* if mapcount goes down to 0. We can do remap successfully

3295

* if mapcount goes down to 0. We can do remap successfully

3290

* without charging it again.

3296

* without charging it again.

3291

*

3297

*

3292

* C)

3298

* C)

3293

* The "old" page is under lock_page() until the end of

3299

* The "old" page is under lock_page() until the end of

3294

* migration, so, the old page itself will not be swapped-out.

3300

* migration, so, the old page itself will not be swapped-out.

3295

* If the new page is swapped out before end_migraton, our

3301

* If the new page is swapped out before end_migraton, our

3296

* hook to usual swap-out path will catch the event.

3302

* hook to usual swap-out path will catch the event.

3297

*/

3303

*/

3298

if (PageAnon(page))

3304

if (PageAnon(page))

3299

SetPageCgroupMigration(pc);

3305

SetPageCgroupMigration(pc);

3300

}

3306

}

3301

unlock_page_cgroup(pc);

3307

unlock_page_cgroup(pc);

3302

/*

3308

/*

3303

* If the page is not charged at this point,

3309

* If the page is not charged at this point,

3304

* we return here.

3310

* we return here.

3305

*/

3311

*/

3306

if (!memcg)

3312

if (!memcg)

3307

return;

3313

return;

3308

3314

3309

*memcgp = memcg;

3315

*memcgp = memcg;

3310

/*

3316

/*

3311

* We charge new page before it's used/mapped. So, even if unlock_page()

3317

* We charge new page before it's used/mapped. So, even if unlock_page()

3312

* is called before end_migration, we can catch all events on this new

3318

* is called before end_migration, we can catch all events on this new

3313

* page. In the case new page is migrated but not remapped, new page's

3319

* page. In the case new page is migrated but not remapped, new page's

3314

* mapcount will be finally 0 and we call uncharge in end_migration().

3320

* mapcount will be finally 0 and we call uncharge in end_migration().

3315

*/

3321

*/

3316

if (PageAnon(page))

3322

if (PageAnon(page))

3317

ctype = MEM_CGROUP_CHARGE_TYPE_ANON;

3323

ctype = MEM_CGROUP_CHARGE_TYPE_ANON;

3318

else

3324

else

3319

ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;

3325

ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;

3320

/*

3326

/*

3321

* The page is committed to the memcg, but it's not actually

3327

* The page is committed to the memcg, but it's not actually

3322

* charged to the res_counter since we plan on replacing the

3328

* charged to the res_counter since we plan on replacing the

3323

* old one and only one page is going to be left afterwards.

3329

* old one and only one page is going to be left afterwards.

3324

*/

3330

*/

3325

__mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);

3331

__mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);

3326

}

3332

}

3327

3333

3328

/* remove redundant charge if migration failed*/

3334

/* remove redundant charge if migration failed*/

3329

void mem_cgroup_end_migration(struct mem_cgroup *memcg,

3335

void mem_cgroup_end_migration(struct mem_cgroup *memcg,

3330

struct page *oldpage, struct page *newpage, bool migration_ok)

3336

struct page *oldpage, struct page *newpage, bool migration_ok)

3331

{

3337

{

3332

struct page *used, *unused;

3338

struct page *used, *unused;

3333

struct page_cgroup *pc;

3339

struct page_cgroup *pc;

3334

bool anon;

3340

bool anon;

3335

3341

3336

if (!memcg)

3342

if (!memcg)

3337

return;

3343

return;

3338

/* blocks rmdir() */

3344

/* blocks rmdir() */

3339

cgroup_exclude_rmdir(&memcg->css);

3345

cgroup_exclude_rmdir(&memcg->css);

3340

if (!migration_ok) {

3346

if (!migration_ok) {

3341

used = oldpage;

3347

used = oldpage;

3342

unused = newpage;

3348

unused = newpage;

3343

} else {

3349

} else {

3344

used = newpage;

3350

used = newpage;

3345

unused = oldpage;

3351

unused = oldpage;

3346

}

3352

}

3347

anon = PageAnon(used);

3353

anon = PageAnon(used);

3348

__mem_cgroup_uncharge_common(unused,

3354

__mem_cgroup_uncharge_common(unused,

3349

anon ? MEM_CGROUP_CHARGE_TYPE_ANON

3355

anon ? MEM_CGROUP_CHARGE_TYPE_ANON

3350

: MEM_CGROUP_CHARGE_TYPE_CACHE,

3356

: MEM_CGROUP_CHARGE_TYPE_CACHE,

3351

true);

3357

true);

3352

css_put(&memcg->css);

3358

css_put(&memcg->css);

3353

/*

3359

/*

3354

* We disallowed uncharge of pages under migration because mapcount

3360

* We disallowed uncharge of pages under migration because mapcount

3355

* of the page goes down to zero, temporarly.

3361

* of the page goes down to zero, temporarly.

3356

* Clear the flag and check the page should be charged.

3362

* Clear the flag and check the page should be charged.

3357

*/

3363

*/

3358

pc = lookup_page_cgroup(oldpage);

3364

pc = lookup_page_cgroup(oldpage);

3359

lock_page_cgroup(pc);

3365

lock_page_cgroup(pc);

3360

ClearPageCgroupMigration(pc);

3366

ClearPageCgroupMigration(pc);

3361

unlock_page_cgroup(pc);

3367

unlock_page_cgroup(pc);

3362

3368

3363

/*

3369

/*

3364

* If a page is a file cache, radix-tree replacement is very atomic

3370

* If a page is a file cache, radix-tree replacement is very atomic

3365

* and we can skip this check. When it was an Anon page, its mapcount

3371

* and we can skip this check. When it was an Anon page, its mapcount

3366

* goes down to 0. But because we added MIGRATION flage, it's not

3372

* goes down to 0. But because we added MIGRATION flage, it's not

3367

* uncharged yet. There are several case but page->mapcount check

3373

* uncharged yet. There are several case but page->mapcount check

3368

* and USED bit check in mem_cgroup_uncharge_page() will do enough

3374

* and USED bit check in mem_cgroup_uncharge_page() will do enough

3369

* check. (see prepare_charge() also)

3375

* check. (see prepare_charge() also)

3370

*/

3376

*/

3371

if (anon)

3377

if (anon)

3372

mem_cgroup_uncharge_page(used);

3378

mem_cgroup_uncharge_page(used);

3373

/*

3379

/*

3374

* At migration, we may charge account against cgroup which has no

3380

* At migration, we may charge account against cgroup which has no

3375

* tasks.

3381

* tasks.

3376

* So, rmdir()->pre_destroy() can be called while we do this charge.

3382

* So, rmdir()->pre_destroy() can be called while we do this charge.

3377

* In that case, we need to call pre_destroy() again. check it here.

3383

* In that case, we need to call pre_destroy() again. check it here.

3378

*/

3384

*/

3379

cgroup_release_and_wakeup_rmdir(&memcg->css);

3385

cgroup_release_and_wakeup_rmdir(&memcg->css);

3380

}

3386

}

3381

3387

3382

/*

3388

/*

3383

* At replace page cache, newpage is not under any memcg but it's on

3389

* At replace page cache, newpage is not under any memcg but it's on

3384

* LRU. So, this function doesn't touch res_counter but handles LRU

3390

* LRU. So, this function doesn't touch res_counter but handles LRU

3385

* in correct way. Both pages are locked so we cannot race with uncharge.

3391

* in correct way. Both pages are locked so we cannot race with uncharge.

3386

*/

3392

*/

3387

void mem_cgroup_replace_page_cache(struct page *oldpage,

3393

void mem_cgroup_replace_page_cache(struct page *oldpage,

3388

struct page *newpage)

3394

struct page *newpage)

3389

{

3395

{

3390

struct mem_cgroup *memcg = NULL;

3396

struct mem_cgroup *memcg = NULL;

3391

struct page_cgroup *pc;

3397

struct page_cgroup *pc;

3392

enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;

3398

enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;

3393

3399

3394

if (mem_cgroup_disabled())

3400

if (mem_cgroup_disabled())

3395

return;

3401

return;

3396

3402

3397

pc = lookup_page_cgroup(oldpage);

3403

pc = lookup_page_cgroup(oldpage);

3398

/* fix accounting on old pages */

3404

/* fix accounting on old pages */

3399

lock_page_cgroup(pc);

3405

lock_page_cgroup(pc);

3400

if (PageCgroupUsed(pc)) {

3406

if (PageCgroupUsed(pc)) {

3401

memcg = pc->mem_cgroup;

3407

memcg = pc->mem_cgroup;

3402

mem_cgroup_charge_statistics(memcg, false, -1);

3408

mem_cgroup_charge_statistics(memcg, false, -1);

3403

ClearPageCgroupUsed(pc);

3409

ClearPageCgroupUsed(pc);

3404

}

3410

}

3405

unlock_page_cgroup(pc);

3411

unlock_page_cgroup(pc);

3406

3412

3407

/*

3413

/*

3408

* When called from shmem_replace_page(), in some cases the

3414

* When called from shmem_replace_page(), in some cases the

3409

* oldpage has already been charged, and in some cases not.

3415

* oldpage has already been charged, and in some cases not.

3410

*/

3416

*/

3411

if (!memcg)

3417

if (!memcg)

3412

return;

3418

return;

3413

/*

3419

/*

3414

* Even if newpage->mapping was NULL before starting replacement,

3420

* Even if newpage->mapping was NULL before starting replacement,

3415

* the newpage may be on LRU(or pagevec for LRU) already. We lock

3421

* the newpage may be on LRU(or pagevec for LRU) already. We lock

3416

* LRU while we overwrite pc->mem_cgroup.

3422

* LRU while we overwrite pc->mem_cgroup.

3417

*/

3423

*/

3418

__mem_cgroup_commit_charge(memcg, newpage, 1, type, true);

3424

__mem_cgroup_commit_charge(memcg, newpage, 1, type, true);

3419

}

3425

}

3420

3426

3421

#ifdef CONFIG_DEBUG_VM

3427

#ifdef CONFIG_DEBUG_VM

3422

static struct page_cgroup *lookup_page_cgroup_used(struct page *page)

3428

static struct page_cgroup *lookup_page_cgroup_used(struct page *page)

3423

{

3429

{

3424

struct page_cgroup *pc;

3430

struct page_cgroup *pc;

3425

3431

3426

pc = lookup_page_cgroup(page);

3432

pc = lookup_page_cgroup(page);

3427

/*

3433

/*

3428

* Can be NULL while feeding pages into the page allocator for

3434

* Can be NULL while feeding pages into the page allocator for

3429

* the first time, i.e. during boot or memory hotplug;

3435

* the first time, i.e. during boot or memory hotplug;

3430

* or when mem_cgroup_disabled().

3436

* or when mem_cgroup_disabled().

3431

*/

3437

*/

3432

if (likely(pc) && PageCgroupUsed(pc))

3438

if (likely(pc) && PageCgroupUsed(pc))

3433

return pc;

3439

return pc;

3434

return NULL;

3440

return NULL;

3435

}

3441

}

3436

3442

3437

bool mem_cgroup_bad_page_check(struct page *page)

3443

bool mem_cgroup_bad_page_check(struct page *page)

3438

{

3444

{

3439

if (mem_cgroup_disabled())

3445

if (mem_cgroup_disabled())

3440

return false;

3446

return false;

3441

3447

3442

return lookup_page_cgroup_used(page) != NULL;

3448

return lookup_page_cgroup_used(page) != NULL;

3443

}

3449

}

3444

3450

3445

void mem_cgroup_print_bad_page(struct page *page)

3451

void mem_cgroup_print_bad_page(struct page *page)

3446

{

3452

{

3447

struct page_cgroup *pc;

3453

struct page_cgroup *pc;

3448

3454

3449

pc = lookup_page_cgroup_used(page);

3455

pc = lookup_page_cgroup_used(page);

3450

if (pc) {

3456

if (pc) {

3451

printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",

3457

printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",

3452

pc, pc->flags, pc->mem_cgroup);

3458

pc, pc->flags, pc->mem_cgroup);

3453

}

3459

}

3454

}

3460

}

3455

#endif

3461

#endif

3456

3462

3457

static DEFINE_MUTEX(set_limit_mutex);

3463

static DEFINE_MUTEX(set_limit_mutex);

3458

3464

3459

static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,

3465

static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,

3460

unsigned long long val)

3466

unsigned long long val)

3461

{

3467

{

3462

int retry_count;

3468

int retry_count;

3463

u64 memswlimit, memlimit;

3469

u64 memswlimit, memlimit;

3464

int ret = 0;

3470

int ret = 0;

3465

int children = mem_cgroup_count_children(memcg);

3471

int children = mem_cgroup_count_children(memcg);

3466

u64 curusage, oldusage;

3472

u64 curusage, oldusage;

3467

int enlarge;

3473

int enlarge;

3468

3474

3469

/*

3475

/*

3470

* For keeping hierarchical_reclaim simple, how long we should retry

3476

* For keeping hierarchical_reclaim simple, how long we should retry

3471

* is depends on callers. We set our retry-count to be function

3477

* is depends on callers. We set our retry-count to be function

3472

* of # of children which we should visit in this loop.

3478

* of # of children which we should visit in this loop.

3473

*/

3479

*/

3474

retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;

3480

retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;

3475

3481

3476

oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);

3482

oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);

3477

3483

3478

enlarge = 0;

3484

enlarge = 0;

3479

while (retry_count) {

3485

while (retry_count) {

3480

if (signal_pending(current)) {

3486

if (signal_pending(current)) {

3481

ret = -EINTR;

3487

ret = -EINTR;

3482

break;

3488

break;

3483

}

3489

}

3484

/*

3490

/*

3485

* Rather than hide all in some function, I do this in

3491

* Rather than hide all in some function, I do this in

3486

* open coded manner. You see what this really does.

3492

* open coded manner. You see what this really does.

3487

* We have to guarantee memcg->res.limit <= memcg->memsw.limit.

3493

* We have to guarantee memcg->res.limit <= memcg->memsw.limit.

3488

*/

3494

*/

3489

mutex_lock(&set_limit_mutex);

3495

mutex_lock(&set_limit_mutex);

3490

memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

3496

memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

3491

if (memswlimit < val) {

3497

if (memswlimit < val) {

3492

ret = -EINVAL;

3498

ret = -EINVAL;

3493

mutex_unlock(&set_limit_mutex);

3499

mutex_unlock(&set_limit_mutex);

3494

break;

3500

break;

3495

}

3501

}

3496

3502

3497

memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);

3503

memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);

3498

if (memlimit < val)

3504

if (memlimit < val)

3499

enlarge = 1;

3505

enlarge = 1;

3500

3506

3501

ret = res_counter_set_limit(&memcg->res, val);

3507

ret = res_counter_set_limit(&memcg->res, val);

3502

if (!ret) {

3508

if (!ret) {

3503

if (memswlimit == val)

3509

if (memswlimit == val)

3504

memcg->memsw_is_minimum = true;

3510

memcg->memsw_is_minimum = true;

3505

else

3511

else

3506

memcg->memsw_is_minimum = false;

3512

memcg->memsw_is_minimum = false;

3507

}

3513

}

3508

mutex_unlock(&set_limit_mutex);

3514

mutex_unlock(&set_limit_mutex);

3509

3515

3510

if (!ret)

3516

if (!ret)

3511

break;

3517

break;

3512

3518

3513

mem_cgroup_reclaim(memcg, GFP_KERNEL,

3519

mem_cgroup_reclaim(memcg, GFP_KERNEL,

3514

MEM_CGROUP_RECLAIM_SHRINK);

3520

MEM_CGROUP_RECLAIM_SHRINK);

3515

curusage = res_counter_read_u64(&memcg->res, RES_USAGE);

3521

curusage = res_counter_read_u64(&memcg->res, RES_USAGE);

3516

/* Usage is reduced ? */

3522

/* Usage is reduced ? */

3517

if (curusage >= oldusage)

3523

if (curusage >= oldusage)

3518

retry_count--;

3524

retry_count--;

3519

else

3525

else

3520

oldusage = curusage;

3526

oldusage = curusage;

3521

}

3527

}

3522

if (!ret && enlarge)

3528

if (!ret && enlarge)

3523

memcg_oom_recover(memcg);

3529

memcg_oom_recover(memcg);

3524

3530

3525

return ret;

3531

return ret;

3526

}

3532

}

3527

3533

3528

static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,

3534

static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,

3529

unsigned long long val)

3535

unsigned long long val)

3530

{

3536

{

3531

int retry_count;

3537

int retry_count;

3532

u64 memlimit, memswlimit, oldusage, curusage;

3538

u64 memlimit, memswlimit, oldusage, curusage;

3533

int children = mem_cgroup_count_children(memcg);

3539

int children = mem_cgroup_count_children(memcg);

3534

int ret = -EBUSY;

3540

int ret = -EBUSY;

3535

int enlarge = 0;

3541

int enlarge = 0;

3536

3542

3537

/* see mem_cgroup_resize_res_limit */

3543

/* see mem_cgroup_resize_res_limit */

3538

retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;

3544

retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;

3539

oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);

3545

oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);

3540

while (retry_count) {

3546

while (retry_count) {

3541

if (signal_pending(current)) {

3547

if (signal_pending(current)) {

3542

ret = -EINTR;

3548

ret = -EINTR;

3543

break;

3549

break;

3544

}

3550

}

3545

/*

3551

/*

3546

* Rather than hide all in some function, I do this in

3552

* Rather than hide all in some function, I do this in

3547

* open coded manner. You see what this really does.

3553

* open coded manner. You see what this really does.

3548

* We have to guarantee memcg->res.limit <= memcg->memsw.limit.

3554

* We have to guarantee memcg->res.limit <= memcg->memsw.limit.

3549

*/

3555

*/

3550

mutex_lock(&set_limit_mutex);

3556

mutex_lock(&set_limit_mutex);

3551

memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);

3557

memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);

3552

if (memlimit > val) {

3558

if (memlimit > val) {

3553

ret = -EINVAL;

3559

ret = -EINVAL;

3554

mutex_unlock(&set_limit_mutex);

3560

mutex_unlock(&set_limit_mutex);

3555

break;

3561

break;

3556

}

3562

}

3557

memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

3563

memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

3558

if (memswlimit < val)

3564

if (memswlimit < val)

3559

enlarge = 1;

3565

enlarge = 1;

3560

ret = res_counter_set_limit(&memcg->memsw, val);

3566

ret = res_counter_set_limit(&memcg->memsw, val);

3561

if (!ret) {

3567

if (!ret) {

3562

if (memlimit == val)

3568

if (memlimit == val)

3563

memcg->memsw_is_minimum = true;

3569

memcg->memsw_is_minimum = true;

3564

else

3570

else

3565

memcg->memsw_is_minimum = false;

3571

memcg->memsw_is_minimum = false;

3566

}

3572

}

3567

mutex_unlock(&set_limit_mutex);

3573

mutex_unlock(&set_limit_mutex);

3568

3574

3569

if (!ret)

3575

if (!ret)

3570

break;

3576

break;

3571

3577

3572

mem_cgroup_reclaim(memcg, GFP_KERNEL,

3578

mem_cgroup_reclaim(memcg, GFP_KERNEL,

3573

MEM_CGROUP_RECLAIM_NOSWAP |

3579

MEM_CGROUP_RECLAIM_NOSWAP |

3574

MEM_CGROUP_RECLAIM_SHRINK);

3580

MEM_CGROUP_RECLAIM_SHRINK);

3575

curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);

3581

curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);

3576

/* Usage is reduced ? */

3582

/* Usage is reduced ? */

3577

if (curusage >= oldusage)

3583

if (curusage >= oldusage)

3578

retry_count--;

3584

retry_count--;

3579

else

3585

else

3580

oldusage = curusage;

3586

oldusage = curusage;

3581

}

3587

}

3582

if (!ret && enlarge)

3588

if (!ret && enlarge)

3583

memcg_oom_recover(memcg);

3589

memcg_oom_recover(memcg);

3584

return ret;

3590

return ret;

3585

}

3591

}

3586

3592

3587

unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,

3593

unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,

3588

gfp_t gfp_mask,

3594

gfp_t gfp_mask,

3589

unsigned long *total_scanned)

3595

unsigned long *total_scanned)

3590

{

3596

{

3591

unsigned long nr_reclaimed = 0;

3597

unsigned long nr_reclaimed = 0;

3592

struct mem_cgroup_per_zone *mz, *next_mz = NULL;

3598

struct mem_cgroup_per_zone *mz, *next_mz = NULL;

3593

unsigned long reclaimed;

3599

unsigned long reclaimed;

3594

int loop = 0;

3600

int loop = 0;

3595

struct mem_cgroup_tree_per_zone *mctz;

3601

struct mem_cgroup_tree_per_zone *mctz;

3596

unsigned long long excess;

3602

unsigned long long excess;

3597

unsigned long nr_scanned;

3603

unsigned long nr_scanned;

3598

3604

3599

if (order > 0)

3605

if (order > 0)

3600

return 0;

3606

return 0;

3601

3607

3602

mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));

3608

mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));

3603

/*

3609

/*

3604

* This loop can run a while, specially if mem_cgroup's continuously

3610

* This loop can run a while, specially if mem_cgroup's continuously

3605

* keep exceeding their soft limit and putting the system under

3611

* keep exceeding their soft limit and putting the system under

3606

* pressure

3612

* pressure

3607

*/

3613

*/

3608

do {

3614

do {

3609

if (next_mz)

3615

if (next_mz)

3610

mz = next_mz;

3616

mz = next_mz;

3611

else

3617

else

3612

mz = mem_cgroup_largest_soft_limit_node(mctz);

3618

mz = mem_cgroup_largest_soft_limit_node(mctz);

3613

if (!mz)

3619

if (!mz)

3614

break;

3620

break;

3615

3621

3616

nr_scanned = 0;

3622

nr_scanned = 0;

3617

reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,

3623

reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,

3618

gfp_mask, &nr_scanned);

3624

gfp_mask, &nr_scanned);

3619

nr_reclaimed += reclaimed;

3625

nr_reclaimed += reclaimed;

3620

*total_scanned += nr_scanned;

3626

*total_scanned += nr_scanned;

3621

spin_lock(&mctz->lock);

3627

spin_lock(&mctz->lock);

3622

3628

3623

/*

3629

/*

3624

* If we failed to reclaim anything from this memory cgroup

3630

* If we failed to reclaim anything from this memory cgroup

3625

* it is time to move on to the next cgroup

3631

* it is time to move on to the next cgroup

3626

*/

3632

*/

3627

next_mz = NULL;

3633

next_mz = NULL;

3628

if (!reclaimed) {

3634

if (!reclaimed) {

3629

do {

3635

do {

3630

/*

3636

/*

3631

* Loop until we find yet another one.

3637

* Loop until we find yet another one.

3632

*

3638

*

3633

* By the time we get the soft_limit lock

3639

* By the time we get the soft_limit lock

3634

* again, someone might have aded the

3640

* again, someone might have aded the

3635

* group back on the RB tree. Iterate to

3641

* group back on the RB tree. Iterate to

3636

* make sure we get a different mem.

3642

* make sure we get a different mem.

3637

* mem_cgroup_largest_soft_limit_node returns

3643

* mem_cgroup_largest_soft_limit_node returns

3638

* NULL if no other cgroup is present on

3644

* NULL if no other cgroup is present on

3639

* the tree

3645

* the tree

3640

*/

3646

*/

3641

next_mz =

3647

next_mz =

3642

__mem_cgroup_largest_soft_limit_node(mctz);

3648

__mem_cgroup_largest_soft_limit_node(mctz);

3643

if (next_mz == mz)

3649

if (next_mz == mz)

3644

css_put(&next_mz->memcg->css);

3650

css_put(&next_mz->memcg->css);

3645

else /* next_mz == NULL or other memcg */

3651

else /* next_mz == NULL or other memcg */

3646

break;

3652

break;

3647

} while (1);

3653

} while (1);

3648

}

3654

}

3649

__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);

3655

__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);

3650

excess = res_counter_soft_limit_excess(&mz->memcg->res);

3656

excess = res_counter_soft_limit_excess(&mz->memcg->res);

3651

/*

3657

/*

3652

* One school of thought says that we should not add

3658

* One school of thought says that we should not add

3653

* back the node to the tree if reclaim returns 0.

3659

* back the node to the tree if reclaim returns 0.

3654

* But our reclaim could return 0, simply because due

3660

* But our reclaim could return 0, simply because due

3655

* to priority we are exposing a smaller subset of

3661

* to priority we are exposing a smaller subset of

3656

* memory to reclaim from. Consider this as a longer

3662

* memory to reclaim from. Consider this as a longer

3657

* term TODO.

3663

* term TODO.

3658

*/

3664

*/

3659

/* If excess == 0, no tree ops */

3665

/* If excess == 0, no tree ops */

3660

__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);

3666

__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);

3661

spin_unlock(&mctz->lock);

3667

spin_unlock(&mctz->lock);

3662

css_put(&mz->memcg->css);

3668

css_put(&mz->memcg->css);

3663

loop++;

3669

loop++;

3664

/*

3670

/*

3665

* Could not reclaim anything and there are no more

3671

* Could not reclaim anything and there are no more

3666

* mem cgroups to try or we seem to be looping without

3672

* mem cgroups to try or we seem to be looping without

3667

* reclaiming anything.

3673

* reclaiming anything.

3668

*/

3674

*/

3669

if (!nr_reclaimed &&

3675

if (!nr_reclaimed &&

3670

(next_mz == NULL ||

3676

(next_mz == NULL ||

3671

loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))

3677

loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))

3672

break;

3678

break;

3673

} while (!nr_reclaimed);

3679

} while (!nr_reclaimed);

3674

if (next_mz)

3680

if (next_mz)

3675

css_put(&next_mz->memcg->css);

3681

css_put(&next_mz->memcg->css);

3676

return nr_reclaimed;

3682

return nr_reclaimed;

3677

}

3683

}

3678

3684

3679

/*

3685

/*

3680

* Traverse a specified page_cgroup list and try to drop them all. This doesn't

3686

* Traverse a specified page_cgroup list and try to drop them all. This doesn't

3681

* reclaim the pages page themselves - it just removes the page_cgroups.

3687

* reclaim the pages page themselves - it just removes the page_cgroups.

3682

* Returns true if some page_cgroups were not freed, indicating that the caller

3688

* Returns true if some page_cgroups were not freed, indicating that the caller

3683

* must retry this operation.

3689

* must retry this operation.

3684

*/

3690

*/

3685

static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,

3691

static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,

3686

int node, int zid, enum lru_list lru)

3692

int node, int zid, enum lru_list lru)

3687

{

3693

{

3688

struct mem_cgroup_per_zone *mz;

3694

struct mem_cgroup_per_zone *mz;

3689

unsigned long flags, loop;

3695

unsigned long flags, loop;

3690

struct list_head *list;

3696

struct list_head *list;

3691

struct page *busy;

3697

struct page *busy;

3692

struct zone *zone;

3698

struct zone *zone;

3693

3699

3694

zone = &NODE_DATA(node)->node_zones[zid];

3700

zone = &NODE_DATA(node)->node_zones[zid];

3695

mz = mem_cgroup_zoneinfo(memcg, node, zid);

3701

mz = mem_cgroup_zoneinfo(memcg, node, zid);

3696

list = &mz->lruvec.lists[lru];

3702

list = &mz->lruvec.lists[lru];

3697

3703

3698

loop = mz->lru_size[lru];

3704

loop = mz->lru_size[lru];

3699

/* give some margin against EBUSY etc...*/

3705

/* give some margin against EBUSY etc...*/

3700

loop += 256;

3706

loop += 256;

3701

busy = NULL;

3707

busy = NULL;

3702

while (loop--) {

3708

while (loop--) {

3703

struct page_cgroup *pc;

3709

struct page_cgroup *pc;

3704

struct page *page;

3710

struct page *page;

3705

3711

3706

spin_lock_irqsave(&zone->lru_lock, flags);

3712

spin_lock_irqsave(&zone->lru_lock, flags);

3707

if (list_empty(list)) {

3713

if (list_empty(list)) {

3708

spin_unlock_irqrestore(&zone->lru_lock, flags);

3714

spin_unlock_irqrestore(&zone->lru_lock, flags);

3709

break;

3715

break;

3710

}

3716

}

3711

page = list_entry(list->prev, struct page, lru);

3717

page = list_entry(list->prev, struct page, lru);

3712

if (busy == page) {

3718

if (busy == page) {

3713

list_move(&page->lru, list);

3719

list_move(&page->lru, list);

3714

busy = NULL;

3720

busy = NULL;

3715

spin_unlock_irqrestore(&zone->lru_lock, flags);

3721

spin_unlock_irqrestore(&zone->lru_lock, flags);

3716

continue;

3722

continue;

3717

}

3723

}

3718

spin_unlock_irqrestore(&zone->lru_lock, flags);

3724

spin_unlock_irqrestore(&zone->lru_lock, flags);

3719

3725

3720

pc = lookup_page_cgroup(page);

3726

pc = lookup_page_cgroup(page);

3721

3727

3722

if (mem_cgroup_move_parent(page, pc, memcg)) {

3728

if (mem_cgroup_move_parent(page, pc, memcg)) {

3723

/* found lock contention or "pc" is obsolete. */

3729

/* found lock contention or "pc" is obsolete. */

3724

busy = page;

3730

busy = page;

3725

cond_resched();

3731

cond_resched();

3726

} else

3732

} else

3727

busy = NULL;

3733

busy = NULL;

3728

}

3734

}

3729

return !list_empty(list);

3735

return !list_empty(list);

3730

}

3736

}

3731

3737

3732

/*

3738

/*

3733

* make mem_cgroup's charge to be 0 if there is no task.

3739

* make mem_cgroup's charge to be 0 if there is no task.

3734

* This enables deleting this mem_cgroup.

3740

* This enables deleting this mem_cgroup.

3735

*/

3741

*/

3736

static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)

3742

static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)

3737

{

3743

{

3738

int ret;

3744

int ret;

3739

int node, zid, shrink;

3745

int node, zid, shrink;

3740

int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

3746

int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

3741

struct cgroup *cgrp = memcg->css.cgroup;

3747

struct cgroup *cgrp = memcg->css.cgroup;

3742

3748

3743

css_get(&memcg->css);

3749

css_get(&memcg->css);

3744

3750

3745

shrink = 0;

3751

shrink = 0;

3746

/* should free all ? */

3752

/* should free all ? */

3747

if (free_all)

3753

if (free_all)

3748

goto try_to_free;

3754

goto try_to_free;

3749

move_account:

3755

move_account:

3750

do {

3756

do {

3751

ret = -EBUSY;

3757

ret = -EBUSY;

3752

if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))

3758

if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))

3753

goto out;

3759

goto out;

3754

/* This is for making all *used* pages to be on LRU. */

3760

/* This is for making all *used* pages to be on LRU. */

3755

lru_add_drain_all();

3761

lru_add_drain_all();

3756

drain_all_stock_sync(memcg);

3762

drain_all_stock_sync(memcg);

3757

ret = 0;

3763

ret = 0;

3758

mem_cgroup_start_move(memcg);

3764

mem_cgroup_start_move(memcg);

3759

for_each_node_state(node, N_HIGH_MEMORY) {

3765

for_each_node_state(node, N_HIGH_MEMORY) {

3760

for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {

3766

for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {

3761

enum lru_list lru;

3767

enum lru_list lru;

3762

for_each_lru(lru) {

3768

for_each_lru(lru) {

3763

ret = mem_cgroup_force_empty_list(memcg,

3769

ret = mem_cgroup_force_empty_list(memcg,

3764

node, zid, lru);

3770

node, zid, lru);

3765

if (ret)

3771

if (ret)

3766

break;

3772

break;

3767

}

3773

}

3768

}

3774

}

3769

if (ret)

3775

if (ret)

3770

break;

3776

break;

3771

}

3777

}

3772

mem_cgroup_end_move(memcg);

3778

mem_cgroup_end_move(memcg);

3773

memcg_oom_recover(memcg);

3779

memcg_oom_recover(memcg);

3774

cond_resched();

3780

cond_resched();

3775

/* "ret" should also be checked to ensure all lists are empty. */

3781

/* "ret" should also be checked to ensure all lists are empty. */

3776

} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);

3782

} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);

3777

out:

3783

out:

3778

css_put(&memcg->css);

3784

css_put(&memcg->css);

3779

return ret;

3785

return ret;

3780

3786

3781

try_to_free:

3787

try_to_free:

3782

/* returns EBUSY if there is a task or if we come here twice. */

3788

/* returns EBUSY if there is a task or if we come here twice. */

3783

if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {

3789

if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {

3784

ret = -EBUSY;

3790

ret = -EBUSY;

3785

goto out;

3791

goto out;

3786

}

3792

}

3787

/* we call try-to-free pages for make this cgroup empty */

3793

/* we call try-to-free pages for make this cgroup empty */

3788

lru_add_drain_all();

3794

lru_add_drain_all();

3789

/* try to free all pages in this cgroup */

3795

/* try to free all pages in this cgroup */

3790

shrink = 1;

3796

shrink = 1;

3791

while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {

3797

while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {

3792

int progress;

3798

int progress;

3793

3799

3794

if (signal_pending(current)) {

3800

if (signal_pending(current)) {

3795

ret = -EINTR;

3801

ret = -EINTR;

3796

goto out;

3802

goto out;

3797

}

3803

}

3798

progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,

3804

progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,

3799

false);

3805

false);

3800

if (!progress) {

3806

if (!progress) {

3801

nr_retries--;

3807

nr_retries--;

3802

/* maybe some writeback is necessary */

3808

/* maybe some writeback is necessary */

3803

congestion_wait(BLK_RW_ASYNC, HZ/10);

3809

congestion_wait(BLK_RW_ASYNC, HZ/10);

3804

}

3810

}

3805

3811

3806

}

3812

}

3807

lru_add_drain();

3813

lru_add_drain();

3808

/* try move_account...there may be some *locked* pages. */

3814

/* try move_account...there may be some *locked* pages. */

3809

goto move_account;

3815

goto move_account;

3810

}

3816

}

3811

3817

3812

static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)

3818

static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)

3813

{

3819

{

3814

return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);

3820

return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);

3815

}

3821

}

3816

3822

3817

3823

3818

static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)

3824

static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)

3819

{

3825

{

3820

return mem_cgroup_from_cont(cont)->use_hierarchy;

3826

return mem_cgroup_from_cont(cont)->use_hierarchy;

3821

}

3827

}

3822

3828

3823

static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,

3829

static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,

3824

u64 val)

3830

u64 val)

3825

{

3831

{

3826

int retval = 0;

3832

int retval = 0;

3827

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

3833

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

3828

struct cgroup *parent = cont->parent;

3834

struct cgroup *parent = cont->parent;

3829

struct mem_cgroup *parent_memcg = NULL;

3835

struct mem_cgroup *parent_memcg = NULL;

3830

3836

3831

if (parent)

3837

if (parent)

3832

parent_memcg = mem_cgroup_from_cont(parent);

3838

parent_memcg = mem_cgroup_from_cont(parent);

3833

3839

3834

cgroup_lock();

3840

cgroup_lock();

3835

3841

3836

if (memcg->use_hierarchy == val)

3842

if (memcg->use_hierarchy == val)

3837

goto out;

3843

goto out;

3838

3844

3839

/*

3845

/*

3840

* If parent's use_hierarchy is set, we can't make any modifications

3846

* If parent's use_hierarchy is set, we can't make any modifications

3841

* in the child subtrees. If it is unset, then the change can

3847

* in the child subtrees. If it is unset, then the change can

3842

* occur, provided the current cgroup has no children.

3848

* occur, provided the current cgroup has no children.

3843

*

3849

*

3844

* For the root cgroup, parent_mem is NULL, we allow value to be

3850

* For the root cgroup, parent_mem is NULL, we allow value to be

3845

* set if there are no children.

3851

* set if there are no children.

3846

*/

3852

*/

3847

if ((!parent_memcg || !parent_memcg->use_hierarchy) &&

3853

if ((!parent_memcg || !parent_memcg->use_hierarchy) &&

3848

(val == 1 || val == 0)) {

3854

(val == 1 || val == 0)) {

3849

if (list_empty(&cont->children))

3855

if (list_empty(&cont->children))

3850

memcg->use_hierarchy = val;

3856

memcg->use_hierarchy = val;

3851

else

3857

else

3852

retval = -EBUSY;

3858

retval = -EBUSY;

3853

} else

3859

} else

3854

retval = -EINVAL;

3860

retval = -EINVAL;

3855

3861

3856

out:

3862

out:

3857

cgroup_unlock();

3863

cgroup_unlock();

3858

3864

3859

return retval;

3865

return retval;

3860

}

3866

}

3861

3867

3862

3868

3863

static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,

3869

static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,

3864

enum mem_cgroup_stat_index idx)

3870

enum mem_cgroup_stat_index idx)

3865

{

3871

{

3866

struct mem_cgroup *iter;

3872

struct mem_cgroup *iter;

3867

long val = 0;

3873

long val = 0;

3868

3874

3869

/* Per-cpu values can be negative, use a signed accumulator */

3875

/* Per-cpu values can be negative, use a signed accumulator */

3870

for_each_mem_cgroup_tree(iter, memcg)

3876

for_each_mem_cgroup_tree(iter, memcg)

3871

val += mem_cgroup_read_stat(iter, idx);

3877

val += mem_cgroup_read_stat(iter, idx);

3872

3878

3873

if (val < 0) /* race ? */

3879

if (val < 0) /* race ? */

3874

val = 0;

3880

val = 0;

3875

return val;

3881

return val;

3876

}

3882

}

3877

3883

3878

/*
 * Return the usage of @memcg in bytes; with @swap set, swap is included.
 *
 * Non-root groups read the value straight from their res/memsw counter.
 * For the root group it is computed from the recursive CACHE and RSS
 * (plus SWAP when @swap is set) statistics, which count pages.
 */
static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
	u64 pages;

	if (!mem_cgroup_is_root(memcg))
		return res_counter_read_u64(swap ? &memcg->memsw : &memcg->res,
					    RES_USAGE);

	pages = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
	pages += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
	if (swap)
		pages += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);

	return pages << PAGE_SHIFT;
}

3897

3903

3898

static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,

3904

static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,

3899

struct file *file, char __user *buf,

3905

struct file *file, char __user *buf,

3900

size_t nbytes, loff_t *ppos)

3906

size_t nbytes, loff_t *ppos)

3901

{

3907

{

3902

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

3908

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

3903

char str[64];

3909

char str[64];

3904

u64 val;

3910

u64 val;

3905

int type, name, len;

3911

int type, name, len;

3906

3912

3907

type = MEMFILE_TYPE(cft->private);

3913

type = MEMFILE_TYPE(cft->private);

3908

name = MEMFILE_ATTR(cft->private);

3914

name = MEMFILE_ATTR(cft->private);

3909

3915

3910

if (!do_swap_account && type == _MEMSWAP)

3916

if (!do_swap_account && type == _MEMSWAP)

3911

return -EOPNOTSUPP;

3917

return -EOPNOTSUPP;

3912

3918

3913

switch (type) {

3919

switch (type) {

3914

case _MEM:

3920

case _MEM:

3915

if (name == RES_USAGE)

3921

if (name == RES_USAGE)

3916

val = mem_cgroup_usage(memcg, false);

3922

val = mem_cgroup_usage(memcg, false);

3917

else

3923

else

3918

val = res_counter_read_u64(&memcg->res, name);

3924

val = res_counter_read_u64(&memcg->res, name);

3919

break;

3925

break;

3920

case _MEMSWAP:

3926

case _MEMSWAP:

3921

if (name == RES_USAGE)

3927

if (name == RES_USAGE)

3922

val = mem_cgroup_usage(memcg, true);

3928

val = mem_cgroup_usage(memcg, true);

3923

else

3929

else

3924

val = res_counter_read_u64(&memcg->memsw, name);

3930

val = res_counter_read_u64(&memcg->memsw, name);

3925

break;

3931

break;

3926

default:

3932

default:

3927

BUG();

3933

BUG();

3928

}

3934

}

3929

3935

3930

len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);

3936

len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);

3931

return simple_read_from_buffer(buf, nbytes, ppos, str, len);

3937

return simple_read_from_buffer(buf, nbytes, ppos, str, len);

3932

}

3938

}

3933

/*

3939

/*

3934

* The user of this function is...

3940

* The user of this function is...

3935

* RES_LIMIT.

3941

* RES_LIMIT.

3936

*/

3942

*/

3937

static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,

3943

static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,

3938

const char *buffer)

3944

const char *buffer)

3939

{

3945

{

3940

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

3946

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

3941

int type, name;

3947

int type, name;

3942

unsigned long long val;

3948

unsigned long long val;

3943

int ret;

3949

int ret;

3944

3950

3945

type = MEMFILE_TYPE(cft->private);

3951

type = MEMFILE_TYPE(cft->private);

3946

name = MEMFILE_ATTR(cft->private);

3952

name = MEMFILE_ATTR(cft->private);

3947

3953

3948

if (!do_swap_account && type == _MEMSWAP)

3954

if (!do_swap_account && type == _MEMSWAP)

3949

return -EOPNOTSUPP;

3955

return -EOPNOTSUPP;

3950

3956

3951

switch (name) {

3957

switch (name) {

3952

case RES_LIMIT:

3958

case RES_LIMIT:

3953

if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */

3959

if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */

3954

ret = -EINVAL;

3960

ret = -EINVAL;

3955

break;

3961

break;

3956

}

3962

}

3957

/* This function does all necessary parse...reuse it */

3963

/* This function does all necessary parse...reuse it */

3958

ret = res_counter_memparse_write_strategy(buffer, &val);

3964

ret = res_counter_memparse_write_strategy(buffer, &val);

3959

if (ret)

3965

if (ret)

3960

break;

3966

break;

3961

if (type == _MEM)

3967

if (type == _MEM)

3962

ret = mem_cgroup_resize_limit(memcg, val);

3968

ret = mem_cgroup_resize_limit(memcg, val);

3963

else

3969

else

3964

ret = mem_cgroup_resize_memsw_limit(memcg, val);

3970

ret = mem_cgroup_resize_memsw_limit(memcg, val);

3965

break;

3971

break;

3966

case RES_SOFT_LIMIT:

3972

case RES_SOFT_LIMIT:

3967

ret = res_counter_memparse_write_strategy(buffer, &val);

3973

ret = res_counter_memparse_write_strategy(buffer, &val);

3968

if (ret)

3974

if (ret)

3969

break;

3975

break;

3970

/*

3976

/*

3971

* For memsw, soft limits are hard to implement in terms

3977

* For memsw, soft limits are hard to implement in terms

3972

* of semantics, for now, we support soft limits for

3978

* of semantics, for now, we support soft limits for

3973

* control without swap

3979

* control without swap

3974

*/

3980

*/

3975

if (type == _MEM)

3981

if (type == _MEM)

3976

ret = res_counter_set_soft_limit(&memcg->res, val);

3982

ret = res_counter_set_soft_limit(&memcg->res, val);

3977

else

3983

else

3978

ret = -EINVAL;

3984

ret = -EINVAL;

3979

break;

3985

break;

3980

default:

3986

default:

3981

ret = -EINVAL; /* should be BUG() ? */

3987

ret = -EINVAL; /* should be BUG() ? */

3982

break;

3988

break;

3983

}

3989

}

3984

return ret;

3990

return ret;

3985

}

3991

}

3986

3992

3987

static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,

3993

static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,

3988

unsigned long long *mem_limit, unsigned long long *memsw_limit)

3994

unsigned long long *mem_limit, unsigned long long *memsw_limit)

3989

{

3995

{

3990

struct cgroup *cgroup;

3996

struct cgroup *cgroup;

3991

unsigned long long min_limit, min_memsw_limit, tmp;

3997

unsigned long long min_limit, min_memsw_limit, tmp;

3992

3998

3993

min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);

3999

min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);

3994

min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

4000

min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

3995

cgroup = memcg->css.cgroup;

4001

cgroup = memcg->css.cgroup;

3996

if (!memcg->use_hierarchy)

4002

if (!memcg->use_hierarchy)

3997

goto out;

4003

goto out;

3998

4004

3999

while (cgroup->parent) {

4005

while (cgroup->parent) {

4000

cgroup = cgroup->parent;

4006

cgroup = cgroup->parent;

4001

memcg = mem_cgroup_from_cont(cgroup);

4007

memcg = mem_cgroup_from_cont(cgroup);

4002

if (!memcg->use_hierarchy)

4008

if (!memcg->use_hierarchy)

4003

break;

4009

break;

4004

tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);

4010

tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);

4005

min_limit = min(min_limit, tmp);

4011

min_limit = min(min_limit, tmp);

4006

tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

4012

tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

4007

min_memsw_limit = min(min_memsw_limit, tmp);

4013

min_memsw_limit = min(min_memsw_limit, tmp);

4008

}

4014

}

4009

out:

4015

out:

4010

*mem_limit = min_limit;

4016

*mem_limit = min_limit;

4011

*memsw_limit = min_memsw_limit;

4017

*memsw_limit = min_memsw_limit;

4012

}

4018

}

4013

4019

4014

static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)

4020

static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)

4015

{

4021

{

4016

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

4022

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

4017

int type, name;

4023

int type, name;

4018

4024

4019

type = MEMFILE_TYPE(event);

4025

type = MEMFILE_TYPE(event);

4020

name = MEMFILE_ATTR(event);

4026

name = MEMFILE_ATTR(event);

4021

4027

4022

if (!do_swap_account && type == _MEMSWAP)

4028

if (!do_swap_account && type == _MEMSWAP)

4023

return -EOPNOTSUPP;

4029

return -EOPNOTSUPP;

4024

4030

4025

switch (name) {

4031

switch (name) {

4026

case RES_MAX_USAGE:

4032

case RES_MAX_USAGE:

4027

if (type == _MEM)

4033

if (type == _MEM)

4028

res_counter_reset_max(&memcg->res);

4034

res_counter_reset_max(&memcg->res);

4029

else

4035

else

4030

res_counter_reset_max(&memcg->memsw);

4036

res_counter_reset_max(&memcg->memsw);

4031

break;

4037

break;

4032

case RES_FAILCNT:

4038

case RES_FAILCNT:

4033

if (type == _MEM)

4039

if (type == _MEM)

4034

res_counter_reset_failcnt(&memcg->res);

4040

res_counter_reset_failcnt(&memcg->res);

4035

else

4041

else

4036

res_counter_reset_failcnt(&memcg->memsw);

4042

res_counter_reset_failcnt(&memcg->memsw);

4037

break;

4043

break;

4038

}

4044

}

4039

4045

4040

return 0;

4046

return 0;

4041

}

4047

}

4042

4048

4043

static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,

4049

static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,

4044

struct cftype *cft)

4050

struct cftype *cft)

4045

{

4051

{

4046

return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;

4052

return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;

4047

}

4053

}

4048

4054

4049

#ifdef CONFIG_MMU

4055

#ifdef CONFIG_MMU

4050

static int mem_cgroup_move_charge_write(struct cgroup *cgrp,

4056

static int mem_cgroup_move_charge_write(struct cgroup *cgrp,

4051

struct cftype *cft, u64 val)

4057

struct cftype *cft, u64 val)

4052

{

4058

{

4053

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4059

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4054

4060

4055

if (val >= (1 << NR_MOVE_TYPE))

4061

if (val >= (1 << NR_MOVE_TYPE))

4056

return -EINVAL;

4062

return -EINVAL;

4057

/*

4063

/*

4058

* We check this value several times in both in can_attach() and

4064

* We check this value several times in both in can_attach() and

4059

* attach(), so we need cgroup lock to prevent this value from being

4065

* attach(), so we need cgroup lock to prevent this value from being

4060

* inconsistent.

4066

* inconsistent.

4061

*/

4067

*/

4062

cgroup_lock();

4068

cgroup_lock();

4063

memcg->move_charge_at_immigrate = val;

4069

memcg->move_charge_at_immigrate = val;

4064

cgroup_unlock();

4070

cgroup_unlock();

4065

4071

4066

return 0;

4072

return 0;

4067

}

4073

}

4068

#else

4074

#else

4069

static int mem_cgroup_move_charge_write(struct cgroup *cgrp,

4075

static int mem_cgroup_move_charge_write(struct cgroup *cgrp,

4070

struct cftype *cft, u64 val)

4076

struct cftype *cft, u64 val)

4071

{

4077

{

4072

return -ENOSYS;

4078

return -ENOSYS;

4073

}

4079

}

4074

#endif

4080

#endif

4075

4081

4076

#ifdef CONFIG_NUMA
/*
 * Finish one line of the NUMA statistics output: append " N<nid>=<count>"
 * for every node in the N_HIGH_MEMORY state, counting the LRU lists
 * selected by @lru_mask, then terminate the line.
 */
static void memcg_numa_stat_print_nodes(struct seq_file *m,
					struct mem_cgroup *memcg,
					unsigned int lru_mask)
{
	int nid;

	for_each_node_state(nid, N_HIGH_MEMORY)
		seq_printf(m, " N%d=%lu", nid,
			   mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask));
	seq_putc(m, '\n');
}

/*
 * Show handler for per-node LRU page counts.
 *
 * Emits four lines (total, file, anon, unevictable).  Each line starts with
 * the group-wide count and is followed by the per-node breakdown.
 * Always returns 0.
 */
static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
				struct seq_file *m)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

	seq_printf(m, "total=%lu", mem_cgroup_nr_lru_pages(memcg, LRU_ALL));
	memcg_numa_stat_print_nodes(m, memcg, LRU_ALL);

	seq_printf(m, "file=%lu",
		   mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE));
	memcg_numa_stat_print_nodes(m, memcg, LRU_ALL_FILE);

	seq_printf(m, "anon=%lu",
		   mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON));
	memcg_numa_stat_print_nodes(m, memcg, LRU_ALL_ANON);

	seq_printf(m, "unevictable=%lu",
		   mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)));
	memcg_numa_stat_print_nodes(m, memcg, BIT(LRU_UNEVICTABLE));

	return 0;
}
#endif /* CONFIG_NUMA */

4122

4128

4123

static const char * const mem_cgroup_lru_names[] = {

4129

static const char * const mem_cgroup_lru_names[] = {

4124

"inactive_anon",

4130

"inactive_anon",

4125

"active_anon",

4131

"active_anon",

4126

"inactive_file",

4132

"inactive_file",

4127

"active_file",

4133

"active_file",

4128

"unevictable",

4134

"unevictable",

4129

};

4135

};

4130

4136

4131

static inline void mem_cgroup_lru_names_not_uptodate(void)

4137

static inline void mem_cgroup_lru_names_not_uptodate(void)

4132

{

4138

{

4133

BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);

4139

BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);

4134

}

4140

}

4135

4141

4136

static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,

4142

static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,

4137

struct seq_file *m)

4143

struct seq_file *m)

4138

{

4144

{

4139

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

4145

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

4140

struct mem_cgroup *mi;

4146

struct mem_cgroup *mi;

4141

unsigned int i;

4147

unsigned int i;

4142

4148

4143

for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {

4149

for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {

4144

if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)

4150

if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)

4145

continue;

4151

continue;

4146

seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],

4152

seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],

4147

mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);

4153

mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);

4148

}

4154

}

4149

4155

4150

for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)

4156

for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)

4151

seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],

4157

seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],

4152

mem_cgroup_read_events(memcg, i));

4158

mem_cgroup_read_events(memcg, i));

4153

4159

4154

for (i = 0; i < NR_LRU_LISTS; i++)

4160

for (i = 0; i < NR_LRU_LISTS; i++)

4155

seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],

4161

seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],

4156

mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);

4162

mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);

4157

4163

4158

/* Hierarchical information */

4164

/* Hierarchical information */

4159

{

4165

{

4160

unsigned long long limit, memsw_limit;

4166

unsigned long long limit, memsw_limit;

4161

memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);

4167

memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);

4162

seq_printf(m, "hierarchical_memory_limit %llu\n", limit);

4168

seq_printf(m, "hierarchical_memory_limit %llu\n", limit);

4163

if (do_swap_account)

4169

if (do_swap_account)

4164

seq_printf(m, "hierarchical_memsw_limit %llu\n",

4170

seq_printf(m, "hierarchical_memsw_limit %llu\n",

4165

memsw_limit);

4171

memsw_limit);

4166

}

4172

}

4167

4173

4168

for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {

4174

for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {

4169

long long val = 0;

4175

long long val = 0;

4170

4176

4171

if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)

4177

if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)

4172

continue;

4178

continue;

4173

for_each_mem_cgroup_tree(mi, memcg)

4179

for_each_mem_cgroup_tree(mi, memcg)

4174

val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;

4180

val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;

4175

seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);

4181

seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);

4176

}

4182

}

4177

4183

4178

for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {

4184

for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {

4179

unsigned long long val = 0;

4185

unsigned long long val = 0;

4180

4186

4181

for_each_mem_cgroup_tree(mi, memcg)

4187

for_each_mem_cgroup_tree(mi, memcg)

4182

val += mem_cgroup_read_events(mi, i);

4188

val += mem_cgroup_read_events(mi, i);

4183

seq_printf(m, "total_%s %llu\n",

4189

seq_printf(m, "total_%s %llu\n",

4184

mem_cgroup_events_names[i], val);

4190

mem_cgroup_events_names[i], val);

4185

}

4191

}

4186

4192

4187

for (i = 0; i < NR_LRU_LISTS; i++) {

4193

for (i = 0; i < NR_LRU_LISTS; i++) {

4188

unsigned long long val = 0;

4194

unsigned long long val = 0;

4189

4195

4190

for_each_mem_cgroup_tree(mi, memcg)

4196

for_each_mem_cgroup_tree(mi, memcg)

4191

val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;

4197

val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;

4192

seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);

4198

seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);

4193

}

4199

}

4194

4200

4195

#ifdef CONFIG_DEBUG_VM

4201

#ifdef CONFIG_DEBUG_VM

4196

{

4202

{

4197

int nid, zid;

4203

int nid, zid;

4198

struct mem_cgroup_per_zone *mz;

4204

struct mem_cgroup_per_zone *mz;

4199

struct zone_reclaim_stat *rstat;

4205

struct zone_reclaim_stat *rstat;

4200

unsigned long recent_rotated[2] = {0, 0};

4206

unsigned long recent_rotated[2] = {0, 0};

4201

unsigned long recent_scanned[2] = {0, 0};

4207

unsigned long recent_scanned[2] = {0, 0};

4202

4208

4203

for_each_online_node(nid)

4209

for_each_online_node(nid)

4204

for (zid = 0; zid < MAX_NR_ZONES; zid++) {

4210

for (zid = 0; zid < MAX_NR_ZONES; zid++) {

4205

mz = mem_cgroup_zoneinfo(memcg, nid, zid);

4211

mz = mem_cgroup_zoneinfo(memcg, nid, zid);

4206

rstat = &mz->lruvec.reclaim_stat;

4212

rstat = &mz->lruvec.reclaim_stat;

4207

4213

4208

recent_rotated[0] += rstat->recent_rotated[0];

4214

recent_rotated[0] += rstat->recent_rotated[0];

4209

recent_rotated[1] += rstat->recent_rotated[1];

4215

recent_rotated[1] += rstat->recent_rotated[1];

4210

recent_scanned[0] += rstat->recent_scanned[0];

4216

recent_scanned[0] += rstat->recent_scanned[0];

4211

recent_scanned[1] += rstat->recent_scanned[1];

4217

recent_scanned[1] += rstat->recent_scanned[1];

4212

}

4218

}

4213

seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);

4219

seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);

4214

seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);

4220

seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);

4215

seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);

4221

seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);

4216

seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);

4222

seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);

4217

}

4223

}

4218

#endif

4224

#endif

4219

4225

4220

return 0;

4226

return 0;

4221

}

4227

}

4222

4228

4223

/* Read handler: report the swappiness that applies to @cgrp's memcg. */
static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
{
	return mem_cgroup_swappiness(mem_cgroup_from_cont(cgrp));
}

4229

4235

4230

static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,

4236

static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,

4231

u64 val)

4237

u64 val)

4232

{

4238

{

4233

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4239

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4234

struct mem_cgroup *parent;

4240

struct mem_cgroup *parent;

4235

4241

4236

if (val > 100)

4242

if (val > 100)

4237

return -EINVAL;

4243

return -EINVAL;

4238

4244

4239

if (cgrp->parent == NULL)

4245

if (cgrp->parent == NULL)

4240

return -EINVAL;

4246

return -EINVAL;

4241

4247

4242

parent = mem_cgroup_from_cont(cgrp->parent);

4248

parent = mem_cgroup_from_cont(cgrp->parent);

4243

4249

4244

cgroup_lock();

4250

cgroup_lock();

4245

4251

4246

/* If under hierarchy, only empty-root can set this value */

4252

/* If under hierarchy, only empty-root can set this value */

4247

if ((parent->use_hierarchy) ||

4253

if ((parent->use_hierarchy) ||

4248

(memcg->use_hierarchy && !list_empty(&cgrp->children))) {

4254

(memcg->use_hierarchy && !list_empty(&cgrp->children))) {

4249

cgroup_unlock();

4255

cgroup_unlock();

4250

return -EINVAL;

4256

return -EINVAL;

4251

}

4257

}

4252

4258

4253

memcg->swappiness = val;

4259

memcg->swappiness = val;

4254

4260

4255

cgroup_unlock();

4261

cgroup_unlock();

4256

4262

4257

return 0;

4263

return 0;

4258

}

4264

}

4259

4265

4260

/*
 * Signal the eventfd of every threshold that the current usage has crossed
 * since the previous call, for either the memory (@swap == false) or the
 * memory+swap (@swap == true) threshold array of @memcg.
 *
 * The array is read under rcu_read_lock().  current_threshold is meant to
 * index the last entry whose threshold is below or equal to the usage;
 * entries found on the wrong side of the usage have been crossed and are
 * signalled, after which current_threshold is updated.
 */
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
	struct mem_cgroup_threshold_ary *ary;
	u64 usage;
	int idx;

	rcu_read_lock();

	if (swap)
		ary = rcu_dereference(memcg->memsw_thresholds.primary);
	else
		ary = rcu_dereference(memcg->thresholds.primary);

	if (ary) {
		usage = mem_cgroup_usage(memcg, swap);

		/*
		 * Walk backward from current_threshold: every entry that now
		 * lies above the usage was crossed on the way down.  When
		 * nothing was crossed only one element is examined.
		 */
		idx = ary->current_threshold;
		while (idx >= 0 &&
		       unlikely(ary->entries[idx].threshold > usage)) {
			eventfd_signal(ary->entries[idx].eventfd, 1);
			idx--;
		}

		/* Step to the first entry above the (new) current one. */
		idx++;

		/*
		 * Walk forward: every entry that now lies at or below the
		 * usage was crossed on the way up.  Again, a single element
		 * is examined when nothing was crossed.
		 */
		while (idx < ary->size &&
		       unlikely(ary->entries[idx].threshold <= usage)) {
			eventfd_signal(ary->entries[idx].eventfd, 1);
			idx++;
		}

		/* Remember the last entry at or below the usage. */
		ary->current_threshold = idx - 1;
	}

	rcu_read_unlock();
}

4310

4316

4311

static void mem_cgroup_threshold(struct mem_cgroup *memcg)

4317

static void mem_cgroup_threshold(struct mem_cgroup *memcg)

4312

{

4318

{

4313

while (memcg) {

4319

while (memcg) {

4314

__mem_cgroup_threshold(memcg, false);

4320

__mem_cgroup_threshold(memcg, false);

4315

if (do_swap_account)

4321

if (do_swap_account)

4316

__mem_cgroup_threshold(memcg, true);

4322

__mem_cgroup_threshold(memcg, true);

4317

4323

4318

memcg = parent_mem_cgroup(memcg);

4324

memcg = parent_mem_cgroup(memcg);

4319

}

4325

}

4320

}

4326

}

4321

4327

4322

static int compare_thresholds(const void *a, const void *b)

4328

static int compare_thresholds(const void *a, const void *b)

4323

{

4329

{

4324

const struct mem_cgroup_threshold *_a = a;

4330

const struct mem_cgroup_threshold *_a = a;

4325

const struct mem_cgroup_threshold *_b = b;

4331

const struct mem_cgroup_threshold *_b = b;

4326

4332

4327

return _a->threshold - _b->threshold;

4333

return _a->threshold - _b->threshold;

4328

}

4334

}

4329

4335

4330

static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)

4336

static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)

4331

{

4337

{

4332

struct mem_cgroup_eventfd_list *ev;

4338

struct mem_cgroup_eventfd_list *ev;

4333

4339

4334

list_for_each_entry(ev, &memcg->oom_notify, list)

4340

list_for_each_entry(ev, &memcg->oom_notify, list)

4335

eventfd_signal(ev->eventfd, 1);

4341

eventfd_signal(ev->eventfd, 1);

4336

return 0;

4342

return 0;

4337

}

4343

}

4338

4344

4339

static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)

4345

static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)

4340

{

4346

{

4341

struct mem_cgroup *iter;

4347

struct mem_cgroup *iter;

4342

4348

4343

for_each_mem_cgroup_tree(iter, memcg)

4349

for_each_mem_cgroup_tree(iter, memcg)

4344

mem_cgroup_oom_notify_cb(iter);

4350

mem_cgroup_oom_notify_cb(iter);

4345

}

4351

}

4346

4352

4347

static int mem_cgroup_usage_register_event(struct cgroup *cgrp,

4353

static int mem_cgroup_usage_register_event(struct cgroup *cgrp,

4348

struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)

4354

struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)

4349

{

4355

{

4350

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4356

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4351

struct mem_cgroup_thresholds *thresholds;

4357

struct mem_cgroup_thresholds *thresholds;

4352

struct mem_cgroup_threshold_ary *new;

4358

struct mem_cgroup_threshold_ary *new;

4353

int type = MEMFILE_TYPE(cft->private);

4359

int type = MEMFILE_TYPE(cft->private);

4354

u64 threshold, usage;

4360

u64 threshold, usage;

4355

int i, size, ret;

4361

int i, size, ret;

4356

4362

4357

ret = res_counter_memparse_write_strategy(args, &threshold);

4363

ret = res_counter_memparse_write_strategy(args, &threshold);

4358

if (ret)

4364

if (ret)

4359

return ret;

4365

return ret;

4360

4366

4361

mutex_lock(&memcg->thresholds_lock);

4367

mutex_lock(&memcg->thresholds_lock);

4362

4368

4363

if (type == _MEM)

4369

if (type == _MEM)

4364

thresholds = &memcg->thresholds;

4370

thresholds = &memcg->thresholds;

4365

else if (type == _MEMSWAP)

4371

else if (type == _MEMSWAP)

4366

thresholds = &memcg->memsw_thresholds;

4372

thresholds = &memcg->memsw_thresholds;

4367

else

4373

else

4368

BUG();

4374

BUG();

4369

4375

4370

usage = mem_cgroup_usage(memcg, type == _MEMSWAP);

4376

usage = mem_cgroup_usage(memcg, type == _MEMSWAP);

4371

4377

4372

/* Check if a threshold crossed before adding a new one */

4378

/* Check if a threshold crossed before adding a new one */

4373

if (thresholds->primary)

4379

if (thresholds->primary)

4374

__mem_cgroup_threshold(memcg, type == _MEMSWAP);

4380

__mem_cgroup_threshold(memcg, type == _MEMSWAP);

4375

4381

4376

size = thresholds->primary ? thresholds->primary->size + 1 : 1;

4382

size = thresholds->primary ? thresholds->primary->size + 1 : 1;

4377

4383

4378

/* Allocate memory for new array of thresholds */

4384

/* Allocate memory for new array of thresholds */

4379

new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),

4385

new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),

4380

GFP_KERNEL);

4386

GFP_KERNEL);

4381

if (!new) {

4387

if (!new) {

4382

ret = -ENOMEM;

4388

ret = -ENOMEM;

4383

goto unlock;

4389

goto unlock;

4384

}

4390

}

4385

new->size = size;

4391

new->size = size;

4386

4392

4387

/* Copy thresholds (if any) to new array */

4393

/* Copy thresholds (if any) to new array */

4388

if (thresholds->primary) {

4394

if (thresholds->primary) {

4389

memcpy(new->entries, thresholds->primary->entries, (size - 1) *

4395

memcpy(new->entries, thresholds->primary->entries, (size - 1) *

4390

sizeof(struct mem_cgroup_threshold));

4396

sizeof(struct mem_cgroup_threshold));

4391

}

4397

}

4392

4398

4393

/* Add new threshold */

4399

/* Add new threshold */

4394

new->entries[size - 1].eventfd = eventfd;

4400

new->entries[size - 1].eventfd = eventfd;

4395

new->entries[size - 1].threshold = threshold;

4401

new->entries[size - 1].threshold = threshold;

4396

4402

4397

/* Sort thresholds. Registering of new threshold isn't time-critical */

4403

/* Sort thresholds. Registering of new threshold isn't time-critical */

4398

sort(new->entries, size, sizeof(struct mem_cgroup_threshold),

4404

sort(new->entries, size, sizeof(struct mem_cgroup_threshold),

4399

compare_thresholds, NULL);

4405

compare_thresholds, NULL);

4400

4406

4401

/* Find current threshold */

4407

/* Find current threshold */

4402

new->current_threshold = -1;

4408

new->current_threshold = -1;

4403

for (i = 0; i < size; i++) {

4409

for (i = 0; i < size; i++) {

4404

if (new->entries[i].threshold <= usage) {

4410

if (new->entries[i].threshold <= usage) {

4405

/*

4411

/*

4406

* new->current_threshold will not be used until

4412

* new->current_threshold will not be used until

4407

* rcu_assign_pointer(), so it's safe to increment

4413

* rcu_assign_pointer(), so it's safe to increment

4408

* it here.

4414

* it here.

4409

*/

4415

*/

4410

++new->current_threshold;

4416

++new->current_threshold;

4411

} else

4417

} else

4412

break;

4418

break;

4413

}

4419

}

4414

4420

4415

/* Free old spare buffer and save old primary buffer as spare */

4421

/* Free old spare buffer and save old primary buffer as spare */

4416

kfree(thresholds->spare);

4422

kfree(thresholds->spare);

4417

thresholds->spare = thresholds->primary;

4423

thresholds->spare = thresholds->primary;

4418

4424

4419

rcu_assign_pointer(thresholds->primary, new);

4425

rcu_assign_pointer(thresholds->primary, new);

4420

4426

4421

/* To be sure that nobody uses thresholds */

4427

/* To be sure that nobody uses thresholds */

4422

synchronize_rcu();

4428

synchronize_rcu();

4423

4429

4424

unlock:

4430

unlock:

4425

mutex_unlock(&memcg->thresholds_lock);

4431

mutex_unlock(&memcg->thresholds_lock);

4426

4432

4427

return ret;

4433

return ret;

4428

}

4434

}

4429

4435

4430

static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,

4436

static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,

4431

struct cftype *cft, struct eventfd_ctx *eventfd)

4437

struct cftype *cft, struct eventfd_ctx *eventfd)

4432

{

4438

{

4433

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4439

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4434

struct mem_cgroup_thresholds *thresholds;

4440

struct mem_cgroup_thresholds *thresholds;

4435

struct mem_cgroup_threshold_ary *new;

4441

struct mem_cgroup_threshold_ary *new;

4436

int type = MEMFILE_TYPE(cft->private);

4442

int type = MEMFILE_TYPE(cft->private);

4437

u64 usage;

4443

u64 usage;

4438

int i, j, size;

4444

int i, j, size;

4439

4445

4440

mutex_lock(&memcg->thresholds_lock);

4446

mutex_lock(&memcg->thresholds_lock);

4441

if (type == _MEM)

4447

if (type == _MEM)

4442

thresholds = &memcg->thresholds;

4448

thresholds = &memcg->thresholds;

4443

else if (type == _MEMSWAP)

4449

else if (type == _MEMSWAP)

4444

thresholds = &memcg->memsw_thresholds;

4450

thresholds = &memcg->memsw_thresholds;

4445

else

4451

else

4446

BUG();

4452

BUG();

4447

4453

4448

if (!thresholds->primary)

4454

if (!thresholds->primary)

4449

goto unlock;

4455

goto unlock;

4450

4456

4451

usage = mem_cgroup_usage(memcg, type == _MEMSWAP);

4457

usage = mem_cgroup_usage(memcg, type == _MEMSWAP);

4452

4458

4453

/* Check if a threshold crossed before removing */

4459

/* Check if a threshold crossed before removing */

4454

__mem_cgroup_threshold(memcg, type == _MEMSWAP);

4460

__mem_cgroup_threshold(memcg, type == _MEMSWAP);

4455

4461

4456

/* Calculate new number of threshold */

4462

/* Calculate new number of threshold */

4457

size = 0;

4463

size = 0;

4458

for (i = 0; i < thresholds->primary->size; i++) {

4464

for (i = 0; i < thresholds->primary->size; i++) {

4459

if (thresholds->primary->entries[i].eventfd != eventfd)

4465

if (thresholds->primary->entries[i].eventfd != eventfd)

4460

size++;

4466

size++;

4461

}

4467

}

4462

4468

4463

new = thresholds->spare;

4469

new = thresholds->spare;

4464

4470

4465

/* Set thresholds array to NULL if we don't have thresholds */

4471

/* Set thresholds array to NULL if we don't have thresholds */

4466

if (!size) {

4472

if (!size) {

4467

kfree(new);

4473

kfree(new);

4468

new = NULL;

4474

new = NULL;

4469

goto swap_buffers;

4475

goto swap_buffers;

4470

}

4476

}

4471

4477

4472

new->size = size;

4478

new->size = size;

4473

4479

4474

/* Copy thresholds and find current threshold */

4480

/* Copy thresholds and find current threshold */

4475

new->current_threshold = -1;

4481

new->current_threshold = -1;

4476

for (i = 0, j = 0; i < thresholds->primary->size; i++) {

4482

for (i = 0, j = 0; i < thresholds->primary->size; i++) {

4477

if (thresholds->primary->entries[i].eventfd == eventfd)

4483

if (thresholds->primary->entries[i].eventfd == eventfd)

4478

continue;

4484

continue;

4479

4485

4480

new->entries[j] = thresholds->primary->entries[i];

4486

new->entries[j] = thresholds->primary->entries[i];

4481

if (new->entries[j].threshold <= usage) {

4487

if (new->entries[j].threshold <= usage) {

4482

/*

4488

/*

4483

* new->current_threshold will not be used

4489

* new->current_threshold will not be used

4484

* until rcu_assign_pointer(), so it's safe to increment

4490

* until rcu_assign_pointer(), so it's safe to increment

4485

* it here.

4491

* it here.

4486

*/

4492

*/

4487

++new->current_threshold;

4493

++new->current_threshold;

4488

}

4494

}

4489

j++;

4495

j++;

4490

}

4496

}

4491

4497

4492

swap_buffers:

4498

swap_buffers:

4493

/* Swap primary and spare array */

4499

/* Swap primary and spare array */

4494

thresholds->spare = thresholds->primary;

4500

thresholds->spare = thresholds->primary;

4495

/* If all events are unregistered, free the spare array */

4501

/* If all events are unregistered, free the spare array */

4496

if (!new) {

4502

if (!new) {

4497

kfree(thresholds->spare);

4503

kfree(thresholds->spare);

4498

thresholds->spare = NULL;

4504

thresholds->spare = NULL;

4499

}

4505

}

4500

4506

4501

rcu_assign_pointer(thresholds->primary, new);

4507

rcu_assign_pointer(thresholds->primary, new);

4502

4508

4503

/* To be sure that nobody uses thresholds */

4509

/* To be sure that nobody uses thresholds */

4504

synchronize_rcu();

4510

synchronize_rcu();

4505

unlock:

4511

unlock:

4506

mutex_unlock(&memcg->thresholds_lock);

4512

mutex_unlock(&memcg->thresholds_lock);

4507

}

4513

}

4508

4514

4509

static int mem_cgroup_oom_register_event(struct cgroup *cgrp,

4515

static int mem_cgroup_oom_register_event(struct cgroup *cgrp,

4510

struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)

4516

struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)

4511

{

4517

{

4512

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4518

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4513

struct mem_cgroup_eventfd_list *event;

4519

struct mem_cgroup_eventfd_list *event;

4514

int type = MEMFILE_TYPE(cft->private);

4520

int type = MEMFILE_TYPE(cft->private);

4515

4521

4516

BUG_ON(type != _OOM_TYPE);

4522

BUG_ON(type != _OOM_TYPE);

4517

event = kmalloc(sizeof(*event), GFP_KERNEL);

4523

event = kmalloc(sizeof(*event), GFP_KERNEL);

4518

if (!event)

4524

if (!event)

4519

return -ENOMEM;

4525

return -ENOMEM;

4520

4526

4521

spin_lock(&memcg_oom_lock);

4527

spin_lock(&memcg_oom_lock);

4522

4528

4523

event->eventfd = eventfd;

4529

event->eventfd = eventfd;

4524

list_add(&event->list, &memcg->oom_notify);

4530

list_add(&event->list, &memcg->oom_notify);

4525

4531

4526

/* already in OOM ? */

4532

/* already in OOM ? */

4527

if (atomic_read(&memcg->under_oom))

4533

if (atomic_read(&memcg->under_oom))

4528

eventfd_signal(eventfd, 1);

4534

eventfd_signal(eventfd, 1);

4529

spin_unlock(&memcg_oom_lock);

4535

spin_unlock(&memcg_oom_lock);

4530

4536

4531

return 0;

4537

return 0;

4532

}

4538

}

4533

4539

4534

static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,

4540

static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,

4535

struct cftype *cft, struct eventfd_ctx *eventfd)

4541

struct cftype *cft, struct eventfd_ctx *eventfd)

4536

{

4542

{

4537

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4543

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4538

struct mem_cgroup_eventfd_list *ev, *tmp;

4544

struct mem_cgroup_eventfd_list *ev, *tmp;

4539

int type = MEMFILE_TYPE(cft->private);

4545

int type = MEMFILE_TYPE(cft->private);

4540

4546

4541

BUG_ON(type != _OOM_TYPE);

4547

BUG_ON(type != _OOM_TYPE);

4542

4548

4543

spin_lock(&memcg_oom_lock);

4549

spin_lock(&memcg_oom_lock);

4544

4550

4545

list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {

4551

list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {

4546

if (ev->eventfd == eventfd) {

4552

if (ev->eventfd == eventfd) {

4547

list_del(&ev->list);

4553

list_del(&ev->list);

4548

kfree(ev);

4554

kfree(ev);

4549

}

4555

}

4550

}

4556

}

4551

4557

4552

spin_unlock(&memcg_oom_lock);

4558

spin_unlock(&memcg_oom_lock);

4553

}

4559

}

4554

4560

4555

static int mem_cgroup_oom_control_read(struct cgroup *cgrp,

4561

static int mem_cgroup_oom_control_read(struct cgroup *cgrp,

4556

struct cftype *cft, struct cgroup_map_cb *cb)

4562

struct cftype *cft, struct cgroup_map_cb *cb)

4557

{

4563

{

4558

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4564

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4559

4565

4560

cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);

4566

cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);

4561

4567

4562

if (atomic_read(&memcg->under_oom))

4568

if (atomic_read(&memcg->under_oom))

4563

cb->fill(cb, "under_oom", 1);

4569

cb->fill(cb, "under_oom", 1);

4564

else

4570

else

4565

cb->fill(cb, "under_oom", 0);

4571

cb->fill(cb, "under_oom", 0);

4566

return 0;

4572

return 0;

4567

}

4573

}

4568

4574

4569

static int mem_cgroup_oom_control_write(struct cgroup *cgrp,

4575

static int mem_cgroup_oom_control_write(struct cgroup *cgrp,

4570

struct cftype *cft, u64 val)

4576

struct cftype *cft, u64 val)

4571

{

4577

{

4572

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4578

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4573

struct mem_cgroup *parent;

4579

struct mem_cgroup *parent;

4574

4580

4575

/* cannot set to root cgroup and only 0 and 1 are allowed */

4581

/* cannot set to root cgroup and only 0 and 1 are allowed */

4576

if (!cgrp->parent || !((val == 0) || (val == 1)))

4582

if (!cgrp->parent || !((val == 0) || (val == 1)))

4577

return -EINVAL;

4583

return -EINVAL;

4578

4584

4579

parent = mem_cgroup_from_cont(cgrp->parent);

4585

parent = mem_cgroup_from_cont(cgrp->parent);

4580

4586

4581

cgroup_lock();

4587

cgroup_lock();

4582

/* oom-kill-disable is a flag for subhierarchy. */

4588

/* oom-kill-disable is a flag for subhierarchy. */

4583

if ((parent->use_hierarchy) ||

4589

if ((parent->use_hierarchy) ||

4584

(memcg->use_hierarchy && !list_empty(&cgrp->children))) {

4590

(memcg->use_hierarchy && !list_empty(&cgrp->children))) {

4585

cgroup_unlock();

4591

cgroup_unlock();

4586

return -EINVAL;

4592

return -EINVAL;

4587

}

4593

}

4588

memcg->oom_kill_disable = val;

4594

memcg->oom_kill_disable = val;

4589

if (!val)

4595

if (!val)

4590

memcg_oom_recover(memcg);

4596

memcg_oom_recover(memcg);

4591

cgroup_unlock();

4597

cgroup_unlock();

4592

return 0;

4598

return 0;

4593

}

4599

}

4594

4600

4595

#ifdef CONFIG_MEMCG_KMEM

4601

#ifdef CONFIG_MEMCG_KMEM

4596

static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)

4602

static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)

4597

{

4603

{

4598

return mem_cgroup_sockets_init(memcg, ss);

4604

return mem_cgroup_sockets_init(memcg, ss);

4599

};

4605

};

4600

4606

4601

static void kmem_cgroup_destroy(struct mem_cgroup *memcg)

4607

static void kmem_cgroup_destroy(struct mem_cgroup *memcg)

4602

{

4608

{

4603

mem_cgroup_sockets_destroy(memcg);

4609

mem_cgroup_sockets_destroy(memcg);

4604

}

4610

}

4605

#else

4611

#else

4606

static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)

4612

static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)

4607

{

4613

{

4608

return 0;

4614

return 0;

4609

}

4615

}

4610

4616

4611

static void kmem_cgroup_destroy(struct mem_cgroup *memcg)

4617

static void kmem_cgroup_destroy(struct mem_cgroup *memcg)

4612

{

4618

{

4613

}

4619

}

4614

#endif

4620

#endif

4615

4621

4616

static struct cftype mem_cgroup_files[] = {

4622

static struct cftype mem_cgroup_files[] = {

4617

{

4623

{

4618

.name = "usage_in_bytes",

4624

.name = "usage_in_bytes",

4619

.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),

4625

.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),

4620

.read = mem_cgroup_read,

4626

.read = mem_cgroup_read,

4621

.register_event = mem_cgroup_usage_register_event,

4627

.register_event = mem_cgroup_usage_register_event,

4622

.unregister_event = mem_cgroup_usage_unregister_event,

4628

.unregister_event = mem_cgroup_usage_unregister_event,

4623

},

4629

},

4624

{

4630

{

4625

.name = "max_usage_in_bytes",

4631

.name = "max_usage_in_bytes",

4626

.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),

4632

.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),

4627

.trigger = mem_cgroup_reset,

4633

.trigger = mem_cgroup_reset,

4628

.read = mem_cgroup_read,

4634

.read = mem_cgroup_read,

4629

},

4635

},

4630

{

4636

{

4631

.name = "limit_in_bytes",

4637

.name = "limit_in_bytes",

4632

.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),

4638

.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),

4633

.write_string = mem_cgroup_write,

4639

.write_string = mem_cgroup_write,

4634

.read = mem_cgroup_read,

4640

.read = mem_cgroup_read,

4635

},

4641

},

4636

{

4642

{

4637

.name = "soft_limit_in_bytes",

4643

.name = "soft_limit_in_bytes",

4638

.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),

4644

.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),

4639

.write_string = mem_cgroup_write,

4645

.write_string = mem_cgroup_write,

4640

.read = mem_cgroup_read,

4646

.read = mem_cgroup_read,

4641

},

4647

},

4642

{

4648

{

4643

.name = "failcnt",

4649

.name = "failcnt",

4644

.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),

4650

.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),

4645

.trigger = mem_cgroup_reset,

4651

.trigger = mem_cgroup_reset,

4646

.read = mem_cgroup_read,

4652

.read = mem_cgroup_read,

4647

},

4653

},

4648

{

4654

{

4649

.name = "stat",

4655

.name = "stat",

4650

.read_seq_string = memcg_stat_show,

4656

.read_seq_string = memcg_stat_show,

4651

},

4657

},

4652

{

4658

{

4653

.name = "force_empty",

4659

.name = "force_empty",

4654

.trigger = mem_cgroup_force_empty_write,

4660

.trigger = mem_cgroup_force_empty_write,

4655

},

4661

},

4656

{

4662

{

4657

.name = "use_hierarchy",

4663

.name = "use_hierarchy",

4658

.write_u64 = mem_cgroup_hierarchy_write,

4664

.write_u64 = mem_cgroup_hierarchy_write,

4659

.read_u64 = mem_cgroup_hierarchy_read,

4665

.read_u64 = mem_cgroup_hierarchy_read,

4660

},

4666

},

4661

{

4667

{

4662

.name = "swappiness",

4668

.name = "swappiness",

4663

.read_u64 = mem_cgroup_swappiness_read,

4669

.read_u64 = mem_cgroup_swappiness_read,

4664

.write_u64 = mem_cgroup_swappiness_write,

4670

.write_u64 = mem_cgroup_swappiness_write,

4665

},

4671

},

4666

{

4672

{

4667

.name = "move_charge_at_immigrate",

4673

.name = "move_charge_at_immigrate",

4668

.read_u64 = mem_cgroup_move_charge_read,

4674

.read_u64 = mem_cgroup_move_charge_read,

4669

.write_u64 = mem_cgroup_move_charge_write,

4675

.write_u64 = mem_cgroup_move_charge_write,

4670

},

4676

},

4671

{

4677

{

4672

.name = "oom_control",

4678

.name = "oom_control",

4673

.read_map = mem_cgroup_oom_control_read,

4679

.read_map = mem_cgroup_oom_control_read,

4674

.write_u64 = mem_cgroup_oom_control_write,

4680

.write_u64 = mem_cgroup_oom_control_write,

4675

.register_event = mem_cgroup_oom_register_event,

4681

.register_event = mem_cgroup_oom_register_event,

4676

.unregister_event = mem_cgroup_oom_unregister_event,

4682

.unregister_event = mem_cgroup_oom_unregister_event,

4677

.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),

4683

.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),

4678

},

4684

},

4679

#ifdef CONFIG_NUMA

4685

#ifdef CONFIG_NUMA

4680

{

4686

{

4681

.name = "numa_stat",

4687

.name = "numa_stat",

4682

.read_seq_string = memcg_numa_stat_show,

4688

.read_seq_string = memcg_numa_stat_show,

4683

},

4689

},

4684

#endif

4690

#endif

4685

#ifdef CONFIG_MEMCG_SWAP

4691

#ifdef CONFIG_MEMCG_SWAP

4686

{

4692

{

4687

.name = "memsw.usage_in_bytes",

4693

.name = "memsw.usage_in_bytes",

4688

.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),

4694

.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),

4689

.read = mem_cgroup_read,

4695

.read = mem_cgroup_read,

4690

.register_event = mem_cgroup_usage_register_event,

4696

.register_event = mem_cgroup_usage_register_event,

4691

.unregister_event = mem_cgroup_usage_unregister_event,

4697

.unregister_event = mem_cgroup_usage_unregister_event,

4692

},

4698

},

4693

{

4699

{

4694

.name = "memsw.max_usage_in_bytes",

4700

.name = "memsw.max_usage_in_bytes",

4695

.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),

4701

.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),

4696

.trigger = mem_cgroup_reset,

4702

.trigger = mem_cgroup_reset,

4697

.read = mem_cgroup_read,

4703

.read = mem_cgroup_read,

4698

},

4704

},

4699

{

4705

{

4700

.name = "memsw.limit_in_bytes",

4706

.name = "memsw.limit_in_bytes",

4701

.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),

4707

.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),

4702

.write_string = mem_cgroup_write,

4708

.write_string = mem_cgroup_write,

4703

.read = mem_cgroup_read,

4709

.read = mem_cgroup_read,

4704

},

4710

},

4705

{

4711

{

4706

.name = "memsw.failcnt",

4712

.name = "memsw.failcnt",

4707

.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),

4713

.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),

4708

.trigger = mem_cgroup_reset,

4714

.trigger = mem_cgroup_reset,

4709

.read = mem_cgroup_read,

4715

.read = mem_cgroup_read,

4710

},

4716

},

4711

#endif

4717

#endif

4712

{ }, /* terminate */

4718

{ }, /* terminate */

4713

};

4719

};

4714

4720

4715

static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)

4721

static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)

4716

{

4722

{

4717

struct mem_cgroup_per_node *pn;

4723

struct mem_cgroup_per_node *pn;

4718

struct mem_cgroup_per_zone *mz;

4724

struct mem_cgroup_per_zone *mz;

4719

int zone, tmp = node;

4725

int zone, tmp = node;

4720

/*

4726

/*

4721

* This routine is called against possible nodes.

4727

* This routine is called against possible nodes.

4722

* But it's BUG to call kmalloc() against offline node.

4728

* But it's BUG to call kmalloc() against offline node.

4723

*

4729

*

4724

* TODO: this routine can waste much memory for nodes which will

4730

* TODO: this routine can waste much memory for nodes which will

4725

* never be onlined. It's better to use memory hotplug callback

4731

* never be onlined. It's better to use memory hotplug callback

4726

* function.

4732

* function.

4727

*/

4733

*/

4728

if (!node_state(node, N_NORMAL_MEMORY))

4734

if (!node_state(node, N_NORMAL_MEMORY))

4729

tmp = -1;

4735

tmp = -1;

4730

pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);

4736

pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);

4731

if (!pn)

4737

if (!pn)

4732

return 1;

4738

return 1;

4733

4739

4734

for (zone = 0; zone < MAX_NR_ZONES; zone++) {

4740

for (zone = 0; zone < MAX_NR_ZONES; zone++) {

4735

mz = &pn->zoneinfo[zone];

4741

mz = &pn->zoneinfo[zone];

4736

lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);

4742

lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);

4737

mz->usage_in_excess = 0;

4743

mz->usage_in_excess = 0;

4738

mz->on_tree = false;

4744

mz->on_tree = false;

4739

mz->memcg = memcg;

4745

mz->memcg = memcg;

4740

}

4746

}

4741

memcg->info.nodeinfo[node] = pn;

4747

memcg->info.nodeinfo[node] = pn;

4742

return 0;

4748

return 0;

4743

}

4749

}

4744

4750

4745

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)

4751

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)

4746

{

4752

{

4747

kfree(memcg->info.nodeinfo[node]);

4753

kfree(memcg->info.nodeinfo[node]);

4748

}

4754

}

4749

4755

4750

static struct mem_cgroup *mem_cgroup_alloc(void)

4756

static struct mem_cgroup *mem_cgroup_alloc(void)

4751

{

4757

{

4752

struct mem_cgroup *memcg;

4758

struct mem_cgroup *memcg;

4753

int size = sizeof(struct mem_cgroup);

4759

int size = sizeof(struct mem_cgroup);

4754

4760

4755

/* Can be very big if MAX_NUMNODES is very big */

4761

/* Can be very big if MAX_NUMNODES is very big */

4756

if (size < PAGE_SIZE)

4762

if (size < PAGE_SIZE)

4757

memcg = kzalloc(size, GFP_KERNEL);

4763

memcg = kzalloc(size, GFP_KERNEL);

4758

else

4764

else

4759

memcg = vzalloc(size);

4765

memcg = vzalloc(size);

4760

4766

4761

if (!memcg)

4767

if (!memcg)

4762

return NULL;

4768

return NULL;

4763

4769

4764

memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);

4770

memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);

4765

if (!memcg->stat)

4771

if (!memcg->stat)

4766

goto out_free;

4772

goto out_free;

4767

spin_lock_init(&memcg->pcp_counter_lock);

4773

spin_lock_init(&memcg->pcp_counter_lock);

4768

return memcg;

4774

return memcg;

4769

4775

4770

out_free:

4776

out_free:

4771

if (size < PAGE_SIZE)

4777

if (size < PAGE_SIZE)

4772

kfree(memcg);

4778

kfree(memcg);

4773

else

4779

else

4774

vfree(memcg);

4780

vfree(memcg);

4775

return NULL;

4781

return NULL;

4776

}

4782

}

4777

4783

4778

/*

4784

/*

4779

* Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,

4785

* Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,

4780

* but in process context. The work_freeing structure is overlaid

4786

* but in process context. The work_freeing structure is overlaid

4781

* on the rcu_freeing structure, which itself is overlaid on memsw.

4787

* on the rcu_freeing structure, which itself is overlaid on memsw.

4782

*/

4788

*/

4783

static void free_work(struct work_struct *work)

4789

static void free_work(struct work_struct *work)

4784

{

4790

{

4785

struct mem_cgroup *memcg;

4791

struct mem_cgroup *memcg;

4786

int size = sizeof(struct mem_cgroup);

4792

int size = sizeof(struct mem_cgroup);

4787

4793

4788

memcg = container_of(work, struct mem_cgroup, work_freeing);

4794

memcg = container_of(work, struct mem_cgroup, work_freeing);

4789

/*

4795

/*

4790

* We need to make sure that (at least for now), the jump label

4796

* We need to make sure that (at least for now), the jump label

4791

* destruction code runs outside of the cgroup lock. This is because

4797

* destruction code runs outside of the cgroup lock. This is because

4792

* get_online_cpus(), which is called from the static_branch update,

4798

* get_online_cpus(), which is called from the static_branch update,

4793

* can't be called inside the cgroup_lock. cpusets are the ones

4799

* can't be called inside the cgroup_lock. cpusets are the ones

4794

* enforcing this dependency, so if they ever change, we might as well.

4800

* enforcing this dependency, so if they ever change, we might as well.

4795

*

4801

*

4796

* schedule_work() will guarantee this happens. Be careful if you need

4802

* schedule_work() will guarantee this happens. Be careful if you need

4797

* to move this code around, and make sure it is outside

4803

* to move this code around, and make sure it is outside

4798

* the cgroup_lock.

4804

* the cgroup_lock.

4799

*/

4805

*/

4800

disarm_sock_keys(memcg);

4806

disarm_sock_keys(memcg);

4801

if (size < PAGE_SIZE)

4807

if (size < PAGE_SIZE)

4802

kfree(memcg);

4808

kfree(memcg);

4803

else

4809

else

4804

vfree(memcg);

4810

vfree(memcg);

4805

}

4811

}

4806

4812

4807

static void free_rcu(struct rcu_head *rcu_head)

4813

static void free_rcu(struct rcu_head *rcu_head)

4808

{

4814

{

4809

struct mem_cgroup *memcg;

4815

struct mem_cgroup *memcg;

4810

4816

4811

memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);

4817

memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);

4812

INIT_WORK(&memcg->work_freeing, free_work);

4818

INIT_WORK(&memcg->work_freeing, free_work);

4813

schedule_work(&memcg->work_freeing);

4819

schedule_work(&memcg->work_freeing);

4814

}

4820

}

4815

4821

4816

/*

4822

/*

4817

* At destroying mem_cgroup, references from swap_cgroup can remain.

4823

* At destroying mem_cgroup, references from swap_cgroup can remain.

4818

* (scanning all at force_empty is too costly...)

4824

* (scanning all at force_empty is too costly...)

4819

*

4825

*

4820

* Instead of clearing all references at force_empty, we remember

4826

* Instead of clearing all references at force_empty, we remember

4821

* the number of reference from swap_cgroup and free mem_cgroup when

4827

* the number of reference from swap_cgroup and free mem_cgroup when

4822

* it goes down to 0.

4828

* it goes down to 0.

4823

*

4829

*

4824

* Removal of cgroup itself succeeds regardless of refs from swap.

4830

* Removal of cgroup itself succeeds regardless of refs from swap.

4825

*/

4831

*/

4826

4832

4827

static void __mem_cgroup_free(struct mem_cgroup *memcg)

4833

static void __mem_cgroup_free(struct mem_cgroup *memcg)

4828

{

4834

{

4829

int node;

4835

int node;

4830

4836

4831

mem_cgroup_remove_from_trees(memcg);

4837

mem_cgroup_remove_from_trees(memcg);

4832

free_css_id(&mem_cgroup_subsys, &memcg->css);

4838

free_css_id(&mem_cgroup_subsys, &memcg->css);

4833

4839

4834

for_each_node(node)

4840

for_each_node(node)

4835

free_mem_cgroup_per_zone_info(memcg, node);

4841

free_mem_cgroup_per_zone_info(memcg, node);

4836

4842

4837

free_percpu(memcg->stat);

4843

free_percpu(memcg->stat);

4838

call_rcu(&memcg->rcu_freeing, free_rcu);

4844

call_rcu(&memcg->rcu_freeing, free_rcu);

4839

}

4845

}

4840

4846

4841

static void mem_cgroup_get(struct mem_cgroup *memcg)

4847

static void mem_cgroup_get(struct mem_cgroup *memcg)

4842

{

4848

{

4843

atomic_inc(&memcg->refcnt);

4849

atomic_inc(&memcg->refcnt);

4844

}

4850

}

4845

4851

4846

static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)

4852

static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)

4847

{

4853

{

4848

if (atomic_sub_and_test(count, &memcg->refcnt)) {

4854

if (atomic_sub_and_test(count, &memcg->refcnt)) {

4849

struct mem_cgroup *parent = parent_mem_cgroup(memcg);

4855

struct mem_cgroup *parent = parent_mem_cgroup(memcg);

4850

__mem_cgroup_free(memcg);

4856

__mem_cgroup_free(memcg);

4851

if (parent)

4857

if (parent)

4852

mem_cgroup_put(parent);

4858

mem_cgroup_put(parent);

4853

}

4859

}

4854

}

4860

}

4855

4861

4856

static void mem_cgroup_put(struct mem_cgroup *memcg)

4862

static void mem_cgroup_put(struct mem_cgroup *memcg)

4857

{

4863

{

4858

__mem_cgroup_put(memcg, 1);

4864

__mem_cgroup_put(memcg, 1);

4859

}

4865

}

4860

4866

4861

/*

4867

/*

4862

* Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.

4868

* Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.

4863

*/

4869

*/

4864

struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)

4870

struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)

4865

{

4871

{

4866

if (!memcg->res.parent)

4872

if (!memcg->res.parent)

4867

return NULL;

4873

return NULL;

4868

return mem_cgroup_from_res_counter(memcg->res.parent, res);

4874

return mem_cgroup_from_res_counter(memcg->res.parent, res);

4869

}

4875

}

4870

EXPORT_SYMBOL(parent_mem_cgroup);

4876

EXPORT_SYMBOL(parent_mem_cgroup);

4871

4877

4872

#ifdef CONFIG_MEMCG_SWAP

4878

#ifdef CONFIG_MEMCG_SWAP

4873

static void __init enable_swap_cgroup(void)

4879

static void __init enable_swap_cgroup(void)

4874

{

4880

{

4875

if (!mem_cgroup_disabled() && really_do_swap_account)

4881

if (!mem_cgroup_disabled() && really_do_swap_account)

4876

do_swap_account = 1;

4882

do_swap_account = 1;

4877

}

4883

}

4878

#else

4884

#else

4879

static void __init enable_swap_cgroup(void)

4885

static void __init enable_swap_cgroup(void)

4880

{

4886

{

4881

}

4887

}

4882

#endif

4888

#endif

4883

4889

4884

static int mem_cgroup_soft_limit_tree_init(void)

4890

static int mem_cgroup_soft_limit_tree_init(void)

4885

{

4891

{

4886

struct mem_cgroup_tree_per_node *rtpn;

4892

struct mem_cgroup_tree_per_node *rtpn;

4887

struct mem_cgroup_tree_per_zone *rtpz;

4893

struct mem_cgroup_tree_per_zone *rtpz;

4888

int tmp, node, zone;

4894

int tmp, node, zone;

4889

4895

4890

for_each_node(node) {

4896

for_each_node(node) {

4891

tmp = node;

4897

tmp = node;

4892

if (!node_state(node, N_NORMAL_MEMORY))

4898

if (!node_state(node, N_NORMAL_MEMORY))

4893

tmp = -1;

4899

tmp = -1;

4894

rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);

4900

rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);

4895

if (!rtpn)

4901

if (!rtpn)

4896

goto err_cleanup;

4902

goto err_cleanup;

4897

4903

4898

soft_limit_tree.rb_tree_per_node[node] = rtpn;

4904

soft_limit_tree.rb_tree_per_node[node] = rtpn;

4899

4905

4900

for (zone = 0; zone < MAX_NR_ZONES; zone++) {

4906

for (zone = 0; zone < MAX_NR_ZONES; zone++) {

4901

rtpz = &rtpn->rb_tree_per_zone[zone];

4907

rtpz = &rtpn->rb_tree_per_zone[zone];

4902

rtpz->rb_root = RB_ROOT;

4908

rtpz->rb_root = RB_ROOT;

4903

spin_lock_init(&rtpz->lock);

4909

spin_lock_init(&rtpz->lock);

4904

}

4910

}

4905

}

4911

}

4906

return 0;

4912

return 0;

4907

4913

4908

err_cleanup:

4914

err_cleanup:

4909

for_each_node(node) {

4915

for_each_node(node) {

4910

if (!soft_limit_tree.rb_tree_per_node[node])

4916

if (!soft_limit_tree.rb_tree_per_node[node])

4911

break;

4917

break;

4912

kfree(soft_limit_tree.rb_tree_per_node[node]);

4918

kfree(soft_limit_tree.rb_tree_per_node[node]);

4913

soft_limit_tree.rb_tree_per_node[node] = NULL;

4919

soft_limit_tree.rb_tree_per_node[node] = NULL;

4914

}

4920

}

4915

return 1;

4921

return 1;

4916

4922

4917

}

4923

}

4918

4924

4919

static struct cgroup_subsys_state * __ref

4925

static struct cgroup_subsys_state * __ref

4920

mem_cgroup_create(struct cgroup *cont)

4926

mem_cgroup_create(struct cgroup *cont)

4921

{

4927

{

4922

struct mem_cgroup *memcg, *parent;

4928

struct mem_cgroup *memcg, *parent;

4923

long error = -ENOMEM;

4929

long error = -ENOMEM;

4924

int node;

4930

int node;

4925

4931

4926

memcg = mem_cgroup_alloc();

4932

memcg = mem_cgroup_alloc();

4927

if (!memcg)

4933

if (!memcg)

4928

return ERR_PTR(error);

4934

return ERR_PTR(error);

4929

4935

4930

for_each_node(node)

4936

for_each_node(node)

4931

if (alloc_mem_cgroup_per_zone_info(memcg, node))

4937

if (alloc_mem_cgroup_per_zone_info(memcg, node))

4932

goto free_out;

4938

goto free_out;

4933

4939

4934

/* root ? */

4940

/* root ? */

4935

if (cont->parent == NULL) {

4941

if (cont->parent == NULL) {

4936

int cpu;

4942

int cpu;

4937

enable_swap_cgroup();

4943

enable_swap_cgroup();

4938

parent = NULL;

4944

parent = NULL;

4939

if (mem_cgroup_soft_limit_tree_init())

4945

if (mem_cgroup_soft_limit_tree_init())

4940

goto free_out;

4946

goto free_out;

4941

root_mem_cgroup = memcg;

4947

root_mem_cgroup = memcg;

4942

for_each_possible_cpu(cpu) {

4948

for_each_possible_cpu(cpu) {

4943

struct memcg_stock_pcp *stock =

4949

struct memcg_stock_pcp *stock =

4944

&per_cpu(memcg_stock, cpu);

4950

&per_cpu(memcg_stock, cpu);

4945

INIT_WORK(&stock->work, drain_local_stock);

4951

INIT_WORK(&stock->work, drain_local_stock);

4946

}

4952

}

4947

hotcpu_notifier(memcg_cpu_hotplug_callback, 0);

4953

hotcpu_notifier(memcg_cpu_hotplug_callback, 0);

4948

} else {

4954

} else {

4949

parent = mem_cgroup_from_cont(cont->parent);

4955

parent = mem_cgroup_from_cont(cont->parent);

4950

memcg->use_hierarchy = parent->use_hierarchy;

4956

memcg->use_hierarchy = parent->use_hierarchy;

4951

memcg->oom_kill_disable = parent->oom_kill_disable;

4957

memcg->oom_kill_disable = parent->oom_kill_disable;

4952

}

4958

}

4953

4959

4954

if (parent && parent->use_hierarchy) {

4960

if (parent && parent->use_hierarchy) {

4955

res_counter_init(&memcg->res, &parent->res);

4961

res_counter_init(&memcg->res, &parent->res);

4956

res_counter_init(&memcg->memsw, &parent->memsw);

4962

res_counter_init(&memcg->memsw, &parent->memsw);

4957

/*

4963

/*

4958

* We increment refcnt of the parent to ensure that we can

4964

* We increment refcnt of the parent to ensure that we can

4959

* safely access it on res_counter_charge/uncharge.

4965

* safely access it on res_counter_charge/uncharge.

4960

* This refcnt will be decremented when freeing this

4966

* This refcnt will be decremented when freeing this

4961

* mem_cgroup(see mem_cgroup_put).

4967

* mem_cgroup(see mem_cgroup_put).

4962

*/

4968

*/

4963

mem_cgroup_get(parent);

4969

mem_cgroup_get(parent);

4964

} else {

4970

} else {

4965

res_counter_init(&memcg->res, NULL);

4971

res_counter_init(&memcg->res, NULL);

4966

res_counter_init(&memcg->memsw, NULL);

4972

res_counter_init(&memcg->memsw, NULL);

4967

}

4973

}

4968

memcg->last_scanned_node = MAX_NUMNODES;

4974

memcg->last_scanned_node = MAX_NUMNODES;

4969

INIT_LIST_HEAD(&memcg->oom_notify);

4975

INIT_LIST_HEAD(&memcg->oom_notify);

4970

4976

4971

if (parent)

4977

if (parent)

4972

memcg->swappiness = mem_cgroup_swappiness(parent);

4978

memcg->swappiness = mem_cgroup_swappiness(parent);

4973

atomic_set(&memcg->refcnt, 1);

4979

atomic_set(&memcg->refcnt, 1);

4974

memcg->move_charge_at_immigrate = 0;

4980

memcg->move_charge_at_immigrate = 0;

4975

mutex_init(&memcg->thresholds_lock);

4981

mutex_init(&memcg->thresholds_lock);

4976

spin_lock_init(&memcg->move_lock);

4982

spin_lock_init(&memcg->move_lock);

4977

4983

4978

error = memcg_init_kmem(memcg, &mem_cgroup_subsys);

4984

error = memcg_init_kmem(memcg, &mem_cgroup_subsys);

4979

if (error) {

4985

if (error) {

4980

/*

4986

/*

4981

* We call put now because our (and parent's) refcnts

4987

* We call put now because our (and parent's) refcnts

4982

* are already in place. mem_cgroup_put() will internally

4988

* are already in place. mem_cgroup_put() will internally

4983

* call __mem_cgroup_free, so return directly

4989

* call __mem_cgroup_free, so return directly

4984

*/

4990

*/

4985

mem_cgroup_put(memcg);

4991

mem_cgroup_put(memcg);

4986

return ERR_PTR(error);

4992

return ERR_PTR(error);

4987

}

4993

}

4988

return &memcg->css;

4994

return &memcg->css;

4989

free_out:

4995

free_out:

4990

__mem_cgroup_free(memcg);

4996

__mem_cgroup_free(memcg);

4991

return ERR_PTR(error);

4997

return ERR_PTR(error);

4992

}

4998

}

4993

4999

4994

static int mem_cgroup_pre_destroy(struct cgroup *cont)

5000

static int mem_cgroup_pre_destroy(struct cgroup *cont)

4995

{

5001

{

4996

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

5002

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

4997

5003

4998

return mem_cgroup_force_empty(memcg, false);

5004

return mem_cgroup_force_empty(memcg, false);

4999

}

5005

}

5000

5006

5001

static void mem_cgroup_destroy(struct cgroup *cont)

5007

static void mem_cgroup_destroy(struct cgroup *cont)

5002

{

5008

{

5003

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

5009

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

5004

5010

5005

kmem_cgroup_destroy(memcg);

5011

kmem_cgroup_destroy(memcg);

5006

5012

5007

mem_cgroup_put(memcg);

5013

mem_cgroup_put(memcg);

5008

}

5014

}

5009

5015

5010

#ifdef CONFIG_MMU

5016

#ifdef CONFIG_MMU

5011

/* Handlers for move charge at task migration. */

5017

/* Handlers for move charge at task migration. */

5012

#define PRECHARGE_COUNT_AT_ONCE 256

5018

#define PRECHARGE_COUNT_AT_ONCE 256

5013

static int mem_cgroup_do_precharge(unsigned long count)

5019

static int mem_cgroup_do_precharge(unsigned long count)

5014

{

5020

{

5015

int ret = 0;

5021

int ret = 0;

5016

int batch_count = PRECHARGE_COUNT_AT_ONCE;

5022

int batch_count = PRECHARGE_COUNT_AT_ONCE;

5017

struct mem_cgroup *memcg = mc.to;

5023

struct mem_cgroup *memcg = mc.to;

5018

5024

5019

if (mem_cgroup_is_root(memcg)) {

5025

if (mem_cgroup_is_root(memcg)) {

5020

mc.precharge += count;

5026

mc.precharge += count;

5021

/* we don't need css_get for root */

5027

/* we don't need css_get for root */

5022

return ret;

5028

return ret;

5023

}

5029

}

5024

/* try to charge at once */

5030

/* try to charge at once */

5025

if (count > 1) {

5031

if (count > 1) {

5026

struct res_counter *dummy;

5032

struct res_counter *dummy;

5027

/*

5033

/*

5028

* "memcg" cannot be under rmdir() because we've already checked

5034

* "memcg" cannot be under rmdir() because we've already checked

5029

* by cgroup_lock_live_cgroup() that it is not removed and we

5035

* by cgroup_lock_live_cgroup() that it is not removed and we

5030

* are still under the same cgroup_mutex. So we can postpone

5036

* are still under the same cgroup_mutex. So we can postpone

5031

* css_get().

5037

* css_get().

5032

*/

5038

*/

5033

if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))

5039

if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))

5034

goto one_by_one;

5040

goto one_by_one;

5035

if (do_swap_account && res_counter_charge(&memcg->memsw,

5041

if (do_swap_account && res_counter_charge(&memcg->memsw,

5036

PAGE_SIZE * count, &dummy)) {

5042

PAGE_SIZE * count, &dummy)) {

5037

res_counter_uncharge(&memcg->res, PAGE_SIZE * count);

5043

res_counter_uncharge(&memcg->res, PAGE_SIZE * count);

5038

goto one_by_one;

5044

goto one_by_one;

5039

}

5045

}

5040

mc.precharge += count;

5046

mc.precharge += count;

5041

return ret;

5047

return ret;

5042

}

5048

}

5043

one_by_one:

5049

one_by_one:

5044

/* fall back to one by one charge */

5050

/* fall back to one by one charge */

5045

while (count--) {

5051

while (count--) {

5046

if (signal_pending(current)) {

5052

if (signal_pending(current)) {

5047

ret = -EINTR;

5053

ret = -EINTR;

5048

break;

5054

break;

5049

}

5055

}

5050

if (!batch_count--) {

5056

if (!batch_count--) {

5051

batch_count = PRECHARGE_COUNT_AT_ONCE;

5057

batch_count = PRECHARGE_COUNT_AT_ONCE;

5052

cond_resched();

5058

cond_resched();

5053

}

5059

}

5054

ret = __mem_cgroup_try_charge(NULL,

5060

ret = __mem_cgroup_try_charge(NULL,

5055

GFP_KERNEL, 1, &memcg, false);

5061

GFP_KERNEL, 1, &memcg, false);

5056

if (ret)

5062

if (ret)

5057

/* mem_cgroup_clear_mc() will do uncharge later */

5063

/* mem_cgroup_clear_mc() will do uncharge later */

5058

return ret;

5064

return ret;

5059

mc.precharge++;

5065

mc.precharge++;

5060

}

5066

}

5061

return ret;

5067

return ret;

5062

}

5068

}

5063

5069

5064

/**
 * get_mctgt_type - get target type of moving charge
 * @vma: the vma to which the pte to be checked belongs
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: where the target page or swap entry will be stored (can be NULL)
 *
 * Returns
 *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
 *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
 *     move charge. If @target is not NULL, the page is stored in target->page
 *     with an extra reference taken (callers should handle it).
 *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
 *     target for charge migration. If @target is not NULL, the entry is stored
 *     in target->ent.
 *
 * Called with pte lock held.
 */

5082

union mc_target {

5088

union mc_target {

5083

struct page *page;

5089

struct page *page;

5084

swp_entry_t ent;

5090

swp_entry_t ent;

5085

};

5091

};

5086

5092

5087

/* Classification of a pte/pmd as a candidate for moving its charge. */
enum mc_target_type {
	MC_TARGET_NONE = 0,	/* not a target for move charge */
	MC_TARGET_PAGE = 1,	/* the mapped page is a target */
	MC_TARGET_SWAP = 2,	/* the swap entry is a target */
};

5092

5098

5093

/*
 * Look up the page behind a present pte and decide whether it may be moved.
 *
 * Returns the page with an extra reference taken, or NULL when there is no
 * normal page, the page is not mapped, the matching move type (anon/file)
 * is not enabled, or the reference could not be obtained.
 */
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
						unsigned long addr, pte_t ptent)
{
	struct page *pg = vm_normal_page(vma, addr, ptent);

	if (!pg)
		return NULL;
	if (!page_mapped(pg))
		return NULL;

	/*
	 * Anon pages need move_anon() (we don't move shared anon); anything
	 * else needs move_file() (mapcount is ignored for file pages).
	 */
	if (PageAnon(pg) ? !move_anon() : !move_file())
		return NULL;

	return get_page_unless_zero(pg) ? pg : NULL;
}

5112

5118

5113

#ifdef CONFIG_SWAP

5119

#ifdef CONFIG_SWAP

5114

static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,

5120

static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,

5115

unsigned long addr, pte_t ptent, swp_entry_t *entry)

5121

unsigned long addr, pte_t ptent, swp_entry_t *entry)

5116

{

5122

{

5117

struct page *page = NULL;

5123

struct page *page = NULL;

5118

swp_entry_t ent = pte_to_swp_entry(ptent);

5124

swp_entry_t ent = pte_to_swp_entry(ptent);

5119

5125

5120

if (!move_anon() || non_swap_entry(ent))

5126

if (!move_anon() || non_swap_entry(ent))

5121

return NULL;

5127

return NULL;

5122

/*

5128

/*

5123

* Because lookup_swap_cache() updates some statistics counter,

5129

* Because lookup_swap_cache() updates some statistics counter,

5124

* we call find_get_page() with swapper_space directly.

5130

* we call find_get_page() with swapper_space directly.

5125

*/

5131

*/

5126

page = find_get_page(&swapper_space, ent.val);

5132

page = find_get_page(&swapper_space, ent.val);

5127

if (do_swap_account)

5133

if (do_swap_account)

5128

entry->val = ent.val;

5134

entry->val = ent.val;

5129

5135

5130

return page;

5136

return page;

5131

}

5137

}

5132

#else

5138

#else

5133

static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,

5139

static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,

5134

unsigned long addr, pte_t ptent, swp_entry_t *entry)

5140

unsigned long addr, pte_t ptent, swp_entry_t *entry)

5135

{

5141

{

5136

return NULL;

5142

return NULL;

5137

}

5143

}

5138

#endif

5144

#endif

5139

5145

5140

/*
 * Handle a none/file pte in a file-backed vma: look the page up in the
 * file's mapping at the offset the pte (or the address) corresponds to.
 *
 * Returns the looked-up page (or NULL), or NULL right away for an
 * anonymous vma or when file moving is disabled. With CONFIG_SWAP, an
 * exceptional radix-tree entry is converted to a swap entry, reported via
 * @entry if do_swap_account is set, and looked up in swapper_space instead.
 */
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
			unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
	struct address_space *mapping;
	struct page *found;
	pgoff_t index;

	/* anonymous vma, or file moving not requested */
	if (!vma->vm_file || !move_file())
		return NULL;

	mapping = vma->vm_file->f_mapping;

	if (pte_none(ptent)) {
		index = linear_page_index(vma, addr);
	} else {
		/* pte_file(ptent) is true */
		index = pte_to_pgoff(ptent);
	}

	/* page is moved even if it's not RSS of this task(page-faulted). */
	found = find_get_page(mapping, index);

#ifdef CONFIG_SWAP
	/* shmem/tmpfs may report page out on swap: account for that too. */
	if (radix_tree_exceptional_entry(found)) {
		swp_entry_t swp = radix_to_swp_entry(found);

		if (do_swap_account)
			*entry = swp;
		found = find_get_page(&swapper_space, swp.val);
	}
#endif
	return found;
}

5172

5178

5173

/*
 * Classify @ptent as a move-charge target.
 *
 * Dispatches to the present/swap/file pte handlers, then checks whether the
 * resulting page is charged to mc.from (MC_TARGET_PAGE) or, failing that,
 * whether the swap entry is recorded against mc.from (MC_TARGET_SWAP).
 * When @target is non-NULL the page (with its reference kept) or the swap
 * entry is stored there; otherwise the page reference is dropped here.
 */
static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
		unsigned long addr, pte_t ptent, union mc_target *target)
{
	swp_entry_t swp = { .val = 0 };
	struct page *pg = NULL;

	if (pte_present(ptent))
		pg = mc_handle_present_pte(vma, addr, ptent);
	else if (is_swap_pte(ptent))
		pg = mc_handle_swap_pte(vma, addr, ptent, &swp);
	else if (pte_none(ptent) || pte_file(ptent))
		pg = mc_handle_file_pte(vma, addr, ptent, &swp);

	if (pg) {
		struct page_cgroup *pc = lookup_page_cgroup(pg);

		/*
		 * Do only loose check w/o page_cgroup lock.
		 * mem_cgroup_move_account() checks the pc is valid or not
		 * under the lock.
		 */
		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
			if (target)
				target->page = pg;	/* caller owns the ref */
			else
				put_page(pg);
			return MC_TARGET_PAGE;
		}
		put_page(pg);
	}

	/* There is a swap entry and a page doesn't exist or isn't charged */
	if (swp.val && css_id(&mc.from->css) == lookup_swap_cgroup_id(swp)) {
		if (target)
			target->ent = swp;
		return MC_TARGET_SWAP;
	}

	return MC_TARGET_NONE;
}

5214

5220

5215

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

5221

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

5216

/*

5222

/*

5217

* We don't consider swapping or file mapped pages because THP does not

5223

* We don't consider swapping or file mapped pages because THP does not

5218

* support them for now.

5224

* support them for now.

5219

* Caller should make sure that pmd_trans_huge(pmd) is true.

5225

* Caller should make sure that pmd_trans_huge(pmd) is true.

5220

*/

5226

*/

5221

static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,

5227

static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,

5222

unsigned long addr, pmd_t pmd, union mc_target *target)

5228

unsigned long addr, pmd_t pmd, union mc_target *target)

5223

{

5229

{

5224

struct page *page = NULL;

5230

struct page *page = NULL;

5225

struct page_cgroup *pc;

5231

struct page_cgroup *pc;

5226

enum mc_target_type ret = MC_TARGET_NONE;

5232

enum mc_target_type ret = MC_TARGET_NONE;

5227

5233

5228

page = pmd_page(pmd);

5234

page = pmd_page(pmd);

5229

VM_BUG_ON(!page || !PageHead(page));

5235

VM_BUG_ON(!page || !PageHead(page));

5230

if (!move_anon())

5236

if (!move_anon())

5231

return ret;

5237

return ret;

5232

pc = lookup_page_cgroup(page);

5238

pc = lookup_page_cgroup(page);

5233

if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {

5239

if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {

5234

ret = MC_TARGET_PAGE;

5240

ret = MC_TARGET_PAGE;

5235

if (target) {

5241

if (target) {

5236

get_page(page);

5242

get_page(page);

5237

target->page = page;

5243

target->page = page;

5238

}

5244

}

5239

}

5245

}

5240

return ret;

5246

return ret;

5241

}

5247

}

5242

#else

5248

#else

5243

static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,

5249

static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,

5244

unsigned long addr, pmd_t pmd, union mc_target *target)

5250

unsigned long addr, pmd_t pmd, union mc_target *target)

5245

{

5251

{

5246

return MC_TARGET_NONE;

5252

return MC_TARGET_NONE;

5247

}

5253

}

5248

#endif

5254

#endif

5249

5255

5250

/*
 * pmd_entry callback used while counting: add to mc.precharge the number of
 * pages in [@addr, @end) that are move-charge targets. walk->private
 * carries the vma being walked. Always returns 0.
 */
static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
					unsigned long addr, unsigned long end,
					struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	struct mm_struct *mm = vma->vm_mm;
	spinlock_t *ptl;
	pte_t *cur;

	/* Huge pmd: counts as HPAGE_PMD_NR pages if it is a page target. */
	if (pmd_trans_huge_lock(pmd, vma) == 1) {
		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
			mc.precharge += HPAGE_PMD_NR;
		spin_unlock(&mm->page_table_lock);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;

	cur = pte_offset_map_lock(mm, pmd, addr, &ptl);
	while (addr != end) {
		if (get_mctgt_type(vma, addr, *cur, NULL) != MC_TARGET_NONE)
			mc.precharge++;	/* increment precharge temporarily */
		cur++;
		addr += PAGE_SIZE;
	}
	pte_unmap_unlock(cur - 1, ptl);
	cond_resched();

	return 0;
}

5276

5282

5277

static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)

5283

static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)

5278

{

5284

{

5279

unsigned long precharge;

5285

unsigned long precharge;

5280

struct vm_area_struct *vma;

5286

struct vm_area_struct *vma;

5281

5287

5282

down_read(&mm->mmap_sem);

5288

down_read(&mm->mmap_sem);

5283

for (vma = mm->mmap; vma; vma = vma->vm_next) {

5289

for (vma = mm->mmap; vma; vma = vma->vm_next) {

5284

struct mm_walk mem_cgroup_count_precharge_walk = {

5290

struct mm_walk mem_cgroup_count_precharge_walk = {

5285

.pmd_entry = mem_cgroup_count_precharge_pte_range,

5291

.pmd_entry = mem_cgroup_count_precharge_pte_range,

5286

.mm = mm,

5292

.mm = mm,

5287

.private = vma,

5293

.private = vma,

5288

};

5294

};

5289

if (is_vm_hugetlb_page(vma))

5295

if (is_vm_hugetlb_page(vma))

5290

continue;

5296

continue;

5291

walk_page_range(vma->vm_start, vma->vm_end,

5297

walk_page_range(vma->vm_start, vma->vm_end,

5292

&mem_cgroup_count_precharge_walk);

5298

&mem_cgroup_count_precharge_walk);

5293

}

5299

}

5294

up_read(&mm->mmap_sem);

5300

up_read(&mm->mmap_sem);

5295

5301

5296

precharge = mc.precharge;

5302

precharge = mc.precharge;

5297

mc.precharge = 0;

5303

mc.precharge = 0;

5298

5304

5299

return precharge;

5305

return precharge;

5300

}

5306

}

5301

5307

5302

static int mem_cgroup_precharge_mc(struct mm_struct *mm)

5308

static int mem_cgroup_precharge_mc(struct mm_struct *mm)

5303

{

5309

{

5304

unsigned long precharge = mem_cgroup_count_precharge(mm);

5310

unsigned long precharge = mem_cgroup_count_precharge(mm);

5305

5311

5306

VM_BUG_ON(mc.moving_task);

5312

VM_BUG_ON(mc.moving_task);

5307

mc.moving_task = current;

5313

mc.moving_task = current;

5308

return mem_cgroup_do_precharge(precharge);

5314

return mem_cgroup_do_precharge(precharge);

5309

}

5315

}

5310

5316

5311

/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */

5317

/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */

5312

static void __mem_cgroup_clear_mc(void)

5318

static void __mem_cgroup_clear_mc(void)

5313

{

5319

{

5314

struct mem_cgroup *from = mc.from;

5320

struct mem_cgroup *from = mc.from;

5315

struct mem_cgroup *to = mc.to;

5321

struct mem_cgroup *to = mc.to;

5316

5322

5317

/* we must uncharge all the leftover precharges from mc.to */

5323

/* we must uncharge all the leftover precharges from mc.to */

5318

if (mc.precharge) {

5324

if (mc.precharge) {

5319

__mem_cgroup_cancel_charge(mc.to, mc.precharge);

5325

__mem_cgroup_cancel_charge(mc.to, mc.precharge);

5320

mc.precharge = 0;

5326

mc.precharge = 0;

5321

}

5327

}

5322

/*

5328

/*

5323

* we didn't uncharge from mc.from at mem_cgroup_move_account(), so

5329

* we didn't uncharge from mc.from at mem_cgroup_move_account(), so

5324

* we must uncharge here.

5330

* we must uncharge here.

5325

*/

5331

*/

5326

if (mc.moved_charge) {

5332

if (mc.moved_charge) {

5327

__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);

5333

__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);

5328

mc.moved_charge = 0;

5334

mc.moved_charge = 0;

5329

}

5335

}

5330

/* we must fixup refcnts and charges */

5336

/* we must fixup refcnts and charges */

5331

if (mc.moved_swap) {

5337

if (mc.moved_swap) {

5332

/* uncharge swap account from the old cgroup */

5338

/* uncharge swap account from the old cgroup */

5333

if (!mem_cgroup_is_root(mc.from))

5339

if (!mem_cgroup_is_root(mc.from))

5334

res_counter_uncharge(&mc.from->memsw,

5340

res_counter_uncharge(&mc.from->memsw,

5335

PAGE_SIZE * mc.moved_swap);

5341

PAGE_SIZE * mc.moved_swap);

5336

__mem_cgroup_put(mc.from, mc.moved_swap);

5342

__mem_cgroup_put(mc.from, mc.moved_swap);

5337

5343

5338

if (!mem_cgroup_is_root(mc.to)) {

5344

if (!mem_cgroup_is_root(mc.to)) {

5339

/*

5345

/*

5340

* we charged both to->res and to->memsw, so we should

5346

* we charged both to->res and to->memsw, so we should

5341

* uncharge to->res.

5347

* uncharge to->res.

5342

*/

5348

*/

5343

res_counter_uncharge(&mc.to->res,

5349

res_counter_uncharge(&mc.to->res,

5344

PAGE_SIZE * mc.moved_swap);

5350

PAGE_SIZE * mc.moved_swap);

5345

}

5351

}

5346

/* we've already done mem_cgroup_get(mc.to) */

5352

/* we've already done mem_cgroup_get(mc.to) */

5347

mc.moved_swap = 0;

5353

mc.moved_swap = 0;

5348

}

5354

}

5349

memcg_oom_recover(from);

5355

memcg_oom_recover(from);

5350

memcg_oom_recover(to);

5356

memcg_oom_recover(to);

5351

wake_up_all(&mc.waitq);

5357

wake_up_all(&mc.waitq);

5352

}

5358

}

5353

5359

5354

static void mem_cgroup_clear_mc(void)

5360

static void mem_cgroup_clear_mc(void)

5355

{

5361

{

5356

struct mem_cgroup *from = mc.from;

5362

struct mem_cgroup *from = mc.from;

5357

5363

5358

/*

5364

/*

5359

* we must clear moving_task before waking up waiters at the end of

5365

* we must clear moving_task before waking up waiters at the end of

5360

* task migration.

5366

* task migration.

5361

*/

5367

*/

5362

mc.moving_task = NULL;

5368

mc.moving_task = NULL;

5363

__mem_cgroup_clear_mc();

5369

__mem_cgroup_clear_mc();

5364

spin_lock(&mc.lock);

5370

spin_lock(&mc.lock);

5365

mc.from = NULL;

5371

mc.from = NULL;

5366

mc.to = NULL;

5372

mc.to = NULL;

5367

spin_unlock(&mc.lock);

5373

spin_unlock(&mc.lock);

5368

mem_cgroup_end_move(from);

5374

mem_cgroup_end_move(from);

5369

}

5375

}

5370

5376

5371

static int mem_cgroup_can_attach(struct cgroup *cgroup,

5377

static int mem_cgroup_can_attach(struct cgroup *cgroup,

5372

struct cgroup_taskset *tset)

5378

struct cgroup_taskset *tset)

5373

{

5379

{

5374

struct task_struct *p = cgroup_taskset_first(tset);

5380

struct task_struct *p = cgroup_taskset_first(tset);

5375

int ret = 0;

5381

int ret = 0;

5376

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);

5382

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);

5377

5383

5378

if (memcg->move_charge_at_immigrate) {

5384

if (memcg->move_charge_at_immigrate) {

5379

struct mm_struct *mm;

5385

struct mm_struct *mm;

5380

struct mem_cgroup *from = mem_cgroup_from_task(p);

5386

struct mem_cgroup *from = mem_cgroup_from_task(p);

5381

5387

5382

VM_BUG_ON(from == memcg);

5388

VM_BUG_ON(from == memcg);

5383

5389

5384

mm = get_task_mm(p);

5390

mm = get_task_mm(p);

5385

if (!mm)

5391

if (!mm)

5386

return 0;

5392

return 0;

5387

/* We move charges only when we move a owner of the mm */

5393

/* We move charges only when we move a owner of the mm */

5388

if (mm->owner == p) {

5394

if (mm->owner == p) {

5389

VM_BUG_ON(mc.from);

5395

VM_BUG_ON(mc.from);

5390

VM_BUG_ON(mc.to);

5396

VM_BUG_ON(mc.to);

5391

VM_BUG_ON(mc.precharge);

5397

VM_BUG_ON(mc.precharge);

5392

VM_BUG_ON(mc.moved_charge);

5398

VM_BUG_ON(mc.moved_charge);

5393

VM_BUG_ON(mc.moved_swap);

5399

VM_BUG_ON(mc.moved_swap);

5394

mem_cgroup_start_move(from);

5400

mem_cgroup_start_move(from);

5395

spin_lock(&mc.lock);

5401

spin_lock(&mc.lock);

5396

mc.from = from;

5402

mc.from = from;

5397

mc.to = memcg;

5403

mc.to = memcg;

5398

spin_unlock(&mc.lock);

5404

spin_unlock(&mc.lock);

5399

/* We set mc.moving_task later */

5405

/* We set mc.moving_task later */

5400

5406

5401

ret = mem_cgroup_precharge_mc(mm);

5407

ret = mem_cgroup_precharge_mc(mm);

5402

if (ret)

5408

if (ret)

5403

mem_cgroup_clear_mc();

5409

mem_cgroup_clear_mc();

5404

}

5410

}

5405

mmput(mm);

5411

mmput(mm);

5406

}

5412

}

5407

return ret;

5413

return ret;

5408

}

5414

}

5409

5415

5410

/*
 * cancel_attach handler: drop whatever move context exists by calling
 * mem_cgroup_clear_mc(). Neither argument is used.
 */
static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
				     struct cgroup_taskset *tset)
{
	mem_cgroup_clear_mc();
}

5415

5421

5416

/*
 * pmd_entry callback that actually moves charges for [@addr, @end) from
 * mc.from to mc.to, consuming mc.precharge as it goes. walk->private
 * carries the vma being walked.
 *
 * Returns 0 when the range was handled, or the error from
 * mem_cgroup_do_precharge() when the precharges ran out mid-range and one
 * more page could not be charged.
 */
static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	enum mc_target_type type;
	union mc_target target;
	struct page_cgroup *pc;
	struct page *page;
	spinlock_t *ptl;
	pte_t *pte;
	int ret;

	/*
	 * We don't take compound_lock() here but no race with splitting thp
	 * happens because:
	 *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
	 *    under splitting, which means there's no concurrent thp split,
	 *  - if another thread runs into split_huge_page() just after we
	 *    entered this if-block, the thread must wait for page table lock
	 *    to be unlocked in __split_huge_page_splitting(), where the main
	 *    part of thp split is not executed yet.
	 */
	if (pmd_trans_huge_lock(pmd, vma) == 1) {
		/* only attempt the move if a whole huge page is precharged */
		if (mc.precharge >= HPAGE_PMD_NR &&
		    get_mctgt_type_thp(vma, addr, *pmd, &target) ==
							MC_TARGET_PAGE) {
			page = target.page;
			if (!isolate_lru_page(page)) {
				pc = lookup_page_cgroup(page);
				if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
							pc, mc.from, mc.to)) {
					mc.precharge -= HPAGE_PMD_NR;
					mc.moved_charge += HPAGE_PMD_NR;
				}
				putback_lru_page(page);
			}
			/* drop the reference get_mctgt_type_thp() took */
			put_page(page);
		}
		spin_unlock(&vma->vm_mm->page_table_lock);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;

	for (;;) {
		pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
		while (addr != end) {
			pte_t ptent = *pte++;

			/* out of precharges: leave addr where we stopped */
			if (!mc.precharge)
				break;

			type = get_mctgt_type(vma, addr, ptent, &target);
			if (type == MC_TARGET_PAGE) {
				page = target.page;
				if (!isolate_lru_page(page)) {
					pc = lookup_page_cgroup(page);
					if (!mem_cgroup_move_account(page, 1, pc,
							mc.from, mc.to)) {
						mc.precharge--;
						/* we uncharge from mc.from later. */
						mc.moved_charge++;
					}
					putback_lru_page(page);
				}
				/* get_mctgt_type() gets the page */
				put_page(page);
			} else if (type == MC_TARGET_SWAP) {
				if (!mem_cgroup_move_swap_account(target.ent,
							mc.from, mc.to)) {
					mc.precharge--;
					/* we fixup refcnts and charges later. */
					mc.moved_swap++;
				}
			}

			addr += PAGE_SIZE;
		}
		pte_unmap_unlock(pte - 1, ptl);
		cond_resched();

		if (addr == end)
			return 0;

		/*
		 * We have consumed all precharges we got in can_attach().
		 * We try charge one by one, but don't do any additional
		 * charges to mc.to if we have failed in charge once in
		 * attach() phase.
		 */
		ret = mem_cgroup_do_precharge(1);
		if (ret)
			return ret;
	}
}

5518

5524

5519

static void mem_cgroup_move_charge(struct mm_struct *mm)

5525

static void mem_cgroup_move_charge(struct mm_struct *mm)

5520

{

5526

{

5521

struct vm_area_struct *vma;

5527

struct vm_area_struct *vma;

5522

5528

5523

lru_add_drain_all();

5529

lru_add_drain_all();

5524

retry:

5530

retry:

5525

if (unlikely(!down_read_trylock(&mm->mmap_sem))) {

5531

if (unlikely(!down_read_trylock(&mm->mmap_sem))) {

5526

/*

5532

/*

5527

* Someone who are holding the mmap_sem might be waiting in

5533

* Someone who are holding the mmap_sem might be waiting in

5528

* waitq. So we cancel all extra charges, wake up all waiters,

5534

* waitq. So we cancel all extra charges, wake up all waiters,

5529

* and retry. Because we cancel precharges, we might not be able

5535

* and retry. Because we cancel precharges, we might not be able

5530

* to move enough charges, but moving charge is a best-effort

5536

* to move enough charges, but moving charge is a best-effort

5531

* feature anyway, so it wouldn't be a big problem.

5537

* feature anyway, so it wouldn't be a big problem.

5532

*/

5538

*/

5533

__mem_cgroup_clear_mc();

5539

__mem_cgroup_clear_mc();

5534

cond_resched();

5540

cond_resched();

5535

goto retry;

5541

goto retry;

5536

}

5542

}

5537

for (vma = mm->mmap; vma; vma = vma->vm_next) {

5543

for (vma = mm->mmap; vma; vma = vma->vm_next) {

5538

int ret;

5544

int ret;

5539

struct mm_walk mem_cgroup_move_charge_walk = {

5545

struct mm_walk mem_cgroup_move_charge_walk = {

5540

.pmd_entry = mem_cgroup_move_charge_pte_range,

5546

.pmd_entry = mem_cgroup_move_charge_pte_range,

5541

.mm = mm,

5547

.mm = mm,

5542

.private = vma,

5548

.private = vma,

5543

};

5549

};

5544

if (is_vm_hugetlb_page(vma))

5550

if (is_vm_hugetlb_page(vma))

5545

continue;

5551

continue;

5546

ret = walk_page_range(vma->vm_start, vma->vm_end,

5552

ret = walk_page_range(vma->vm_start, vma->vm_end,

5547

&mem_cgroup_move_charge_walk);

5553

&mem_cgroup_move_charge_walk);

5548

if (ret)

5554

if (ret)

5549

/*

5555

/*

5550

* means we have consumed all precharges and failed in

5556

* means we have consumed all precharges and failed in

5551

* doing additional charge. Just abandon here.

5557

* doing additional charge. Just abandon here.

5552

*/

5558

*/

5553

break;

5559

break;

5554

}

5560

}

5555

up_read(&mm->mmap_sem);

5561

up_read(&mm->mmap_sem);

5556

}

5562

}

5557

5563

5558

static void mem_cgroup_move_task(struct cgroup *cont,

5564

static void mem_cgroup_move_task(struct cgroup *cont,

5559

struct cgroup_taskset *tset)

5565

struct cgroup_taskset *tset)

5560

{

5566

{

5561

struct task_struct *p = cgroup_taskset_first(tset);

5567

struct task_struct *p = cgroup_taskset_first(tset);

5562

struct mm_struct *mm = get_task_mm(p);

5568

struct mm_struct *mm = get_task_mm(p);

5563

5569

5564

if (mm) {

5570

if (mm) {

5565

if (mc.to)

5571

if (mc.to)

5566

mem_cgroup_move_charge(mm);

5572

mem_cgroup_move_charge(mm);

5567

mmput(mm);

5573

mmput(mm);

5568

}

5574

}

5569

if (mc.to)

5575

if (mc.to)

5570

mem_cgroup_clear_mc();

5576

mem_cgroup_clear_mc();

5571

}

5577

}

5572

#else /* !CONFIG_MMU */

5578

#else /* !CONFIG_MMU */

5573

static int mem_cgroup_can_attach(struct cgroup *cgroup,

5579

static int mem_cgroup_can_attach(struct cgroup *cgroup,

5574

struct cgroup_taskset *tset)

5580

struct cgroup_taskset *tset)

5575

{

5581

{

5576

return 0;

5582

return 0;

5577

}

5583

}

5578

static void mem_cgroup_cancel_attach(struct cgroup *cgroup,

5584

static void mem_cgroup_cancel_attach(struct cgroup *cgroup,

5579

struct cgroup_taskset *tset)

5585

struct cgroup_taskset *tset)

5580

{

5586

{

5581

}

5587

}

5582

static void mem_cgroup_move_task(struct cgroup *cont,

5588

static void mem_cgroup_move_task(struct cgroup *cont,

5583

struct cgroup_taskset *tset)

5589

struct cgroup_taskset *tset)

5584

{

5590

{

5585

}

5591

}

5586

#endif

5592

#endif

5587

5593

5588

struct cgroup_subsys mem_cgroup_subsys = {

5594

struct cgroup_subsys mem_cgroup_subsys = {

5589

.name = "memory",

5595

.name = "memory",

5590

.subsys_id = mem_cgroup_subsys_id,

5596

.subsys_id = mem_cgroup_subsys_id,

5591

.create = mem_cgroup_create,

5597

.create = mem_cgroup_create,

5592

.pre_destroy = mem_cgroup_pre_destroy,

5598

.pre_destroy = mem_cgroup_pre_destroy,

5593

.destroy = mem_cgroup_destroy,

5599

.destroy = mem_cgroup_destroy,

5594

.can_attach = mem_cgroup_can_attach,

5600

.can_attach = mem_cgroup_can_attach,

5595

.cancel_attach = mem_cgroup_cancel_attach,

5601

.cancel_attach = mem_cgroup_cancel_attach,

5596

.attach = mem_cgroup_move_task,

5602

.attach = mem_cgroup_move_task,

5597

.base_cftypes = mem_cgroup_files,

5603

.base_cftypes = mem_cgroup_files,

5598

.early_init = 0,

5604

.early_init = 0,

5599

.use_id = 1,

5605

.use_id = 1,

5600

.__DEPRECATED_clear_css_refs = true,

5606

.__DEPRECATED_clear_css_refs = true,

5601

};

5607

};

5602

5608

5603

#ifdef CONFIG_MEMCG_SWAP

5609

#ifdef CONFIG_MEMCG_SWAP

5604

static int __init enable_swap_account(char *s)

5610

static int __init enable_swap_account(char *s)

5605

{

5611

{

5606

/* consider enabled if no parameter or 1 is given */

5612

/* consider enabled if no parameter or 1 is given */

5607

if (!strcmp(s, "1"))

5613

if (!strcmp(s, "1"))

5608

really_do_swap_account = 1;

5614

really_do_swap_account = 1;

GITLAB

mm: memcg: only check anon swapin page charges for swap cache

 /* memcontrol.c - Memory Controller
  *
  * Copyright IBM Corporation, 2007
  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
  *
  * Copyright 2007 OpenVZ SWsoft Inc
  * Author: Pavel Emelianov <xemul@openvz.org>
  *
  * Memory thresholds
  * Copyright (C) 2009 Nokia Corporation
  * Author: Kirill A. Shutemov
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
  * (at your option) any later version.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  */
 #include <linux/res_counter.h>
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
 #include <linux/smp.h>
 #include <linux/page-flags.h>
 #include <linux/backing-dev.h>
 #include <linux/bit_spinlock.h>
 #include <linux/rcupdate.h>
 #include <linux/limits.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/spinlock.h>
 #include <linux/eventfd.h>
 #include <linux/sort.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/vmalloc.h>
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
 #include <linux/cpu.h>
 #include <linux/oom.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/tcp_memcontrol.h>
 #include <asm/uaccess.h>
 #include <trace/events/vmscan.h>
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 static struct mem_cgroup *root_mem_cgroup __read_mostly;
 #ifdef CONFIG_MEMCG_SWAP
 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
 int do_swap_account __read_mostly;
 /* Remembers the boot option */
 #ifdef CONFIG_MEMCG_SWAP_ENABLED
 static int really_do_swap_account __initdata = 1;
 #else
 static int really_do_swap_account __initdata = 0;
 #endif
 #else
 #define do_swap_account		0
 #endif
 /*
  * Statistics for memory cgroup.
  *
  * Each enumerator indexes one per-cpu counter in
  * mem_cgroup_stat_cpu.count[]; MEM_CGROUP_STAT_NSTATS is the number of
  * counters and sizes that array.
  */
 enum mem_cgroup_stat_index {
 	/*
 	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
 	 */
 	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
 	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
 	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
 	MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
 	MEM_CGROUP_STAT_NSTATS,	/* number of counters, keep last */
 };
 /*
  * Printable names for the statistics counters. The entries are listed in
  * the same order as enum mem_cgroup_stat_index (presumably so the enum can
  * be used as the index -- keep both in sync).
  */
 static const char * const mem_cgroup_stat_names[] = {
 	"cache",
 	"rss",
 	"mapped_file",
 	"swap",
 };
 /*
  * Event counters for memory cgroup. Each enumerator indexes one per-cpu
  * counter in mem_cgroup_stat_cpu.events[]; MEM_CGROUP_EVENTS_NSTATS sizes
  * that array.
  */
 enum mem_cgroup_events_index {
 	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
 	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
 	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
 	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
 	MEM_CGROUP_EVENTS_NSTATS,	/* number of counters, keep last */
 };
 /*
  * Printable names for the event counters, listed in the same order as
  * enum mem_cgroup_events_index (presumably indexed by it -- keep in sync).
  */
 static const char * const mem_cgroup_events_names[] = {
 	"pgpgin",
 	"pgpgout",
 	"pgfault",
 	"pgmajfault",
 };
 /*
  * The per-memcg event counter is incremented at every pagein/pageout. With
  * THP, it is incremented by the number of pages. This counter is used to
  * trigger some periodic events. This is straightforward and better than
  * using jiffies etc. to handle periodic memcg events.
  *
  * Each enumerator below indexes mem_cgroup_stat_cpu.targets[], which holds
  * the counter value at which the corresponding periodic event next fires.
  */
 enum mem_cgroup_events_target {
 	MEM_CGROUP_TARGET_THRESH,
 	MEM_CGROUP_TARGET_SOFTLIMIT,
 	MEM_CGROUP_TARGET_NUMAINFO,
 	MEM_CGROUP_NTARGETS,
 };
 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
 #define NUMAINFO_EVENTS_TARGET	1024
 /*
  * Per-cpu statistics of one memcg (also used, non-percpu, as
  * mem_cgroup.nocpu_base).
  */
 struct mem_cgroup_stat_cpu {
 	/* page counters, indexed by enum mem_cgroup_stat_index */
 	long count[MEM_CGROUP_STAT_NSTATS];
 	/* event counters, indexed by enum mem_cgroup_events_index */
 	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
 	/* running total of pages charged/uncharged, drives periodic events */
 	unsigned long nr_page_events;
 	/* next nr_page_events value at which each periodic event fires */
 	unsigned long targets[MEM_CGROUP_NTARGETS];
 };
 /*
  * Shared hierarchy-walk cursor used by mem_cgroup_iter() when a reclaim
  * cookie is passed; one exists per zone and priority (see
  * mem_cgroup_per_zone.reclaim_iter[]).
  */
 struct mem_cgroup_reclaim_iter {
 	/* css_id of the last scanned hierarchy member */
 	int position;
 	/* scan generation, increased every round-trip */
 	unsigned int generation;
 };
 /*
  * per-zone information in memory controller.
  */
 struct mem_cgroup_per_zone {
 	struct lruvec		lruvec;
 	/* per-LRU-list sizes, read by mem_cgroup_get_lru_size() */
 	unsigned long		lru_size[NR_LRU_LISTS];
 	/* shared reclaim cursors, indexed by reclaim priority */
 	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 	struct rb_node		tree_node;	/* RB tree node */
 	unsigned long long	usage_in_excess;/* Set to the value by which */
 						/* the soft limit is exceeded*/
 	/* true while tree_node is linked into the soft limit RB tree */
 	bool			on_tree;
 	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
 						/* use container_of	   */
 };
 /* Per-node container of the per-zone memcg state, indexed by zone id. */
 struct mem_cgroup_per_node {
 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
 /*
  * Per-memcg table of per-node state, indexed by node id; see
  * mem_cgroup_zoneinfo() for the lookup.
  */
 struct mem_cgroup_lru_info {
 	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
 };
 /*
  * Cgroups above their limits are maintained in a RB-Tree, independent of
  * their hierarchy representation
  */
 struct mem_cgroup_tree_per_zone {
 	/* tree of mem_cgroup_per_zone.tree_node, ordered by usage_in_excess */
 	struct rb_root rb_root;
 	/* taken around insertions into and removals from rb_root */
 	spinlock_t lock;
 };
 /* Per-node set of soft limit trees, indexed by zone id. */
 struct mem_cgroup_tree_per_node {
 	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
 };
 /* Top-level table of soft limit trees, indexed by node id. */
 struct mem_cgroup_tree {
 	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
 };
 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 /* One registered usage threshold and the eventfd associated with it. */
 struct mem_cgroup_threshold {
 	struct eventfd_ctx *eventfd;
 	u64 threshold;
 };
 /* For threshold: a variable-length array of registered thresholds */
 struct mem_cgroup_threshold_ary {
 	/* An array index points to threshold just below or equal to usage. */
 	int current_threshold;
 	/* Size of entries[] */
 	unsigned int size;
 	/* Array of thresholds */
 	struct mem_cgroup_threshold entries[0];
 };
 /* The threshold arrays of one counter (memory or mem+swap) of a memcg. */
 struct mem_cgroup_thresholds {
 	/* Primary thresholds array */
 	struct mem_cgroup_threshold_ary *primary;
 	/*
 	 * Spare threshold array.
 	 * This is needed to make mem_cgroup_unregister_event() "never fail".
 	 * It must be able to store at least primary->size - 1 entries.
 	 */
 	struct mem_cgroup_threshold_ary *spare;
 };
 /* for OOM: one eventfd listener, linked on a list via @list */
 struct mem_cgroup_eventfd_list {
 	struct list_head list;
 	struct eventfd_ctx *eventfd;
 };
 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 /*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
  * to help the administrator determine what knobs to tune.
  *
  * TODO: Add a water mark for the memory controller. Reclaim will begin when
  * we hit the water mark. May be even add a low water mark, such that
  * no reclaim occurs from a cgroup at its low water mark, this is
  * a feature that will be implemented much later in the future.
  */
 struct mem_cgroup {
 	/* embedded cgroup state; container_of() on it recovers the memcg */
 	struct cgroup_subsys_state css;
 	/*
 	 * the counter to account for memory usage
 	 */
 	struct res_counter res;
 	union {
 		/*
 		 * the counter to account for mem+swap usage.
 		 */
 		struct res_counter memsw;
 		/*
 		 * rcu_freeing is used only when freeing struct mem_cgroup,
 		 * so put it into a union to avoid wasting more memory.
 		 * It must be disjoint from the css field.  It could be
 		 * in a union with the res field, but res plays a much
 		 * larger part in mem_cgroup life than memsw, and might
 		 * be of interest, even at time of free, when debugging.
 		 * So share rcu_head with the less interesting memsw.
 		 */
 		struct rcu_head rcu_freeing;
 		/*
 		 * We also need some space for a worker in deferred freeing.
 		 * By the time we call it, rcu_freeing is no longer in use.
 		 */
 		struct work_struct work_freeing;
 	};
 	/*
 	 * Per cgroup active and inactive list, similar to the
 	 * per zone LRU lists.
 	 */
 	struct mem_cgroup_lru_info info;
 	int last_scanned_node;
 #if MAX_NUMNODES > 1
 	nodemask_t	scan_nodes;
 	/* bumped by memcg_check_events() on MEM_CGROUP_TARGET_NUMAINFO */
 	atomic_t	numainfo_events;
 	atomic_t	numainfo_updating;
 #endif
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
 	bool use_hierarchy;
 	bool		oom_lock;
 	atomic_t	under_oom;
 	/* NOTE(review): presumably managed by mem_cgroup_get()/put() -- confirm */
 	atomic_t	refcnt;
 	int	swappiness;
 	/* OOM-Killer disable */
 	int		oom_kill_disable;
 	/* set when res.limit == memsw.limit */
 	bool		memsw_is_minimum;
 	/* protect arrays of thresholds */
 	struct mutex thresholds_lock;
 	/* thresholds for memory usage. RCU-protected */
 	struct mem_cgroup_thresholds thresholds;
 	/* thresholds for mem+swap usage. RCU-protected */
 	struct mem_cgroup_thresholds memsw_thresholds;
 	/* For oom notifier event fd */
 	struct list_head oom_notify;
 	/*
 	 * Should we move charges of a task when a task is moved into this
 	 * mem_cgroup ? And what type of charges should we move ?
 	 * Bit numbers are taken from enum move_type.
 	 */
 	unsigned long 	move_charge_at_immigrate;
 	/*
 	 * set > 0 if pages under this cgroup are moving to other cgroup.
 	 */
 	atomic_t	moving_account;
 	/* taken only while moving_account > 0 */
 	spinlock_t	move_lock;
 	/*
 	 * percpu counter.
 	 */
 	struct mem_cgroup_stat_cpu __percpu *stat;
 	/*
 	 * used when a cpu is offlined or other synchronizations
 	 * See mem_cgroup_read_stat().
 	 */
 	struct mem_cgroup_stat_cpu nocpu_base;
 	/* taken around reads of nocpu_base in the stat/event readers */
 	spinlock_t pcp_counter_lock;
 #ifdef CONFIG_INET
 	struct tcp_memcontrol tcp_mem;
 #endif
 };
 /* Definitions for moving charges at task migration. */
 /*
  * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
  * bitmap whose bit numbers are these types (see move_anon()/move_file()).
  */
 enum move_type {
 	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
 	MOVE_CHARGE_TYPE_FILE,	/* file page(including tmpfs) and swap of it */
 	NR_MOVE_TYPE,
 };
 /*
  * State of the charge move currently in progress, if any.
  * "mc" and its members are protected by cgroup_mutex
  */
 static struct move_charge_struct {
 	spinlock_t	  lock; /* for from, to */
 	struct mem_cgroup *from;	/* presumably the source memcg -- confirm */
 	/* destination memcg; its move_charge_at_immigrate selects the types */
 	struct mem_cgroup *to;
 	unsigned long precharge;
 	unsigned long moved_charge;
 	unsigned long moved_swap;
 	struct task_struct *moving_task;	/* a task moving charges */
 	wait_queue_head_t waitq;		/* a waitq for other context */
 } mc = {
 	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
 	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 };
 /*
  * Tell whether the destination memcg (mc.to) has the MOVE_CHARGE_TYPE_ANON
  * bit set in move_charge_at_immigrate. Dereferences mc.to, so it must be
  * non-NULL.
  */
 static bool move_anon(void)
 {
 	const unsigned long *types = &mc.to->move_charge_at_immigrate;
 	return test_bit(MOVE_CHARGE_TYPE_ANON, types);
 }
 /*
  * Tell whether the destination memcg (mc.to) has the MOVE_CHARGE_TYPE_FILE
  * bit set in move_charge_at_immigrate. Dereferences mc.to, so it must be
  * non-NULL.
  */
 static bool move_file(void)
 {
 	const unsigned long *types = &mc.to->move_charge_at_immigrate;
 	return test_bit(MOVE_CHARGE_TYPE_FILE, types);
 }
 /*
  * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
  * limit reclaim to prevent infinite loops, if they ever occur.
  */
 #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
 #define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
 /* Kinds of charge; NR_CHARGE_TYPE is the number of kinds. */
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 	MEM_CGROUP_CHARGE_TYPE_ANON,
 	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
 	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
 	NR_CHARGE_TYPE,
 };
 /* for encoding cft->private value on file */
 #define _MEM			(0)
 #define _MEMSWAP		(1)
 #define _OOM_TYPE		(2)
 #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
 #define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
 #define MEMFILE_ATTR(val)	((val) & 0xffff)
 /* Used for OOM notifier */
 #define OOM_CONTROL		(0)
 /*
  * Reclaim flags for mem_cgroup_hierarchical_reclaim
  */
 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
 #define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
 #define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
 #define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
 static void mem_cgroup_get(struct mem_cgroup *memcg);
 static void mem_cgroup_put(struct mem_cgroup *memcg);
 /* Writing them here to avoid exposing memcg's inner layout */
 #ifdef CONFIG_MEMCG_KMEM
 #include <net/sock.h>
 #include <net/ip.h>
 static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
 /*
  * sock_update_memcg - attach a socket to a memcg's protocol accounting
  * @sk: the socket
  *
  * Does nothing unless mem_cgroup_sockets_enabled. If @sk already carries a
  * cg_proto, only an extra reference is taken on that memcg. Otherwise the
  * current task's memcg is looked up and, if it is not the root memcg and
  * its cg_proto is active, a reference is taken and sk->sk_cgrp is set.
  */
 void sock_update_memcg(struct sock *sk)
 {
 	struct mem_cgroup *memcg;
 	struct cg_proto *cg_proto;
 	if (!mem_cgroup_sockets_enabled)
 		return;
 	BUG_ON(!sk->sk_prot->proto_cgroup);
 	/*
 	 * Socket cloning can throw us here with sk_cgrp already
 	 * filled. It won't however, necessarily happen from
 	 * process context. So the test for root memcg given
 	 * the current task's memcg won't help us in this case.
 	 *
 	 * Respecting the original socket's memcg is a better
 	 * decision in this case.
 	 */
 	if (sk->sk_cgrp) {
 		struct mem_cgroup *owner = sk->sk_cgrp->memcg;
 		BUG_ON(mem_cgroup_is_root(owner));
 		mem_cgroup_get(owner);
 		return;
 	}
 	rcu_read_lock();
 	memcg = mem_cgroup_from_task(current);
 	cg_proto = sk->sk_prot->proto_cgroup(memcg);
 	if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
 		mem_cgroup_get(memcg);
 		sk->sk_cgrp = cg_proto;
 	}
 	rcu_read_unlock();
 }
 EXPORT_SYMBOL(sock_update_memcg);
 /*
  * sock_release_memcg - drop the memcg reference held through a socket
  * @sk: the socket
  *
  * No-op unless socket accounting is enabled and @sk has a cg_proto.
  * Warns if that cg_proto has no memcg, then calls mem_cgroup_put() on it.
  */
 void sock_release_memcg(struct sock *sk)
 {
 	struct mem_cgroup *memcg;
 	if (!mem_cgroup_sockets_enabled || !sk->sk_cgrp)
 		return;
 	memcg = sk->sk_cgrp->memcg;
 	WARN_ON(!memcg);
 	mem_cgroup_put(memcg);
 }
 #ifdef CONFIG_INET
 /*
  * Return the TCP cg_proto embedded in @memcg, or NULL when @memcg is NULL
  * or is the root memcg.
  */
 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 {
 	if (memcg && !mem_cgroup_is_root(memcg))
 		return &memcg->tcp_mem.cg_proto;
 	return NULL;
 }
 EXPORT_SYMBOL(tcp_proto_cgroup);
 #endif /* CONFIG_INET */
 #endif /* CONFIG_MEMCG_KMEM */
 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
 /*
  * Decrement the memcg_socket_limit_enabled static key if the TCP cg_proto
  * of @memcg was ever activated; otherwise do nothing.
  */
 static void disarm_sock_keys(struct mem_cgroup *memcg)
 {
 	if (memcg_proto_activated(&memcg->tcp_mem.cg_proto))
 		static_key_slow_dec(&memcg_socket_limit_enabled);
 }
 #else
 /* Stub used when CONFIG_INET or CONFIG_MEMCG_KMEM is off: nothing to do. */
 static void disarm_sock_keys(struct mem_cgroup *memcg)
 {
 }
 #endif
 static void drain_all_stock_async(struct mem_cgroup *memcg);
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
 {
 	return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
 }
 /* Return the cgroup_subsys_state embedded in @memcg. */
 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
 {
 	struct cgroup_subsys_state *css = &memcg->css;
 	return css;
 }
 /* Return the per-zone state of @memcg for the node and zone @page is in. */
 static struct mem_cgroup_per_zone *
 page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 {
 	return mem_cgroup_zoneinfo(memcg, page_to_nid(page),
 				   page_zonenum(page));
 }
 static struct mem_cgroup_tree_per_zone *
 soft_limit_tree_node_zone(int nid, int zid)
 {
 	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
 }
 /* Return the soft limit tree of the node and zone @page belongs to. */
 static struct mem_cgroup_tree_per_zone *
 soft_limit_tree_from_page(struct page *page)
 {
 	return soft_limit_tree_node_zone(page_to_nid(page),
 					 page_zonenum(page));
 }
 /*
  * Link @mz into the soft limit tree @mctz, keyed by @new_usage_in_excess.
  *
  * Nothing happens if @mz is already on the tree. Otherwise
  * mz->usage_in_excess is updated first, and the node is only inserted
  * when the new excess is non-zero. The caller in this file holds
  * mctz->lock around the call.
  */
 static void
 __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
 				struct mem_cgroup_per_zone *mz,
 				struct mem_cgroup_tree_per_zone *mctz,
 				unsigned long long new_usage_in_excess)
 {
 	struct rb_node **link = &mctz->rb_root.rb_node;
 	struct rb_node *parent = NULL;
 	if (mz->on_tree)
 		return;
 	mz->usage_in_excess = new_usage_in_excess;
 	if (!new_usage_in_excess)
 		return;
 	while (*link) {
 		struct mem_cgroup_per_zone *cur;
 		parent = *link;
 		cur = rb_entry(parent, struct mem_cgroup_per_zone, tree_node);
 		/*
 		 * We can't avoid mem cgroups that are over their soft
 		 * limit by the same amount: equal keys descend right.
 		 */
 		link = (mz->usage_in_excess < cur->usage_in_excess) ?
 			&parent->rb_left : &parent->rb_right;
 	}
 	rb_link_node(&mz->tree_node, parent, link);
 	rb_insert_color(&mz->tree_node, &mctz->rb_root);
 	mz->on_tree = true;
 }
 /*
  * Unlink @mz from the soft limit tree @mctz if it is currently on it and
  * clear mz->on_tree. Callers in this file hold mctz->lock.
  */
 static void
 __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
 				struct mem_cgroup_per_zone *mz,
 				struct mem_cgroup_tree_per_zone *mctz)
 {
 	if (mz->on_tree) {
 		rb_erase(&mz->tree_node, &mctz->rb_root);
 		mz->on_tree = false;
 	}
 }
 /*
  * Locked variant of __mem_cgroup_remove_exceeded(): takes mctz->lock,
  * removes @mz from the tree if it is on it, and drops the lock.
  */
 static void
 mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
 				struct mem_cgroup_per_zone *mz,
 				struct mem_cgroup_tree_per_zone *mctz)
 {
 	spin_lock(&mctz->lock);
 	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
 	spin_unlock(&mctz->lock);
 }
 /*
  * Re-position @memcg and each of its ancestors in the soft limit tree of
  * the node/zone that @page belongs to, according to their current soft
  * limit excess. A memcg with no excess that is not on the tree is left
  * untouched.
  */
 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 {
 	int nid = page_to_nid(page);
 	int zid = page_zonenum(page);
 	struct mem_cgroup_tree_per_zone *mctz = soft_limit_tree_from_page(page);
 	/*
 	 * Necessary to update all ancestors when hierarchy is used.
 	 * because their event counter is not touched.
 	 */
 	while (memcg) {
 		struct mem_cgroup_per_zone *mz;
 		unsigned long long excess;
 		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 		excess = res_counter_soft_limit_excess(&memcg->res);
 		/*
 		 * We have to update the tree if mz is on RB-tree or
 		 * mem is over its softlimit.
 		 */
 		if (excess || mz->on_tree) {
 			spin_lock(&mctz->lock);
 			/* if on-tree, remove it */
 			if (mz->on_tree)
 				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
 			/*
 			 * Insert again. mz->usage_in_excess will be updated.
 			 * If excess is 0, no tree ops.
 			 */
 			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
 			spin_unlock(&mctz->lock);
 		}
 		memcg = parent_mem_cgroup(memcg);
 	}
 }
 /*
  * Remove @memcg's per-zone state from the soft limit tree of every node
  * and zone.
  */
 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 {
 	int nid, zid;
 	for_each_node(nid) {
 		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 			struct mem_cgroup_per_zone *mz =
 				mem_cgroup_zoneinfo(memcg, nid, zid);
 			struct mem_cgroup_tree_per_zone *mctz =
 				soft_limit_tree_node_zone(nid, zid);
 			mem_cgroup_remove_exceeded(memcg, mz, mctz);
 		}
 	}
 }
 /*
  * Pop the rightmost (largest excess) entry from @mctz.
  *
  * Entries whose memcg no longer exceeds its soft limit, or whose css
  * reference cannot be taken, are removed from the tree and skipped.
  * Returns NULL when the tree is empty; otherwise the returned entry has
  * been removed from the tree and a css reference is held on its memcg.
  */
 static struct mem_cgroup_per_zone *
 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 {
 	for (;;) {
 		struct mem_cgroup_per_zone *mz;
 		struct rb_node *rightmost = rb_last(&mctz->rb_root);
 		if (!rightmost)
 			return NULL;	/* Nothing to reclaim from */
 		mz = rb_entry(rightmost, struct mem_cgroup_per_zone,
 			      tree_node);
 		/*
 		 * Remove the node now but someone else can add it back,
 		 * we will add it back at the end of reclaim to its correct
 		 * position in the tree.
 		 */
 		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
 		if (res_counter_soft_limit_excess(&mz->memcg->res) &&
 		    css_tryget(&mz->memcg->css))
 			return mz;
 	}
 }
 /*
  * Locked variant of __mem_cgroup_largest_soft_limit_node(): runs it under
  * mctz->lock and returns its result (NULL when the tree is empty).
  */
 static struct mem_cgroup_per_zone *
 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 {
 	struct mem_cgroup_per_zone *mz;
 	spin_lock(&mctz->lock);
 	mz = __mem_cgroup_largest_soft_limit_node(mctz);
 	spin_unlock(&mctz->lock);
 	return mz;
 }
 /*
  * Implementation Note: reading percpu statistics for memcg.
  *
  * Both vmstat[] and percpu_counter have a threshold and do periodic
  * synchronization to implement a "quick" read. There is a trade-off between
  * reading cost and precision of the value. So we may have a chance to
  * implement periodic synchronization of the counters in memcg as well.
  *
  * But this _read() function is used for the user interface now. The user
  * accounts memory usage by memory cgroup and _always_ requires the exact
  * value because he accounts memory. Even if we provided a quick-and-fuzzy
  * read, we would always have to visit all online cpus and make the sum. So,
  * for now, unnecessary synchronization is not implemented (it is only
  * implemented for cpu hotplug).
  *
  * If there are kernel-internal actions which can make use of a not-exact
  * value, and reading all cpu values becomes a performance bottleneck in some
  * common workload, a threshold and synchronization as in vmstat[] should be
  * implemented.
  */
 static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
 				 enum mem_cgroup_stat_index idx)
 {
 	long val = 0;
 	int cpu;
 	get_online_cpus();
 	for_each_online_cpu(cpu)
 		val += per_cpu(memcg->stat->count[idx], cpu);
 #ifdef CONFIG_HOTPLUG_CPU
 	spin_lock(&memcg->pcp_counter_lock);
 	val += memcg->nocpu_base.count[idx];
 	spin_unlock(&memcg->pcp_counter_lock);
 #endif
 	put_online_cpus();
 	return val;
 }
 /*
  * Adjust this cpu's MEM_CGROUP_STAT_SWAP counter of @memcg by one:
  * +1 when @charge is true, -1 otherwise.
  */
 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
 					 bool charge)
 {
 	int delta = -1;
 	if (charge)
 		delta = 1;
 	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], delta);
 }
 /*
  * Sum event counter @idx of @memcg over all online cpus, plus the
  * nocpu_base value when CONFIG_HOTPLUG_CPU is set.
  *
  * The walk is bracketed by get_online_cpus()/put_online_cpus(), exactly
  * as mem_cgroup_read_stat() does, so the set of online cpus cannot change
  * between the per-cpu walk and the nocpu_base read. Without it, a cpu
  * going offline in the middle of the read could be counted twice or
  * missed.
  *
  * NOTE(review): get_online_cpus() may sleep; callers must be in process
  * context, as is already required for mem_cgroup_read_stat().
  */
 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 					    enum mem_cgroup_events_index idx)
 {
 	unsigned long val = 0;
 	int cpu;
 	get_online_cpus();
 	for_each_online_cpu(cpu)
 		val += per_cpu(memcg->stat->events[idx], cpu);
 #ifdef CONFIG_HOTPLUG_CPU
 	spin_lock(&memcg->pcp_counter_lock);
 	val += memcg->nocpu_base.events[idx];
 	spin_unlock(&memcg->pcp_counter_lock);
 #endif
 	put_online_cpus();
 	return val;
 }
 /*
  * Account a charge (@nr_pages > 0) or uncharge (@nr_pages <= 0) in this
  * cpu's counters of @memcg: the RSS or CACHE page counter is adjusted by
  * @nr_pages, one PGPGIN or PGPGOUT event is recorded, and nr_page_events
  * grows by the absolute number of pages. Runs with preemption disabled.
  */
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 					 bool anon, int nr_pages)
 {
 	/*
 	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
 	 * counted as CACHE even if it's on ANON LRU.
 	 */
 	enum mem_cgroup_stat_index stat = anon ? MEM_CGROUP_STAT_RSS :
 						 MEM_CGROUP_STAT_CACHE;
 	preempt_disable();
 	__this_cpu_add(memcg->stat->count[stat], nr_pages);
 	/* pagein of a big page is an event. So, ignore page size */
 	if (nr_pages > 0) {
 		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
 	} else {
 		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
 		nr_pages = -nr_pages; /* for event */
 	}
 	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
 	preempt_enable();
 }
 /*
  * Return the recorded size of list @lru for the memcg per-zone state that
  * embeds @lruvec. @lruvec must be the lruvec member of a
  * struct mem_cgroup_per_zone.
  */
 unsigned long
 mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
 	return container_of(lruvec, struct mem_cgroup_per_zone,
 			    lruvec)->lru_size[lru];
 }
 /*
  * Sum the sizes of the LRU lists selected by @lru_mask (one bit per
  * enum lru_list value) for @memcg in node @nid, zone @zid.
  */
 static unsigned long
 mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
 			unsigned int lru_mask)
 {
 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 	unsigned long nr = 0;
 	enum lru_list lru;
 	for_each_lru(lru) {
 		if (!(BIT(lru) & lru_mask))
 			continue;
 		nr += mz->lru_size[lru];
 	}
 	return nr;
 }
 /*
  * Sum the sizes of the LRU lists selected by @lru_mask for @memcg over
  * every zone of node @nid.
  */
 static unsigned long
 mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 			int nid, unsigned int lru_mask)
 {
 	u64 sum = 0;
 	int zone;
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		sum += mem_cgroup_zone_nr_lru_pages(memcg, nid, zone,
 						    lru_mask);
 	}
 	return sum;
 }
 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
 			unsigned int lru_mask)
 {
 	int nid;
 	u64 total = 0;
 	for_each_node_state(nid, N_HIGH_MEMORY)
 		total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
 	return total;
 }
 /*
  * Check whether this cpu's page event counter of @memcg has passed the
  * next trigger point recorded for @target.
  *
  * Returns false if not. Otherwise the trigger point is advanced by the
  * interval configured for @target (left unchanged for an unknown target),
  * written back, and true is returned. Uses __this_cpu accessors; the
  * caller in this file runs it with preemption disabled.
  */
 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 				       enum mem_cgroup_events_target target)
 {
 	unsigned long now = __this_cpu_read(memcg->stat->nr_page_events);
 	unsigned long due = __this_cpu_read(memcg->stat->targets[target]);
 	/* wrap-safe comparison, from time_after() in jiffies.h */
 	if ((long)due - (long)now >= 0)
 		return false;
 	switch (target) {
 	case MEM_CGROUP_TARGET_THRESH:
 		due = now + THRESHOLDS_EVENTS_TARGET;
 		break;
 	case MEM_CGROUP_TARGET_SOFTLIMIT:
 		due = now + SOFTLIMIT_EVENTS_TARGET;
 		break;
 	case MEM_CGROUP_TARGET_NUMAINFO:
 		due = now + NUMAINFO_EVENTS_TARGET;
 		break;
 	default:
 		break;
 	}
 	__this_cpu_write(memcg->stat->targets[target], due);
 	return true;
 }
 /*
  * Check events in order.
  *
  * The threshold rate limit is tested first; only when it fires are the
  * soft limit and (on multi-node builds) numainfo rate limits tested too.
  * All rate-limit checks run with preemption disabled; the resulting
  * actions run after preemption is re-enabled.
  */
 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 {
 	bool do_softlimit;
 	bool do_numainfo __maybe_unused;
 	preempt_disable();
 	/* threshold event is triggered in finer grain than soft limit */
 	if (likely(!mem_cgroup_event_ratelimit(memcg,
 					MEM_CGROUP_TARGET_THRESH))) {
 		preempt_enable();
 		return;
 	}
 	do_softlimit = mem_cgroup_event_ratelimit(memcg,
 					MEM_CGROUP_TARGET_SOFTLIMIT);
 #if MAX_NUMNODES > 1
 	do_numainfo = mem_cgroup_event_ratelimit(memcg,
 					MEM_CGROUP_TARGET_NUMAINFO);
 #endif
 	preempt_enable();
 	mem_cgroup_threshold(memcg);
 	if (unlikely(do_softlimit))
 		mem_cgroup_update_tree(memcg, page);
 #if MAX_NUMNODES > 1
 	if (unlikely(do_numainfo))
 		atomic_inc(&memcg->numainfo_events);
 #endif
 }
 /*
  * Return the mem_cgroup whose embedded css is the memory subsystem state
  * of cgroup @cont.
  */
 struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 {
 	struct cgroup_subsys_state *css;
 	css = cgroup_subsys_state(cont, mem_cgroup_subsys_id);
 	return container_of(css, struct mem_cgroup, css);
 }
 /*
  * Return the mem_cgroup that task @p belongs to, or NULL when @p is NULL.
  */
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 {
 	struct cgroup_subsys_state *css;
 	/*
 	 * mm_update_next_owner() may clear mm->owner to NULL
 	 * if it races with swapoff, page migration, etc.
 	 * So this can be called with p == NULL.
 	 */
 	if (unlikely(!p))
 		return NULL;
 	css = task_subsys_state(p, mem_cgroup_subsys_id);
 	return container_of(css, struct mem_cgroup, css);
 }
 /*
  * Look up the memcg of @mm's owner and take a css reference on it.
  *
  * Returns NULL when @mm is NULL or the owner has no memcg (e.g. the owner
  * is NULL); otherwise returns the memcg with a css reference held. The
  * lookup is retried under rcu_read_lock() until css_tryget() succeeds.
  */
 struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
 	struct mem_cgroup *memcg;
 	if (!mm)
 		return NULL;
 	/*
 	 * Because we have no locks, mm->owner's may be being moved to other
 	 * cgroup. We use css_tryget() here even if this looks
 	 * pessimistic (rather than adding locks here).
 	 */
 	rcu_read_lock();
 	for (;;) {
 		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
 		if (unlikely(!memcg) || css_tryget(&memcg->css))
 			break;
 	}
 	rcu_read_unlock();
 	return memcg;
 }
 /**
  * mem_cgroup_iter - iterate over memory cgroup hierarchy
  * @root: hierarchy root
  * @prev: previously returned memcg, NULL on first invocation
  * @reclaim: cookie for shared reclaim walks, NULL for full walks
  *
  * Returns references to children of the hierarchy below @root, or
  * @root itself, or %NULL after a full round-trip.
  *
  * Caller must pass the return value in @prev on subsequent
  * invocations for reference counting, or use mem_cgroup_iter_break()
  * to cancel a hierarchy walk before the round-trip is complete.
  *
  * Reclaimers can specify a zone and a priority level in @reclaim to
  * divide up the memcgs in the hierarchy among all concurrent
  * reclaimers operating on the same zone and priority.
  *
  * Also returns %NULL when the memory controller is disabled. A %NULL
  * @root means root_mem_cgroup. No reference is taken on @root itself.
  */
 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
 				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
 	struct mem_cgroup *memcg = NULL;
 	int id = 0;
 	if (mem_cgroup_disabled())
 		return NULL;
 	if (!root)
 		root = root_mem_cgroup;
 	/* full walks resume right after the previously returned member */
 	if (prev && !reclaim)
 		id = css_id(&prev->css);
 	/* drop the reference handed out with @prev (root never carries one) */
 	if (prev && prev != root)
 		css_put(&prev->css);
 	/* without hierarchy, a non-root-memcg @root is the only member */
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
 			return NULL;
 		return root;
 	}
 	/* loop until a member could be pinned or the walk is over */
 	while (!memcg) {
 		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
 		struct cgroup_subsys_state *css;
 		if (reclaim) {
 			int nid = zone_to_nid(reclaim->zone);
 			int zid = zone_idx(reclaim->zone);
 			struct mem_cgroup_per_zone *mz;
 			mz = mem_cgroup_zoneinfo(root, nid, zid);
 			/* shared cursor for this zone and priority */
 			iter = &mz->reclaim_iter[reclaim->priority];
 			/* the shared walk moved on to a new round-trip: stop */
 			if (prev && reclaim->generation != iter->generation)
 				return NULL;
 			id = iter->position;
 		}
 		rcu_read_lock();
 		/*
 		 * presumably returns the next css under @root with an id of
 		 * at least id + 1 and stores the id found in @id -- confirm
 		 * against css_get_next()
 		 */
 		css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
 		if (css) {
 			if (css == &root->css || css_tryget(css))
 				memcg = container_of(css,
 						     struct mem_cgroup, css);
 		} else
 			id = 0;	/* end of hierarchy: restart from the beginning */
 		rcu_read_unlock();
 		if (reclaim) {
 			iter->position = id;
 			if (!css)
 				iter->generation++;
 			else if (!prev && memcg)
 				/* first call of a walk: remember its generation */
 				reclaim->generation = iter->generation;
 		}
 		/* a continued walk ends here; a first call loops again */
 		if (prev && !css)
 			return NULL;
 	}
 	return memcg;
 }
 /**
  * mem_cgroup_iter_break - abort a hierarchy walk prematurely
  * @root: hierarchy root, %NULL meaning root_mem_cgroup
  * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
  *
  * Drops the css reference held on @prev, unless @prev is %NULL or is the
  * hierarchy root.
  */
 void mem_cgroup_iter_break(struct mem_cgroup *root,
 			   struct mem_cgroup *prev)
 {
 	struct mem_cgroup *top = root ? root : root_mem_cgroup;
 	if (!prev || prev == top)
 		return;
 	css_put(&prev->css);
 }
 /*
  * Iteration constructs for visiting all cgroups (under a tree).  If
  * loops are exited prematurely (break), mem_cgroup_iter_break() must
  * be used for reference counting.
  */
 #define for_each_mem_cgroup_tree(iter, root)		\
 	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
 	     iter != NULL;				\
 	     iter = mem_cgroup_iter(root, iter, NULL))
 #define for_each_mem_cgroup(iter)			\
 	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
 	     iter != NULL;				\
 	     iter = mem_cgroup_iter(NULL, iter, NULL))
 /* Tell whether @memcg is the root memory cgroup. */
 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 {
 	return root_mem_cgroup == memcg;
 }
 /*
  * Count a page-fault event against the memcg of @mm's owner task.
  *
  * Only PGFAULT and PGMAJFAULT are accepted; any other @idx hits BUG().
  * Nothing is counted when @mm is NULL or no memcg is found for the owner.
  * The lookup and the per-cpu increment both run under rcu_read_lock().
  */
 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 {
 	struct mem_cgroup *memcg;
 	if (!mm)
 		return;
 	rcu_read_lock();
 	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
 	if (likely(memcg)) {
 		switch (idx) {
 		case PGFAULT:
 			this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
 			break;
 		case PGMAJFAULT:
 			this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
 			break;
 		default:
 			BUG();
 		}
 	}
 	rcu_read_unlock();
 }
 EXPORT_SYMBOL(mem_cgroup_count_vm_event);
 /**
  * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
  * @zone: zone of the wanted lruvec
  * @memcg: memcg of the wanted lruvec
  *
  * Returns the lru list vector holding pages for the given @zone and
  * @mem.  This can be the global zone lruvec, if the memory controller
  * is disabled.
  */
 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
 				      struct mem_cgroup *memcg)
 {
 	struct mem_cgroup_per_zone *mz;
 	if (mem_cgroup_disabled())
 		return &zone->lruvec;
 	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
 	return &mz->lruvec;
 }
 /*
  * Following LRU functions are allowed to be used without PCG_LOCK.
  * Operations are called by routine of global LRU independently from memcg.
  * What we have to take care of here is validness of pc->mem_cgroup.
  *
  * Changes to pc->mem_cgroup happens when
  * 1. charge
  * 2. moving account
  * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
  * It is added to LRU before charge.
  * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
  * When moving account, the page is not on LRU. It's isolated.
  */
 /**
  * mem_cgroup_page_lruvec - return lruvec for adding an lru page
  * @page: the page
  * @zone: zone of the page
  */
 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
 {
 	struct mem_cgroup_per_zone *mz;
 	struct mem_cgroup *memcg;
 	struct page_cgroup *pc;
 	if (mem_cgroup_disabled())
 		return &zone->lruvec;
 	pc = lookup_page_cgroup(page);
 	memcg = pc->mem_cgroup;
 	/*
 	 * Surreptitiously switch any uncharged offlist page to root:
 	 * an uncharged page off lru does nothing to secure
 	 * its former mem_cgroup from sudden removal.
 	 *
 	 * Our caller holds lru_lock, and PageCgroupUsed is updated
 	 * under page_cgroup lock: between them, they make all uses
 	 * of pc->mem_cgroup safe.
 	 */
 	if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
 		pc->mem_cgroup = memcg = root_mem_cgroup;
 	mz = page_cgroup_zoneinfo(memcg, page);
 	return &mz->lruvec;
 }
 /**
  * mem_cgroup_update_lru_size - account for adding or removing an lru page
  * @lruvec: mem_cgroup per zone lru vector
  * @lru: index of lru list the page is sitting on
  * @nr_pages: positive when adding or negative when removing
  *
  * This function must be called when a page is added to or removed from an
  * lru list.  It is a no-op when the memory controller is disabled.
  */
 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 				int nr_pages)
 {
 	struct mem_cgroup_per_zone *mz;
 	if (mem_cgroup_disabled())
 		return;
 	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
 	mz->lru_size[lru] += nr_pages;
 	/* The per-list size must never be driven below zero. */
 	VM_BUG_ON((long)(mz->lru_size[lru]) < 0);
 }
 /*
  * Check whether @memcg is @root_memcg itself or lies in @root_memcg's
  * hierarchy subtree.  A subtree match is only possible when
  * @root_memcg has use_hierarchy set and @memcg is not NULL.
  */
 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
 				  struct mem_cgroup *memcg)
 {
 	if (root_memcg == memcg)
 		return true;
 	if (root_memcg->use_hierarchy && memcg)
 		return css_is_ancestor(&memcg->css, &root_memcg->css);
 	return false;
 }
 /* __mem_cgroup_same_or_subtree() run inside an RCU read-side section. */
 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
 				       struct mem_cgroup *memcg)
 {
 	bool related;
 	rcu_read_lock();
 	related = __mem_cgroup_same_or_subtree(root_memcg, memcg);
 	rcu_read_unlock();
 	return related;
 }
 /*
  * Report whether @task's memcg is @memcg or sits in @memcg's hierarchy
  * subtree.  Returns 0 when no memcg can be found for the task.
  */
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
 {
 	struct task_struct *mm_task = find_lock_task_mm(task);
 	struct mem_cgroup *task_memcg = NULL;
 	int match;
 	if (mm_task) {
 		task_memcg = try_get_mem_cgroup_from_mm(mm_task->mm);
 		task_unlock(mm_task);
 	} else {
 		/*
 		 * All threads may have already detached their mm's, but the oom
 		 * killer still needs to detect if they have already been oom
 		 * killed to prevent needlessly killing additional tasks.
 		 */
 		task_lock(task);
 		task_memcg = mem_cgroup_from_task(task);
 		if (task_memcg)
 			css_get(&task_memcg->css);
 		task_unlock(task);
 	}
 	if (!task_memcg)
 		return 0;
 	/*
 	 * The hierarchy test must be driven by use_hierarchy of "memcg",
 	 * not of the task's own memcg: otherwise a task whose memcg has
 	 * hierarchy enabled and is a child of "memcg" in the *cgroup*
 	 * hierarchy would match even if "memcg" has use_hierarchy disabled.
 	 */
 	match = mem_cgroup_same_or_subtree(memcg, task_memcg);
 	css_put(&task_memcg->css);
 	return match;
 }
 /*
  * Tell whether the inactive anon list of @lruvec is too small relative
  * to the active anon list.  The allowed ratio is int_sqrt(10 * gb),
  * where gb is the combined anon list size in gigabytes, or 1 when that
  * size is below one gigabyte.
  */
 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
 {
 	unsigned long inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
 	unsigned long active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
 	unsigned long gb = (inactive + active) >> (30 - PAGE_SHIFT);
 	unsigned long inactive_ratio = gb ? int_sqrt(10 * gb) : 1;
 	return inactive * inactive_ratio < active;
 }
 /*
  * Tell whether the inactive file list of @lruvec holds fewer pages
  * than the active file list.
  */
 int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
 {
 	unsigned long nr_inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
 	unsigned long nr_active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
 	return nr_active > nr_inactive;
 }
 /*
  * Map a res_counter pointer back to the mem_cgroup that embeds it.
  * @member names the embedding field of struct mem_cgroup (the callers
  * in this file pass "res" or "memsw").
  */
 #define mem_cgroup_from_res_counter(counter, member)	\
 	container_of(counter, struct mem_cgroup, member)
 /**
  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
  * @memcg: the memory cgroup
  *
  * Returns the maximum amount of memory @memcg can be charged with, in
  * pages.  When swap accounting is enabled the memory+swap counter's
  * margin is taken into account as well and the smaller margin wins.
  */
 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
 {
 	unsigned long long margin = res_counter_margin(&memcg->res);
 	if (do_swap_account)
 		margin = min(margin, res_counter_margin(&memcg->memsw));
 	/* The counters' margin is shifted down by PAGE_SHIFT to give pages. */
 	return margin >> PAGE_SHIFT;
 }
 /*
  * Return the swappiness to use for @memcg: the global vm_swappiness
  * for a cgroup without a parent (the root), the memcg's own value
  * otherwise.
  */
 int mem_cgroup_swappiness(struct mem_cgroup *memcg)
 {
 	if (!memcg->css.cgroup->parent)
 		return vm_swappiness;
 	return memcg->swappiness;
 }
 /*
  * memcg->moving_account is used for checking possibility that some thread is
  * calling move_account(). When a thread on CPU-A starts moving pages under
  * a memcg, other threads should check memcg->moving_account under
  * rcu_read_lock(), like this:
  *
  *         CPU-A                                    CPU-B
  *                                              rcu_read_lock()
  *         memcg->moving_account+1              if (memcg->moving_account)
  *                                                   take heavy locks.
  *         synchronize_rcu()                    update something.
  *                                              rcu_read_unlock()
  *         start move here.
  */
 /*
  * For quick checking without looking up memcg: raised in
  * mem_cgroup_start_move() and lowered in mem_cgroup_end_move().
  */
 atomic_t memcg_moving __read_mostly;
 /*
  * Announce that account moving is about to start for @memcg.
  *
  * Both the global memcg_moving count and the per-memcg moving_account
  * count are raised first; synchronize_rcu() then waits for an RCU grace
  * period before the caller goes on to move anything.
  */
 static void mem_cgroup_start_move(struct mem_cgroup *memcg)
 {
 	atomic_inc(&memcg_moving);
 	atomic_inc(&memcg->moving_account);
 	synchronize_rcu();
 }
 /*
  * Undo mem_cgroup_start_move(): lower the global and the per-memcg
  * moving counters.  A NULL @memcg is tolerated and ignored.
  */
 static void mem_cgroup_end_move(struct mem_cgroup *memcg)
 {
 	/*
 	 * mem_cgroup_clear_mc() may call this function with NULL, so the
 	 * NULL check lives here in the callee rather than in the caller.
 	 */
 	if (!memcg)
 		return;
 	atomic_dec(&memcg_moving);
 	atomic_dec(&memcg->moving_account);
 }
 /*
  * 2 routines for checking "mem" is under move_account() or not.
  *
  * mem_cgroup_stolen() -  checking whether a cgroup is mc.from or not. This
  *			  is used for avoiding races in accounting.  If true,
  *			  pc->mem_cgroup may be overwritten.
  *
  * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
  *			  under hierarchy of moving cgroups. This is for
  *			  waiting at high-memory pressure caused by "move".
  */
 /*
  * Tell whether @memcg currently has a positive moving_account count.
  * Must be called with rcu_read_lock() held (asserted with VM_BUG_ON).
  */
 static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
 {
 	int movers;
 	VM_BUG_ON(!rcu_read_lock_held());
 	movers = atomic_read(&memcg->moving_account);
 	return movers > 0;
 }
 /*
  * Tell whether mc.from or mc.to is @memcg itself or lies in @memcg's
  * hierarchy subtree.  Returns false when no move source is set.
  */
 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *src;
 	struct mem_cgroup *dst;
 	bool affected = false;
 	/*
 	 * Unlike task_move routines, we access mc.to, mc.from not under
 	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
 	 */
 	spin_lock(&mc.lock);
 	src = mc.from;
 	dst = mc.to;
 	if (src)
 		affected = mem_cgroup_same_or_subtree(memcg, src) ||
 			   mem_cgroup_same_or_subtree(memcg, dst);
 	spin_unlock(&mc.lock);
 	return affected;
 }
 /*
  * If another task is moving charges and @memcg is affected by that move
  * (see mem_cgroup_under_move()), sleep on mc.waitq.
  *
  * Returns true when the wait path was taken, false otherwise.
  */
 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
 {
 	/* Only wait when a mover exists and it is not the current task. */
 	if (mc.moving_task && current != mc.moving_task) {
 		if (mem_cgroup_under_move(memcg)) {
 			DEFINE_WAIT(wait);
 			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
 			/* moving charge context might have finished. */
 			if (mc.moving_task)
 				schedule();
 			finish_wait(&mc.waitq, &wait);
 			return true;
 		}
 	}
 	return false;
 }
 /*
  * Take this lock when
  * - a code tries to modify page's memcg while it's USED.
  * - a code tries to modify page state accounting in a memcg.
  * see mem_cgroup_stolen(), too.
  */
 /*
  * Acquire @memcg's move_lock with interrupts disabled; the saved
  * interrupt state is stored through @flags.
  */
 static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
 				  unsigned long *flags)
 {
 	spin_lock_irqsave(&memcg->move_lock, *flags);
 }
 /*
  * Release @memcg's move_lock and restore the interrupt state that
  * was saved in @flags when the lock was taken.
  */
 static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
 				unsigned long *flags)
 {
 	spin_unlock_irqrestore(&memcg->move_lock, *flags);
 }
 /**
  * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
  * @memcg: The memory cgroup that went over limit
  * @p: Task that is going to be killed
  *
  * Prints the cgroup path of @p and of @memcg when they can be resolved,
  * followed by the usage, limit and failcnt of the memory and
  * memory+swap counters of @memcg.
  *
  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
  * enabled
  */
 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
 	/*
 	 * Need a buffer in BSS, can't rely on allocations. The code relies
 	 * on the assumption that OOM is serialized for memory controller.
 	 * If this assumption is broken, revisit this code.
 	 */
 	static char memcg_name[PATH_MAX];
 	struct cgroup *victim_cgrp;
 	struct cgroup *limit_cgrp;
 	int err;
 	if (!memcg || !p)
 		return;
 	rcu_read_lock();
 	limit_cgrp = memcg->css.cgroup;
 	victim_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
 	err = cgroup_path(victim_cgrp, memcg_name, PATH_MAX);
 	rcu_read_unlock();
 	/*
 	 * If the path cannot be converted to a useful name, skip the
 	 * names but still print the usage information.
 	 */
 	if (err < 0)
 		goto done;
 	printk(KERN_INFO "Task in %s killed", memcg_name);
 	rcu_read_lock();
 	err = cgroup_path(limit_cgrp, memcg_name, PATH_MAX);
 	rcu_read_unlock();
 	if (err < 0)
 		goto done;
 	/* Continues the line started above, hence KERN_CONT. */
 	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
 done:
 	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
 		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
 		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
 		res_counter_read_u64(&memcg->res, RES_FAILCNT));
 	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
 		"failcnt %llu\n",
 		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
 		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
 		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
 }
 /*
  * This function returns the number of memcg under hierarchy tree. Returns
  * 1(self count) if no children.
  */
 static int mem_cgroup_count_children(struct mem_cgroup *memcg)
 {
 	int num = 0;
 	struct mem_cgroup *iter;
 	for_each_mem_cgroup_tree(iter, memcg)
 		num++;
 	return num;
 }
 /*
  * Return the memory (and swap, if configured) limit for a memcg.
  */
 static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
 {
 	u64 limit;
 	u64 memsw;
 	limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
 	limit += total_swap_pages << PAGE_SHIFT;
 	memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
 	/*
 	 * If memsw is finite and limits the amount of swap space available
 	 * to this memcg, return that limit.
 	 */
 	return min(limit, memsw);
 }
 /*
  * Select and kill a task on behalf of the hierarchy rooted at @memcg.
  *
  * Every memcg in the tree below @memcg is visited, and every task of
  * each visited cgroup is scored.  A task reference is held on the best
  * candidate seen so far and dropped whenever it is replaced.  The final
  * candidate, if any, is passed to oom_kill_process().
  */
 void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 			      int order)
 {
 	struct mem_cgroup *iter;
 	unsigned long chosen_points = 0;
 	unsigned long totalpages;
 	unsigned int points = 0;
 	struct task_struct *chosen = NULL;
 	/*
 	 * If current has a pending SIGKILL, then automatically select it.  The
 	 * goal is to allow it to allocate so that it may quickly exit and free
 	 * its memory.
 	 */
 	if (fatal_signal_pending(current)) {
 		set_thread_flag(TIF_MEMDIE);
 		return;
 	}
 	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
 	/* Limit in pages, forced to at least 1 so it is never zero. */
 	totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
 	for_each_mem_cgroup_tree(iter, memcg) {
 		struct cgroup *cgroup = iter->css.cgroup;
 		struct cgroup_iter it;
 		struct task_struct *task;
 		cgroup_iter_start(cgroup, &it);
 		while ((task = cgroup_iter_next(cgroup, &it))) {
 			switch (oom_scan_process_thread(task, totalpages, NULL,
 							false)) {
 			case OOM_SCAN_SELECT:
 				/*
 				 * Forced selection: ULONG_MAX points means no
 				 * later task can score strictly higher.
 				 */
 				if (chosen)
 					put_task_struct(chosen);
 				chosen = task;
 				chosen_points = ULONG_MAX;
 				get_task_struct(chosen);
 				/* fall through */
 			case OOM_SCAN_CONTINUE:
 				continue;
 			case OOM_SCAN_ABORT:
 				/*
 				 * Leave both walks early: end the task
 				 * iterator, drop the hierarchy iterator's
 				 * reference and the candidate's reference.
 				 */
 				cgroup_iter_end(cgroup, &it);
 				mem_cgroup_iter_break(memcg, iter);
 				if (chosen)
 					put_task_struct(chosen);
 				return;
 			case OOM_SCAN_OK:
 				break;
 			};
 			points = oom_badness(task, memcg, NULL, totalpages);
 			if (points > chosen_points) {
 				if (chosen)
 					put_task_struct(chosen);
 				chosen = task;
 				chosen_points = points;
 				get_task_struct(chosen);
 			}
 		}
 		cgroup_iter_end(cgroup, &it);
 	}
 	if (!chosen)
 		return;
 	/*
 	 * NOTE(review): chosen_points can be ULONG_MAX here (OOM_SCAN_SELECT),
 	 * in which case the multiplication wraps and the result is further
 	 * narrowed to unsigned int - confirm this is acceptable for the value
 	 * handed to oom_kill_process().
 	 */
 	points = chosen_points * 1000 / totalpages;
 	oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
 			 NULL, "Memory cgroup out of memory");
 }
 /*
  * Reclaim pages from @memcg, making up to MEM_CGROUP_MAX_RECLAIM_LOOPS
  * attempts.  Swap is avoided when MEM_CGROUP_RECLAIM_NOSWAP is set in
  * @flags, or when this is not a limit shrink and memsw_is_minimum is set.
  *
  * Returns the accumulated result of try_to_free_mem_cgroup_pages().
  */
 static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 					gfp_t gfp_mask,
 					unsigned long flags)
 {
 	const bool shrinking = flags & MEM_CGROUP_RECLAIM_SHRINK;
 	bool noswap = (flags & MEM_CGROUP_RECLAIM_NOSWAP) ||
 		      (!shrinking && memcg->memsw_is_minimum);
 	unsigned long reclaimed = 0;
 	int pass;
 	for (pass = 0; pass < MEM_CGROUP_MAX_RECLAIM_LOOPS; pass++) {
 		/* From the second attempt on, also flush cached charges. */
 		if (pass)
 			drain_all_stock_async(memcg);
 		reclaimed += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
 		/*
 		 * Allow limit shrinkers, which are triggered directly
 		 * by userspace, to catch signals and stop reclaim
 		 * after minimal progress, regardless of the margin.
 		 */
 		if (reclaimed && shrinking)
 			break;
 		if (mem_cgroup_margin(memcg))
 			break;
 		/*
 		 * If nothing was reclaimed after two attempts, there
 		 * may be no reclaimable pages in this hierarchy.
 		 */
 		if (pass && !reclaimed)
 			break;
 	}
 	return reclaimed;
 }
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
  * @nid: the node ID to be checked.
  * @noswap : specify true here if the user wants file only information.
  *
  * This function returns whether the specified memcg contains any
  * reclaimable pages on a node. Returns true if there are any reclaimable
  * pages in the node.  Anon pages only count when @noswap is false and
  * total_swap_pages is non-zero.
  */
 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 		int nid, bool noswap)
 {
 	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
 		return true;
 	if (noswap || !total_swap_pages)
 		return false;
 	return mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON) != 0;
 }
 #if MAX_NUMNODES > 1
 /*
  * Always updating the nodemask is not very good - even if we have an empty
  * list or the wrong list here, we can start from some node and traverse all
  * nodes based on the zonelist. So update the list loosely once per 10 secs.
  *
  * The update is skipped when numainfo_events is zero, and also when
  * another updater has already raised numainfo_updating.
  */
 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
 {
 	int nid;
 	/*
 	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
 	 * pagein/pageout changes since the last update.
 	 */
 	if (!atomic_read(&memcg->numainfo_events))
 		return;
 	/* Only the caller that raises the count from 0 to 1 does the update. */
 	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
 		return;
 	/* make a nodemask where this memcg uses memory from */
 	memcg->scan_nodes = node_states[N_HIGH_MEMORY];
 	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
 		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
 			node_clear(nid, memcg->scan_nodes);
 	}
 	/* Reset both counters so a later call can update again. */
 	atomic_set(&memcg->numainfo_events, 0);
 	atomic_set(&memcg->numainfo_updating, 0);
 }
 /*
  * Select the node where reclaim starts. Because all we need is to
  * reduce the usage counter, starting from anywhere is OK. Reclaiming
  * from the current node has pros and cons.
  *
  * Freeing memory from the current node means freeing memory from a node
  * which we'll use or we've used. So, it may make LRU bad. And if several
  * threads hit limits, they will contend on one node. But freeing from a
  * remote node means more cost for memory reclaim because of memory latency.
  *
  * Now, we use round-robin. Better algorithm is welcomed.
  */
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
 	int node;
 	mem_cgroup_may_update_nodemask(memcg);
 	/* Advance past the last scanned node, wrapping to the first one. */
 	node = next_node(memcg->last_scanned_node, memcg->scan_nodes);
 	if (node == MAX_NUMNODES)
 		node = first_node(memcg->scan_nodes);
 	/*
 	 * We call this when we hit limit, not when pages are added to LRU.
 	 * No LRU may hold pages because all pages are UNEVICTABLE or
 	 * memcg is too small and all pages are not on LRU. In that case,
 	 * we use the current node.
 	 */
 	if (unlikely(node == MAX_NUMNODES))
 		node = numa_node_id();
 	memcg->last_scanned_node = node;
 	return node;
 }
 /*
  * Check whether any node contains reclaimable pages.
  * For a quick scan we use scan_nodes, which lets us skip unused
  * nodes. But scan_nodes is lazily updated and may not contain
  * enough new information, so the remaining nodes are checked too.
  */
 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 {
 	int nid;
 	/* Quick pass: only the nodes recorded in scan_nodes. */
 	if (!nodes_empty(memcg->scan_nodes)) {
 		for_each_node_mask(nid, memcg->scan_nodes) {
 			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
 				return true;
 		}
 	}
 	/* Second pass: every N_HIGH_MEMORY node not covered above. */
 	for_each_node_state(nid, N_HIGH_MEMORY) {
 		if (!node_isset(nid, memcg->scan_nodes) &&
 		    test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
 			return true;
 	}
 	return false;
 }
 #else
 /* Variant for MAX_NUMNODES <= 1: the victim node is always node 0. */
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
 	return 0;
 }
 /* Variant for MAX_NUMNODES <= 1: only node 0 needs to be checked. */
 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 {
 	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
 }
 #endif
 /*
  * Reclaim from the hierarchy under @root_memcg in @zone until the soft
  * limit excess of @root_memcg is gone or the walk gives up.
  *
  * Victims are taken one at a time from mem_cgroup_iter() using a reclaim
  * cookie for @zone at priority 0.  The number of pages scanned is added
  * to *@total_scanned.  Returns the accumulated result of
  * mem_cgroup_shrink_node_zone().
  */
 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
 				   struct zone *zone,
 				   gfp_t gfp_mask,
 				   unsigned long *total_scanned)
 {
 	struct mem_cgroup *victim = NULL;
 	int total = 0;
 	int loop = 0;
 	unsigned long excess;
 	unsigned long nr_scanned;
 	struct mem_cgroup_reclaim_cookie reclaim = {
 		.zone = zone,
 		.priority = 0,
 	};
 	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
 	while (1) {
 		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
 		/* A NULL victim marks the end of one pass over the hierarchy. */
 		if (!victim) {
 			loop++;
 			if (loop >= 2) {
 				/*
 				 * If we have not been able to reclaim
 				 * anything, it might because there are
 				 * no reclaimable pages under this hierarchy
 				 */
 				if (!total)
 					break;
 				/*
 				 * We want to do more targeted reclaim.
 				 * excess >> 2 is not to excessive so as to
 				 * reclaim too much, nor too less that we keep
 				 * coming back to reclaim from this cgroup
 				 */
 				if (total >= (excess >> 2) ||
 					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
 					break;
 			}
 			continue;
 		}
 		if (!mem_cgroup_reclaimable(victim, false))
 			continue;
 		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
 						     zone, &nr_scanned);
 		*total_scanned += nr_scanned;
 		/* Stop as soon as the root is back under its soft limit. */
 		if (!res_counter_soft_limit_excess(&root_memcg->res))
 			break;
 	}
 	/* Drop the reference on the last victim, if one is still held. */
 	mem_cgroup_iter_break(root_memcg, victim);
 	return total;
 }
 /*
  * Check OOM-Killer is already running under our hierarchy.
  * If someone is running, return false.
  * Has to be called with memcg_oom_lock
  *
  * On success every memcg in the hierarchy under @memcg has oom_lock
  * set.  On failure the oom_lock flags set by this call are cleared
  * again before returning.
  */
 static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter, *failed = NULL;
 	for_each_mem_cgroup_tree(iter, memcg) {
 		if (iter->oom_lock) {
 			/*
 			 * this subtree of our hierarchy is already locked
 			 * so we cannot give a lock.
 			 */
 			failed = iter;
 			mem_cgroup_iter_break(memcg, iter);
 			break;
 		} else
 			iter->oom_lock = true;
 	}
 	if (!failed)
 		return true;
 	/*
 	 * OK, we failed to lock the whole subtree so we have to clean up
 	 * what we set up to the failing subtree
 	 */
 	for_each_mem_cgroup_tree(iter, memcg) {
 		/* Stop at the memcg that was already locked by someone else. */
 		if (iter == failed) {
 			mem_cgroup_iter_break(memcg, iter);
 			break;
 		}
 		iter->oom_lock = false;
 	}
 	return false;
 }
 /*
  * Has to be called with memcg_oom_lock
  */
 static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;
 	for_each_mem_cgroup_tree(iter, memcg)
 		iter->oom_lock = false;
 	return 0;
 }
 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;
 	for_each_mem_cgroup_tree(iter, memcg)
 		atomic_inc(&iter->under_oom);
 }
 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;
 	/*
 	 * When a new child is created while the hierarchy is under oom,
 	 * mem_cgroup_oom_lock() may not be called. We have to use
 	 * atomic_add_unless() here.
 	 */
 	for_each_mem_cgroup_tree(iter, memcg)
 		atomic_add_unless(&iter->under_oom, -1, 0);
 }
 /* Spinlock held around mem_cgroup_oom_lock() and mem_cgroup_oom_unlock(). */
 static DEFINE_SPINLOCK(memcg_oom_lock);
 /* Wait queue for tasks sleeping in mem_cgroup_handle_oom(). */
 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
 /*
  * Wait-queue entry used on memcg_oom_waitq, tagged with the memcg the
  * waiter is sleeping on so that wakeups can be filtered per hierarchy.
  */
 struct oom_wait_info {
 	struct mem_cgroup *memcg;	/* memcg the waiter sleeps on */
 	wait_queue_t	wait;		/* entry queued on memcg_oom_waitq */
 };
 /*
  * Wake callback for entries on memcg_oom_waitq.  @arg is the memcg
  * the wakeup is issued for.  The waiter is only woken when its memcg
  * and the waking memcg are the same or one lies in the other's
  * hierarchy subtree; otherwise 0 is returned and the waiter stays
  * asleep.
  */
 static int memcg_oom_wake_function(wait_queue_t *wait,
 	unsigned mode, int sync, void *arg)
 {
 	struct oom_wait_info *info = container_of(wait, struct oom_wait_info, wait);
 	struct mem_cgroup *waiter_memcg = info->memcg;
 	struct mem_cgroup *waker_memcg = (struct mem_cgroup *)arg;
 	if (mem_cgroup_same_or_subtree(waiter_memcg, waker_memcg) ||
 	    mem_cgroup_same_or_subtree(waker_memcg, waiter_memcg))
 		return autoremove_wake_function(wait, mode, sync, arg);
 	return 0;
 }
 /*
  * Wake up waiters on memcg_oom_waitq.  @memcg is handed to the wake
  * callback (memcg_oom_wake_function()) so it can decide per waiter.
  */
 static void memcg_wakeup_oom(struct mem_cgroup *memcg)
 {
 	/* for filtering, pass "memcg" as argument. */
 	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
 }
 /*
  * Wake OOM waiters of @memcg, but only if @memcg is non-NULL and its
  * under_oom count is non-zero.
  */
 static void memcg_oom_recover(struct mem_cgroup *memcg)
 {
 	if (!memcg)
 		return;
 	if (!atomic_read(&memcg->under_oom))
 		return;
 	memcg_wakeup_oom(memcg);
 }
 /*
  * try to call OOM killer. returns false if we should exit memory-reclaim loop.
  *
  * The hierarchy under @memcg is marked under_oom for the duration of the
  * call.  If the OOM lock is obtained and oom_kill_disable is not set,
  * the OOM killer is invoked; otherwise the caller sleeps on
  * memcg_oom_waitq until woken.
  */
 static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
 				  int order)
 {
 	struct oom_wait_info owait;
 	bool locked, need_to_kill;
 	/* Build the wait entry by hand so it uses the filtering callback. */
 	owait.memcg = memcg;
 	owait.wait.flags = 0;
 	owait.wait.func = memcg_oom_wake_function;
 	owait.wait.private = current;
 	INIT_LIST_HEAD(&owait.wait.task_list);
 	need_to_kill = true;
 	mem_cgroup_mark_under_oom(memcg);
 	/* At first, try to OOM lock hierarchy under memcg.*/
 	spin_lock(&memcg_oom_lock);
 	locked = mem_cgroup_oom_lock(memcg);
 	/*
 	 * Even if signal_pending(), we can't quit charge() loop without
 	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
 	 * under OOM is always welcomed, use TASK_KILLABLE here.
 	 */
 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
 	if (!locked || memcg->oom_kill_disable)
 		need_to_kill = false;
 	if (locked)
 		mem_cgroup_oom_notify(memcg);
 	spin_unlock(&memcg_oom_lock);
 	if (need_to_kill) {
 		/* We are the killer: leave the wait queue before killing. */
 		finish_wait(&memcg_oom_waitq, &owait.wait);
 		mem_cgroup_out_of_memory(memcg, mask, order);
 	} else {
 		schedule();
 		finish_wait(&memcg_oom_waitq, &owait.wait);
 	}
 	spin_lock(&memcg_oom_lock);
 	if (locked)
 		mem_cgroup_oom_unlock(memcg);
 	memcg_wakeup_oom(memcg);
 	spin_unlock(&memcg_oom_lock);
 	mem_cgroup_unmark_under_oom(memcg);
 	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
 		return false;
 	/* Give chance to dying process */
 	schedule_timeout_uninterruptible(1);
 	return true;
 }
 /*
  * Currently used to update mapped file statistics, but the routine can be
  * generalized to update other statistics as well.
  *
  * Notes: Race condition
  *
  * We usually use page_cgroup_lock() for accessing page_cgroup member but
  * it tends to be costly. But considering some conditions, we don't need
  * to do so _always_.
  *
  * Considering "charge", lock_page_cgroup() is not required because all
  * file-stat operations happen after a page is attached to radix-tree. There
  * are no race with "charge".
  *
  * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
  * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
  * if there are race with "uncharge". Statistics itself is properly handled
  * by flags.
  *
  * Considering "move", this is the only case where we see a race. To make the
  * race small, we check memcg->moving_account to detect whether a race is
  * possible. If it is, we take a lock.
  */
 /*
  * Prepare for updating a page statistic of @page's memcg.
  *
  * If the page's memcg is currently being moved (mem_cgroup_stolen()),
  * the memcg's move_lock is taken, the saved interrupt state is stored
  * through @flags and *@locked is set to true.  In every other case the
  * function returns without touching *@locked.
  */
 void __mem_cgroup_begin_update_page_stat(struct page *page,
 				bool *locked, unsigned long *flags)
 {
 	struct mem_cgroup *memcg;
 	struct page_cgroup *pc;
 	pc = lookup_page_cgroup(page);
 again:
 	memcg = pc->mem_cgroup;
 	if (unlikely(!memcg || !PageCgroupUsed(pc)))
 		return;
 	/*
 	 * If this memory cgroup is not under account moving, we don't
 	 * need to take move_lock_mem_cgroup(). Because we already hold
 	 * rcu_read_lock(), any calls to move_account will be delayed until
 	 * rcu_read_unlock() if mem_cgroup_stolen() == true.
 	 */
 	if (!mem_cgroup_stolen(memcg))
 		return;
 	move_lock_mem_cgroup(memcg, flags);
 	/*
 	 * Recheck under the lock: if the page changed memcg or is no longer
 	 * in use, drop the lock and start over with the current owner.
 	 */
 	if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
 		move_unlock_mem_cgroup(memcg, flags);
 		goto again;
 	}
 	*locked = true;
 }
 /*
  * Release the move_lock of @page's memcg and restore the interrupt
  * state saved in @flags.
  */
 void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
 {
 	struct page_cgroup *pc = lookup_page_cgroup(page);
 	/*
 	 * It's guaranteed that pc->mem_cgroup never changes while
 	 * lock is held because a routine modifies pc->mem_cgroup
 	 * should take move_lock_mem_cgroup().
 	 */
 	move_unlock_mem_cgroup(pc->mem_cgroup, flags);
 }
 /*
  * Add @val to a per-cpu statistic of the memcg that @page belongs to.
  *
  * Only MEMCG_NR_FILE_MAPPED is accepted for @idx; any other value hits
  * BUG().  Nothing is done when the memory controller is disabled, or
  * when the page has no memcg or its page_cgroup is not in use.
  */
 void mem_cgroup_update_page_stat(struct page *page,
 				 enum mem_cgroup_page_stat_item idx, int val)
 {
 	struct mem_cgroup *memcg;
 	struct page_cgroup *pc;
 	if (mem_cgroup_disabled())
 		return;
 	/* Look the page_cgroup up only once we know the controller is on. */
 	pc = lookup_page_cgroup(page);
 	memcg = pc->mem_cgroup;
 	if (unlikely(!memcg || !PageCgroupUsed(pc)))
 		return;
 	/* Translate the public item into the internal counter index. */
 	switch (idx) {
 	case MEMCG_NR_FILE_MAPPED:
 		idx = MEM_CGROUP_STAT_FILE_MAPPED;
 		break;
 	default:
 		BUG();
 	}
 	this_cpu_add(memcg->stat->count[idx], val);
 }
 /*
  * Size, in pages, of the first charge trial. "32" comes from vmscan.c's
  * magic value.
  * TODO: larger values may be needed on very large machines.
  */
 #define CHARGE_BATCH	32U
 /* Per-cpu cache of pages already charged to a memcg's res_counter(s). */
 struct memcg_stock_pcp {
 	struct mem_cgroup *cached; /* owner of the stock; never the root cgroup */
 	unsigned int nr_pages;	/* number of pre-charged pages in the stock */
 	struct work_struct work;	/* work item used to drain this stock */
 	unsigned long flags;
 /* Bit in ->flags: set while a drain of this stock is pending. */
 #define FLUSHING_CACHED_CHARGE	0
 };
 /* One charge stock per cpu. */
 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
 /* Taken by drain_all_stock_async() and drain_all_stock_sync(). */
 static DEFINE_MUTEX(percpu_charge_mutex);
 /*
  * Try to take one page from this cpu's stock.  Succeeds, consuming one
  * page and returning true, only when the stock belongs to @memcg and is
  * not empty.  Otherwise returns false and the caller has to charge the
  * res_counter itself; the stock will be refilled later.
  */
 static bool consume_stock(struct mem_cgroup *memcg)
 {
 	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
 	bool hit = (memcg == stock->cached && stock->nr_pages);
 	if (hit)
 		stock->nr_pages--;
 	put_cpu_var(memcg_stock);
 	return hit;
 }
 /*
  * Give any pages cached in @stock back to the res_counter(s) of the
  * memcg they were charged to, then forget that memcg.
  */
 static void drain_stock(struct memcg_stock_pcp *stock)
 {
 	struct mem_cgroup *owner = stock->cached;
 	unsigned long bytes;
 	if (stock->nr_pages) {
 		bytes = stock->nr_pages * PAGE_SIZE;
 		res_counter_uncharge(&owner->res, bytes);
 		/* The memsw counter is only involved with swap accounting. */
 		if (do_swap_account)
 			res_counter_uncharge(&owner->memsw, bytes);
 		stock->nr_pages = 0;
 	}
 	stock->cached = NULL;
 }
 /*
  * Drain the stock of the cpu this runs on and clear its flushing bit.
  * The caller must have preemption disabled or be a thread pinned to
  * the local cpu.  @dummy is unused.
  */
 static void drain_local_stock(struct work_struct *dummy)
 {
 	struct memcg_stock_pcp *local = &__get_cpu_var(memcg_stock);
 	drain_stock(local);
 	clear_bit(FLUSHING_CACHED_CHARGE, &local->flags);
 }
 /*
  * Put @nr_pages already charged to @memcg into this cpu's stock, to be
  * used up later by consume_stock().  A stock that currently belongs to
  * a different memcg is drained first.
  */
 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
 	struct memcg_stock_pcp *pcp = &get_cpu_var(memcg_stock);
 	if (pcp->cached != memcg) {
 		/* Different owner: return its pages and take the stock over. */
 		drain_stock(pcp);
 		pcp->cached = memcg;
 	}
 	pcp->nr_pages += nr_pages;
 	put_cpu_var(memcg_stock);
 }
 /*
  * Drain the per-cpu charge stocks that belong to @root_memcg or to a
  * memcg in the hierarchy under it.  With @sync set, block until the
  * scheduled drain work has finished.
  */
 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
 {
 	int cpu, curcpu;
 	/* Notify other cpus that system-wide "drain" is running */
 	get_online_cpus();
 	curcpu = get_cpu();
 	for_each_online_cpu(cpu) {
 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
 		struct mem_cgroup *memcg = stock->cached;
 		if (!memcg || !stock->nr_pages)
 			continue;
 		if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
 			continue;
 		/* Skip stocks that already have a drain pending. */
 		if (test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
 			continue;
 		if (cpu == curcpu)
 			drain_local_stock(&stock->work);
 		else
 			schedule_work_on(cpu, &stock->work);
 	}
 	put_cpu();
 	if (sync) {
 		for_each_online_cpu(cpu) {
 			struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
 			if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
 				flush_work(&stock->work);
 		}
 	}
 	put_online_cpus();
 }
 /*
  * Try to drain stocked charges on other cpus.  This function is
  * asynchronous: it only queues per-cpu work that drains locally on each
  * cpu.  The caller can expect some charges to come back to the
  * res_counter later but cannot wait for them.
  */
 static void drain_all_stock_async(struct mem_cgroup *root_memcg)
 {
 	/*
 	 * If someone else is already draining, avoid adding more
 	 * kworker runs and simply do nothing.
 	 */
 	if (mutex_trylock(&percpu_charge_mutex)) {
 		drain_all_stock(root_memcg, false);
 		mutex_unlock(&percpu_charge_mutex);
 	}
 }
 /*
  * This is a synchronous drain interface: it takes percpu_charge_mutex
  * and calls drain_all_stock() with sync set.
  */
 static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
 {
 	/* called when force_empty is called */
 	mutex_lock(&percpu_charge_mutex);
 	drain_all_stock(root_memcg, true);
 	mutex_unlock(&percpu_charge_mutex);
 }
 /*
  * Fold the per-cpu statistics and event counters that @memcg keeps for
  * the DEAD cpu @cpu into memcg->nocpu_base and zero the per-cpu slots.
  * Runs under pcp_counter_lock.  Note that this function can be preempted.
  */
 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
 {
 	int idx;
 	spin_lock(&memcg->pcp_counter_lock);
 	for (idx = 0; idx < MEM_CGROUP_STAT_NSTATS; idx++) {
 		long leftover = per_cpu(memcg->stat->count[idx], cpu);
 		per_cpu(memcg->stat->count[idx], cpu) = 0;
 		memcg->nocpu_base.count[idx] += leftover;
 	}
 	for (idx = 0; idx < MEM_CGROUP_EVENTS_NSTATS; idx++) {
 		unsigned long leftover = per_cpu(memcg->stat->events[idx], cpu);
 		per_cpu(memcg->stat->events[idx], cpu) = 0;
 		memcg->nocpu_base.events[idx] += leftover;
 	}
 	spin_unlock(&memcg->pcp_counter_lock);
 }
 /*
  * CPU hotplug notifier.  Only CPU_DEAD and CPU_DEAD_FROZEN are acted upon:
  * the per-cpu counters of every memcg are drained for the dead cpu and that
  * cpu's charge stock is drained.  Every action returns NOTIFY_OK.
  */
 static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
 					unsigned long action,
 					void *hcpu)
 {
 	int cpu = (unsigned long)hcpu;
 	struct mem_cgroup *iter;
 	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
 		for_each_mem_cgroup(iter)
 			mem_cgroup_drain_pcp_counter(iter, cpu);
 		drain_stock(&per_cpu(memcg_stock, cpu));
 	}
 	return NOTIFY_OK;
 }
 /*
  * Result codes returned by mem_cgroup_do_charge().
  * See __mem_cgroup_try_charge() for how each one is handled.
  */
 enum {
 	CHARGE_OK,		/* success */
 	CHARGE_RETRY,		/* need to retry but retry is not bad */
 	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
 	CHARGE_WOULDBLOCK,	/* __GFP_WAIT wasn't set and not enough res. */
 	CHARGE_OOM_DIE,		/* the current is killed because of OOM */
 };
 /*
  * Make one attempt to charge @nr_pages pages to @memcg and tell the caller
  * what to do next through one of the CHARGE_* codes.
  *
  * The charge goes to memcg->res and, when do_swap_account is set, also to
  * memcg->memsw; if the memsw charge fails the res charge is undone.  On
  * failure, the group that is over its limit is derived from the failing
  * res_counter and may be reclaimed from before the result is decided.
  *
  * @gfp_mask:  reclaim is attempted only when __GFP_WAIT is set.
  * @oom_check: when false, CHARGE_NOMEM is returned instead of calling
  *             mem_cgroup_handle_oom().
  */
 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				unsigned int nr_pages, bool oom_check)
 {
 	unsigned long csize = nr_pages * PAGE_SIZE;
 	struct mem_cgroup *mem_over_limit;
 	struct res_counter *fail_res;
 	unsigned long flags = 0;
 	int ret;
 	ret = res_counter_charge(&memcg->res, csize, &fail_res);
 	if (likely(!ret)) {
 		if (!do_swap_account)
 			return CHARGE_OK;
 		ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
 		if (likely(!ret))
 			return CHARGE_OK;
 		/* memsw is over limit: back out the res charge taken above */
 		res_counter_uncharge(&memcg->res, csize);
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
 		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 	} else
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
 	/*
 	 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
 	 * of regular pages (CHARGE_BATCH), or a single regular page (1).
 	 *
 	 * Never reclaim on behalf of optional batching, retry with a
 	 * single page instead.
 	 */
 	if (nr_pages == CHARGE_BATCH)
 		return CHARGE_RETRY;
 	if (!(gfp_mask & __GFP_WAIT))
 		return CHARGE_WOULDBLOCK;
 	ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
 	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
 		return CHARGE_RETRY;
 	/*
 	 * Even though the limit is exceeded at this point, reclaim
 	 * may have been able to free some pages.  Retry the charge
 	 * before killing the task.
 	 *
 	 * Only for regular pages, though: huge pages are rather
 	 * unlikely to succeed so close to the limit, and we fall back
 	 * to regular pages anyway in case of failure.
 	 */
 	if (nr_pages == 1 && ret)
 		return CHARGE_RETRY;
 	/*
 	 * At task move, charge accounts can be doubly counted. So, it's
 	 * better to wait until the end of task_move if something is going on.
 	 */
 	if (mem_cgroup_wait_acct_move(mem_over_limit))
 		return CHARGE_RETRY;
 	/* If we don't need to call oom-killer at all, return immediately */
 	if (!oom_check)
 		return CHARGE_NOMEM;
 	/* check OOM */
 	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
 		return CHARGE_OOM_DIE;
 	return CHARGE_RETRY;
 }
 /*
  * __mem_cgroup_try_charge() does
  * 1. detect memcg to be charged against from passed *mm and *ptr,
  * 2. update res_counter
  * 3. call memory reclaim if necessary.
  *
  * In some special case, if the task is fatal, fatal_signal_pending() or
  * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup
  * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon
  * as possible without any hazards. 2: all pages should have a valid
  * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
  * pointer, that is treated as a charge to root_mem_cgroup.
  *
  * So __mem_cgroup_try_charge() will return
  *  0       ...  on success, filling *ptr with a valid memcg pointer.
  *  -ENOMEM ...  charge failure because of resource limits. *ptr is set to NULL.
  *  -EINTR  ...  if thread is fatal. *ptr is filled with root_mem_cgroup.
  *
  * Unlike the exported interface, an "oom" parameter is added. if oom==true,
  * the oom-killer can be invoked.
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
 				   gfp_t gfp_mask,
 				   unsigned int nr_pages,
 				   struct mem_cgroup **ptr,
 				   bool oom)
 {
 	unsigned int batch = max(CHARGE_BATCH, nr_pages);
 	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup *memcg = NULL;
 	int ret;
 	/*
 	 * Unlike global-vm's OOM-kill, we're not in memory shortage
 	 * in system level. So, allow to go ahead dying process in addition to
 	 * MEMDIE process.
 	 */
 	if (unlikely(test_thread_flag(TIF_MEMDIE)
 		     || fatal_signal_pending(current)))
 		goto bypass;
 	/*
 	 * We always charge the cgroup the mm_struct belongs to.
 	 * The mm_struct's mem_cgroup changes on task migration if the
 	 * thread group leader migrates. It's possible that mm is not
 	 * set, if so charge the root memcg (happens for pagecache usage).
 	 */
 	if (!*ptr && !mm)
 		*ptr = root_mem_cgroup;
 again:
 	if (*ptr) { /* css should be a valid one */
 		memcg = *ptr;
 		VM_BUG_ON(css_is_removed(&memcg->css));
 		/* the root group is not charged against a res_counter here */
 		if (mem_cgroup_is_root(memcg))
 			goto done;
 		if (nr_pages == 1 && consume_stock(memcg))
 			goto done;
 		css_get(&memcg->css);
 	} else {
 		struct task_struct *p;
 		rcu_read_lock();
 		p = rcu_dereference(mm->owner);
 		/*
 		 * Because we don't have task_lock(), "p" can exit.
 		 * In that case, "memcg" can point to root or p can be NULL with
 		 * race with swapoff. Then, we have small risk of mis-accounting.
 		 * But such kind of mis-account by race always happens because
 		 * we don't have cgroup_mutex(). It's overkill and we allow that
 		 * small race, here.
 		 * (*) swapoff et al. will charge against mm-struct not against
 		 * task-struct. So, mm->owner can be NULL.
 		 */
 		memcg = mem_cgroup_from_task(p);
 		if (!memcg)
 			memcg = root_mem_cgroup;
 		if (mem_cgroup_is_root(memcg)) {
 			rcu_read_unlock();
 			goto done;
 		}
 		if (nr_pages == 1 && consume_stock(memcg)) {
 			/*
 			 * It seems dangerous to access memcg without css_get().
 			 * But considering how consume_stock works, it's not
 			 * necessary. If consume_stock succeeds, some charges
 			 * from this memcg are cached on this cpu. So, we
 			 * don't need to call css_get()/css_tryget() before
 			 * calling consume_stock().
 			 */
 			rcu_read_unlock();
 			goto done;
 		}
 		/* after here, we may be blocked. we need to get refcnt */
 		if (!css_tryget(&memcg->css)) {
 			rcu_read_unlock();
 			goto again;
 		}
 		rcu_read_unlock();
 	}
 	/* From here on a css reference is held; every exit path drops it. */
 	do {
 		bool oom_check;
 		/* If killed, bypass charge */
 		if (fatal_signal_pending(current)) {
 			css_put(&memcg->css);
 			goto bypass;
 		}
 		/* Only allow the OOM path once the retry budget is used up. */
 		oom_check = false;
 		if (oom && !nr_oom_retries) {
 			oom_check = true;
 			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
 		}
 		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
 		switch (ret) {
 		case CHARGE_OK:
 			break;
 		case CHARGE_RETRY: /* not in OOM situation but retry */
 			/* give up batching and look the memcg up again */
 			batch = nr_pages;
 			css_put(&memcg->css);
 			memcg = NULL;
 			goto again;
 		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
 			css_put(&memcg->css);
 			goto nomem;
 		case CHARGE_NOMEM: /* OOM routine works */
 			if (!oom) {
 				css_put(&memcg->css);
 				goto nomem;
 			}
 			/* If oom, we never return -ENOMEM */
 			nr_oom_retries--;
 			break;
 		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
 			css_put(&memcg->css);
 			goto bypass;
 		}
 	} while (ret != CHARGE_OK);
 	/* Pages charged beyond the request are kept in the per-cpu stock. */
 	if (batch > nr_pages)
 		refill_stock(memcg, batch - nr_pages);
 	css_put(&memcg->css);
 done:
 	*ptr = memcg;
 	return 0;
 nomem:
 	*ptr = NULL;
 	return -ENOMEM;
 bypass:
 	*ptr = root_mem_cgroup;
 	return -EINTR;
 }
 /*
  * Sometimes we have to undo a charge we got by try_charge().
  * This function does that: it uncharges @nr_pages pages from memcg->res
  * and, with swap accounting enabled, from memcg->memsw as well.
  * Nothing is done for the root cgroup.
  */
 static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
 				       unsigned int nr_pages)
 {
 	unsigned long size;
 	if (mem_cgroup_is_root(memcg))
 		return;
 	size = nr_pages * PAGE_SIZE;
 	res_counter_uncharge(&memcg->res, size);
 	if (do_swap_account)
 		res_counter_uncharge(&memcg->memsw, size);
 }
 /*
  * Cancel charges in this cgroup only; the uncharge is not propagated to the
  * parent cgroup.  This is useful when moving usage to the parent cgroup.
  * Nothing is done for the root cgroup.
  */
 static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
 					unsigned int nr_pages)
 {
 	unsigned long size = nr_pages * PAGE_SIZE;
 	if (!mem_cgroup_is_root(memcg)) {
 		res_counter_uncharge_until(&memcg->res, memcg->res.parent,
 					   size);
 		if (do_swap_account)
 			res_counter_uncharge_until(&memcg->memsw,
 						   memcg->memsw.parent, size);
 	}
 }
 /*
  * A helper function to get a mem_cgroup from its ID. Must be called under
  * rcu_read_lock(). The caller must check css_is_removed() or similar if
  * that is a concern. (Dropping a refcnt from swap can be called against a
  * removed memcg.)
  *
  * Returns NULL for ID 0 or when css_lookup() finds nothing.
  */
 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
 {
 	struct cgroup_subsys_state *css = NULL;
 	/* ID 0 is unused ID */
 	if (id)
 		css = css_lookup(&mem_cgroup_subsys, id);
 	return css ? container_of(css, struct mem_cgroup, css) : NULL;
 }
 /*
  * Find the memcg associated with a locked @page and take a css reference
  * on it.  A page whose page_cgroup is marked used yields pc->mem_cgroup;
  * otherwise, for a swap cache page, the memcg is looked up through the ID
  * recorded for its swap entry.  Returns NULL when there is no memcg or
  * css_tryget() fails.
  */
 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 {
 	struct page_cgroup *pc;
 	struct mem_cgroup *found = NULL;
 	VM_BUG_ON(!PageLocked(page));
 	pc = lookup_page_cgroup(page);
 	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc)) {
 		found = pc->mem_cgroup;
 		if (found && !css_tryget(&found->css))
 			found = NULL;
 	} else if (PageSwapCache(page)) {
 		swp_entry_t entry;
 		unsigned short swap_id;
 		entry.val = page_private(page);
 		swap_id = lookup_swap_cgroup_id(entry);
 		rcu_read_lock();
 		found = mem_cgroup_lookup(swap_id);
 		if (found && !css_tryget(&found->css))
 			found = NULL;
 		rcu_read_unlock();
 	}
 	unlock_page_cgroup(pc);
 	return found;
 }
 /*
  * Bind @page to @memcg: store the memcg in the page's page_cgroup, mark the
  * page_cgroup as used and update the charge statistics of @memcg by
  * @nr_pages.
  *
  * @ctype:   only MEM_CGROUP_CHARGE_TYPE_ANON is distinguished here; it
  *           selects the anon statistics.
  * @lrucare: when true, a page found on an LRU list is taken off it under
  *           zone->lru_lock before pc->mem_cgroup changes and put back
  *           on the lruvec of the new memcg afterwards.
  */
 static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 				       struct page *page,
 				       unsigned int nr_pages,
 				       enum charge_type ctype,
 				       bool lrucare)
 {
 	struct page_cgroup *pc = lookup_page_cgroup(page);
 	struct zone *uninitialized_var(zone);
 	struct lruvec *lruvec;
 	bool was_on_lru = false;
 	bool anon;
 	lock_page_cgroup(pc);
 	VM_BUG_ON(PageCgroupUsed(pc));
 	/*
 	 * we don't need page_cgroup_lock about tail pages, because they are not
 	 * accessed by any other context at this point.
 	 */
 	/*
 	 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
 	 * may already be on some other mem_cgroup's LRU.  Take care of it.
 	 */
 	if (lrucare) {
 		zone = page_zone(page);
 		spin_lock_irq(&zone->lru_lock);
 		if (PageLRU(page)) {
 			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
 			ClearPageLRU(page);
 			del_page_from_lru_list(page, lruvec, page_lru(page));
 			was_on_lru = true;
 		}
 	}
 	pc->mem_cgroup = memcg;
 	/*
 	 * We access a page_cgroup asynchronously without lock_page_cgroup().
 	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
 	 * is accessed after testing USED bit. To make pc->mem_cgroup visible
 	 * before USED bit, we need memory barrier here.
 	 * See mem_cgroup_add_lru_list(), etc.
 	 */
 	smp_wmb();
 	SetPageCgroupUsed(pc);
 	if (lrucare) {
 		if (was_on_lru) {
 			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
 			VM_BUG_ON(PageLRU(page));
 			SetPageLRU(page);
 			add_page_to_lru_list(page, lruvec, page_lru(page));
 		}
 		spin_unlock_irq(&zone->lru_lock);
 	}
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
 		anon = true;
 	else
 		anon = false;
 	mem_cgroup_charge_statistics(memcg, anon, nr_pages);
 	unlock_page_cgroup(pc);
 	/*
 	 * "charge_statistics" updated event counter. Then, check it.
 	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
 	 * if they exceeds softlimit.
 	 */
 	memcg_check_events(memcg, page);
 }
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
 /*
  * Tail pages are not marked as "used", so do that here: every tail
  * page_cgroup inherits the head's mem_cgroup and its flags, minus the
  * PCGF_NOCOPY_AT_SPLIT bits.  We're under zone->lru_lock, 'splitting on
  * pmd' and compound_lock.  Charge/uncharge will never happen and
  * move_account() is done under compound_lock(), so we don't have to take
  * care of races.
  */
 void mem_cgroup_split_huge_fixup(struct page *head)
 {
 	struct page_cgroup *head_pc = lookup_page_cgroup(head);
 	int tail;
 	if (mem_cgroup_disabled())
 		return;
 	for (tail = 1; tail < HPAGE_PMD_NR; tail++) {
 		struct page_cgroup *tail_pc = head_pc + tail;
 		tail_pc->mem_cgroup = head_pc->mem_cgroup;
 		smp_wmb();/* see __commit_charge() */
 		tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
 	}
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 /**
  * mem_cgroup_move_account - move account of the page
  * @page: the page
  * @nr_pages: number of regular pages (>1 for huge pages)
  * @pc:	page_cgroup of the page.
  * @from: mem_cgroup which the page is moved from.
  * @to:	mem_cgroup which the page is moved to. @from != @to.
  *
  * The caller must confirm following.
  * - page is not on LRU (isolate_page() is useful.)
  * - compound_lock is held when nr_pages > 1
  *
  * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
  * from old cgroup.
  *
  * Returns 0 on success, -EBUSY when @nr_pages > 1 but the page is not a
  * transparent huge page, and -EINVAL when the page_cgroup is not in use or
  * is not currently owned by @from.
  */
 static int mem_cgroup_move_account(struct page *page,
 				   unsigned int nr_pages,
 				   struct page_cgroup *pc,
 				   struct mem_cgroup *from,
 				   struct mem_cgroup *to)
 {
 	unsigned long flags;
 	int ret;
 	bool anon = PageAnon(page);
 	VM_BUG_ON(from == to);
 	VM_BUG_ON(PageLRU(page));
 	/*
 	 * The page is isolated from LRU. So, collapse function
 	 * will not handle this page. But page splitting can happen.
 	 * Do this check under compound_page_lock(). The caller should
 	 * hold it.
 	 */
 	ret = -EBUSY;
 	if (nr_pages > 1 && !PageTransHuge(page))
 		goto out;
 	lock_page_cgroup(pc);
 	ret = -EINVAL;
 	if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
 		goto unlock;
 	move_lock_mem_cgroup(from, &flags);
 	if (!anon && page_mapped(page)) {
 		/* Update mapped_file data for mem_cgroup */
 		preempt_disable();
 		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		preempt_enable();
 	}
 	mem_cgroup_charge_statistics(from, anon, -nr_pages);
 	/* caller should have done css_get */
 	pc->mem_cgroup = to;
 	mem_cgroup_charge_statistics(to, anon, nr_pages);
 	/*
 	 * We charges against "to" which may not have any tasks. Then, "to"
 	 * can be under rmdir(). But in current implementation, caller of
 	 * this function is just force_empty() and move charge, so it's
 	 * guaranteed that "to" is never removed. So, we don't check rmdir
 	 * status here.
 	 */
 	move_unlock_mem_cgroup(from, &flags);
 	ret = 0;
 unlock:
 	unlock_page_cgroup(pc);
 	/*
 	 * check events (this is also reached on the -EINVAL path above)
 	 */
 	memcg_check_events(to, page);
 	memcg_check_events(from, page);
 out:
 	return ret;
 }
 /*
  * Move the charge of @page from @child to its parent (or to
  * root_mem_cgroup when @child has no parent).
  *
  * Returns -EINVAL when @child is the root cgroup, -EBUSY when no page
  * reference could be taken or the page could not be isolated from the LRU,
  * and otherwise the result of mem_cgroup_move_account().
  */
 static int mem_cgroup_move_parent(struct page *page,
 				  struct page_cgroup *pc,
 				  struct mem_cgroup *child)
 {
 	struct mem_cgroup *parent;
 	unsigned int nr_pages;
 	unsigned long uninitialized_var(flags);
 	int ret;
 	/* Is ROOT ? */
 	if (mem_cgroup_is_root(child))
 		return -EINVAL;
 	ret = -EBUSY;
 	if (!get_page_unless_zero(page))
 		goto out;
 	if (isolate_lru_page(page))
 		goto put;
 	nr_pages = hpage_nr_pages(page);
 	parent = parent_mem_cgroup(child);
 	/*
 	 * If no parent, move charges to root cgroup.
 	 */
 	if (!parent)
 		parent = root_mem_cgroup;
 	/* mem_cgroup_move_account() wants compound_lock held for huge pages */
 	if (nr_pages > 1)
 		flags = compound_lock_irqsave(page);
 	ret = mem_cgroup_move_account(page, nr_pages,
 				pc, child, parent);
 	/* on success, drop the charge from @child only, not from its ancestors */
 	if (!ret)
 		__mem_cgroup_cancel_local_charge(child, nr_pages);
 	if (nr_pages > 1)
 		compound_unlock_irqrestore(page, flags);
 	putback_lru_page(page);
 put:
 	put_page(page);
 out:
 	return ret;
 }
 /*
  * Charge the memory controller for page usage.
  * Return
  * 0 if the charge was committed (this includes the -EINTR bypass of
  *   __mem_cgroup_try_charge())
  * -ENOMEM if the cgroup is over its limit
  */
 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask, enum charge_type ctype)
 {
 	struct mem_cgroup *target = NULL;
 	unsigned int pages = 1;
 	bool may_oom = true;
 	int ret;
 	if (PageTransHuge(page)) {
 		pages <<= compound_order(page);
 		VM_BUG_ON(!PageTransHuge(page));
 		/*
 		 * Never OOM-kill a process for a huge page.  The
 		 * fault handler will fall back to regular pages.
 		 */
 		may_oom = false;
 	}
 	ret = __mem_cgroup_try_charge(mm, gfp_mask, pages, &target, may_oom);
 	if (ret != -ENOMEM) {
 		__mem_cgroup_commit_charge(target, page, pages, ctype, false);
 		ret = 0;
 	}
 	return ret;
 }
 /*
  * Charge @page to the memcg of @mm as MEM_CGROUP_CHARGE_TYPE_ANON.
  * Returns 0 when the memory controller is disabled, otherwise the result
  * of mem_cgroup_charge_common().
  */
 int mem_cgroup_newpage_charge(struct page *page,
 			      struct mm_struct *mm, gfp_t gfp_mask)
 {
 	if (mem_cgroup_disabled())
 		return 0;
 	/* the page must not be mapped yet, nor belong to a non-anon mapping */
 	VM_BUG_ON(page_mapped(page));
 	VM_BUG_ON(page->mapping && !PageAnon(page));
 	VM_BUG_ON(!mm);
 	return mem_cgroup_charge_common(page, mm, gfp_mask,
 					MEM_CGROUP_CHARGE_TYPE_ANON);
 }
 /*
  * While swap-in, try_charge -> commit or cancel, the page is locked.
  * On success *memcgp names the memcg that was charged for one page.
  * NOTE(review): the charge is presumably settled later by the caller's
  * "commit()" or "cancel()" step; confirm against the callers.
  *
  * This helper no longer tests PageSwapCache() itself; a page that is not
  * in the swap cache is expected to have been dealt with by the caller
  * before getting here.
  *
  * Returns 0 when the page is already charged or the charge succeeded (a
  * -EINTR bypass from __mem_cgroup_try_charge() is reported as 0 too), and
  * -ENOMEM when the charge failed.
  */
 static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
 					  struct page *page,
 					  gfp_t mask,
 					  struct mem_cgroup **memcgp)
 {
 	struct mem_cgroup *memcg;
 	struct page_cgroup *pc;
 	int ret;
 	pc = lookup_page_cgroup(page);
 	/*
 	 * Every swap fault against a single page tries to charge the
 	 * page, bail as early as possible.  shmem_unuse() encounters
 	 * already charged pages, too.  The USED bit is protected by
 	 * the page lock, which serializes swap cache removal, which
 	 * in turn serializes uncharging.
 	 */
 	if (PageCgroupUsed(pc))
 		return 0;
 	if (!do_swap_account)
 		goto charge_cur_mm;
 	/* Prefer the memcg already associated with the page, if any. */
 	memcg = try_get_mem_cgroup_from_page(page);
 	if (!memcg)
 		goto charge_cur_mm;
 	*memcgp = memcg;
 	ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
 	/* drop the reference taken by try_get_mem_cgroup_from_page() */
 	css_put(&memcg->css);
 	if (ret == -EINTR)
 		ret = 0;
 	return ret;
 charge_cur_mm:
 	ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
 	if (ret == -EINTR)
 		ret = 0;
 	return ret;
 }
 /*
  * Exported entry point for charging a page at swap-in.
  *
  * *memcgp is reset to NULL first.  Returns 0 when the memory controller is
  * disabled or the charge succeeded (a -EINTR bypass is reported as 0), and
  * -ENOMEM when the charge failed.
  */
 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
 				 gfp_t gfp_mask, struct mem_cgroup **memcgp)
 {
 	*memcgp = NULL;
 	if (mem_cgroup_disabled())
 		return 0;
 	/*
 	 * A racing thread's fault, or swapoff, may have already
 	 * updated the pte, and even removed page from swap cache: in
 	 * those cases unuse_pte()'s pte_same() test will fail; but
 	 * there's also a KSM case which does need to charge the page.
 	 */
 	if (!PageSwapCache(page)) {
 		int ret;
 		/* not in swap cache: charge the faulting mm directly */
 		ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
 		if (ret == -EINTR)
 			ret = 0;
 		return ret;
 	}
 	return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
 }
 /*
  * Undo a single-page charge against @memcg.  Does nothing when the memory
  * controller is disabled or @memcg is NULL.
  */
 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
 {
 	if (mem_cgroup_disabled() || !memcg)
 		return;
 	__mem_cgroup_cancel_charge(memcg, 1);
 }
 /*
  * Commit a single-page charge for @page to @memcg with LRU care, then drop
  * the swap record's memsw charge if the page is still in the swap cache.
  * Does nothing when the memory controller is disabled or @memcg is NULL.
  */
 static void
 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
 					enum charge_type ctype)
 {
 	if (mem_cgroup_disabled() || !memcg)
 		return;
 	cgroup_exclude_rmdir(&memcg->css);
 	__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
 	/*
 	 * Now swap is on-memory. This means this page may be
 	 * counted both as mem and swap....double count.
 	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
 	 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
 	 * may call delete_from_swap_cache() before we reach here.
 	 */
 	if (do_swap_account && PageSwapCache(page)) {
 		swp_entry_t entry = { .val = page_private(page) };
 		mem_cgroup_uncharge_swap(entry);
 	}
 	/*
 	 * At swapin, we may charge account against a cgroup which has no
 	 * tasks. So, rmdir()->pre_destroy() can be called while we do this
 	 * charge. In that case, we need to call pre_destroy() again. Check
 	 * it here.
 	 */
 	cgroup_release_and_wakeup_rmdir(&memcg->css);
 }
 /*
  * Commit a swap-in charge for @page to @memcg, accounted as
  * MEM_CGROUP_CHARGE_TYPE_ANON.
  */
 void mem_cgroup_commit_charge_swapin(struct page *page,
 				     struct mem_cgroup *memcg)
 {
 	__mem_cgroup_commit_charge_swapin(page, memcg,
 					  MEM_CGROUP_CHARGE_TYPE_ANON);
 }
 /*
  * Charge a page cache page as MEM_CGROUP_CHARGE_TYPE_CACHE.
  * Returns 0 without charging when the memory controller is disabled or
  * the page is a compound page.  Swap cache pages go through the swap-in
  * try_charge/commit pair, everything else through
  * mem_cgroup_charge_common().
  */
 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask)
 {
 	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
 	struct mem_cgroup *memcg = NULL;
 	int ret;
 	if (mem_cgroup_disabled())
 		return 0;
 	if (PageCompound(page))
 		return 0;
 	if (!PageSwapCache(page))
 		return mem_cgroup_charge_common(page, mm, gfp_mask, type);
 	/* page is swapcache/shmem */
 	ret = __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
 	if (!ret)
 		__mem_cgroup_commit_charge_swapin(page, memcg, type);
 	return ret;
 }
 /*
  * Give back @nr_pages pages of charge to @memcg, either right away or by
  * adding them to the current task's uncharge batch.
  */
 static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
 				   unsigned int nr_pages,
 				   const enum charge_type ctype)
 {
 	struct memcg_batch_info *batch = &current->memcg_batch;
 	/* If swapout, usage of swap doesn't decrease */
 	bool uncharge_memsw = do_swap_account &&
 			      ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
 	/*
 	 * Usually we do css_get() when we remember a memcg pointer.
 	 * But in this case, we keep res->usage until the end of a series of
 	 * uncharges. Then, it's ok to ignore memcg's refcnt.
 	 */
 	if (!batch->memcg)
 		batch->memcg = memcg;
 	/*
 	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
 	 * In those cases, all pages freed continuously can be expected to be
 	 * in the same cgroup and we have a chance to coalesce uncharges.
 	 * But we uncharge one by one if this task is killed by OOM
 	 * (TIF_MEMDIE) because we want to uncharge as soon as possible.
 	 * Huge pages are never batched, and neither is a memcg other than
 	 * the one the batch was started for.
 	 */
 	if (batch->do_batch && !test_thread_flag(TIF_MEMDIE) &&
 	    nr_pages <= 1 && batch->memcg == memcg) {
 		/* remember freed charge and uncharge it later */
 		batch->nr_pages++;
 		if (uncharge_memsw)
 			batch->memsw_nr_pages++;
 		return;
 	}
 	/* direct uncharge */
 	res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
 	if (uncharge_memsw)
 		res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
 	if (unlikely(batch->memcg != memcg))
 		memcg_oom_recover(memcg);
 }
 /*
  * uncharge if !page_mapped(page)
  *
  * Common uncharge path: clears the USED bit of the page's page_cgroup,
  * updates the statistics of the owning memcg and, unless @end_migration
  * is set or the memcg is the root, gives the charge back through
  * mem_cgroup_do_uncharge().
  *
  * Returns the memcg the page was uncharged from, or NULL when the memory
  * controller is disabled, the page_cgroup is not in use, or the @ctype
  * specific checks decide the page must stay charged.
  */
 static struct mem_cgroup *
 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
 			     bool end_migration)
 {
 	struct mem_cgroup *memcg = NULL;
 	unsigned int nr_pages = 1;
 	struct page_cgroup *pc;
 	bool anon;
 	if (mem_cgroup_disabled())
 		return NULL;
 	VM_BUG_ON(PageSwapCache(page));
 	if (PageTransHuge(page)) {
 		nr_pages <<= compound_order(page);
 		VM_BUG_ON(!PageTransHuge(page));
 	}
 	/*
 	 * Check if our page_cgroup is valid
 	 */
 	pc = lookup_page_cgroup(page);
 	if (unlikely(!PageCgroupUsed(pc)))
 		return NULL;
 	lock_page_cgroup(pc);
 	memcg = pc->mem_cgroup;
 	/* re-check under the page_cgroup lock */
 	if (!PageCgroupUsed(pc))
 		goto unlock_out;
 	anon = PageAnon(page);
 	switch (ctype) {
 	case MEM_CGROUP_CHARGE_TYPE_ANON:
 		/*
 		 * Generally PageAnon tells if it's the anon statistics to be
 		 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
 		 * used before page reached the stage of being marked PageAnon.
 		 */
 		anon = true;
 		/* fallthrough */
 	case MEM_CGROUP_CHARGE_TYPE_DROP:
 		/* See mem_cgroup_prepare_migration() */
 		if (page_mapped(page))
 			goto unlock_out;
 		/*
 		 * Pages under migration may not be uncharged.  But
 		 * end_migration() /must/ be the one uncharging the
 		 * unused post-migration page and so it has to call
 		 * here with the migration bit still set.  See the
 		 * res_counter handling below.
 		 */
 		if (!end_migration && PageCgroupMigration(pc))
 			goto unlock_out;
 		break;
 	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
 		if (!PageAnon(page)) {	/* Shared memory */
 			if (page->mapping && !page_is_file_cache(page))
 				goto unlock_out;
 		} else if (page_mapped(page)) /* Anon */
 				goto unlock_out;
 		break;
 	default:
 		break;
 	}
 	mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
 	ClearPageCgroupUsed(pc);
 	/*
 	 * pc->mem_cgroup is not cleared here. It will be accessed when it's
 	 * freed from LRU. This is safe because uncharged page is expected not
 	 * to be reused (freed soon). Exception is SwapCache, it's handled by
 	 * special functions.
 	 */
 	unlock_page_cgroup(pc);
 	/*
 	 * even after unlock, we have memcg->res.usage here and this memcg
 	 * will never be freed.
 	 */
 	memcg_check_events(memcg, page);
 	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
 		mem_cgroup_swap_statistics(memcg, true);
 		mem_cgroup_get(memcg);
 	}
 	/*
 	 * Migration does not charge the res_counter for the
 	 * replacement page, so leave it alone when phasing out the
 	 * page that is unused after the migration.
 	 */
 	if (!end_migration && !mem_cgroup_is_root(memcg))
 		mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
 	return memcg;
 unlock_out:
 	unlock_page_cgroup(pc);
 	return NULL;
 }
 /*
  * Uncharge @page as MEM_CGROUP_CHARGE_TYPE_ANON.  Pages that are still
  * mapped or sit in the swap cache are left alone.
  */
 void mem_cgroup_uncharge_page(struct page *page)
 {
 	/* early check. */
 	if (page_mapped(page))
 		return;
 	VM_BUG_ON(page->mapping && !PageAnon(page));
 	if (!PageSwapCache(page))
 		__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON,
 					     false);
 }
 /*
  * Uncharge @page as MEM_CGROUP_CHARGE_TYPE_CACHE.  The page must be
  * neither mapped nor attached to a mapping any more.
  */
 void mem_cgroup_uncharge_cache_page(struct page *page)
 {
 	VM_BUG_ON(page_mapped(page));
 	VM_BUG_ON(page->mapping);
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
 }
 /*
  * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
  * In that cases, pages are freed continuously and we can expect pages
  * are in the same memcg. All these calls itself limits the number of
  * pages freed at once, then uncharge_start/end() is called properly.
  * This may be called prural(2) times in a context,
  */
 void mem_cgroup_uncharge_start(void)
 {
 	current->memcg_batch.do_batch++;
 	/* We can do nest. */
 	if (current->memcg_batch.do_batch == 1) {
 		current->memcg_batch.memcg = NULL;
 		current->memcg_batch.nr_pages = 0;
 		current->memcg_batch.memsw_nr_pages = 0;
 	}
 }
 void mem_cgroup_uncharge_end(void)
 {
 	struct memcg_batch_info *batch = &current->memcg_batch;
 	if (!batch->do_batch)
 		return;
 	batch->do_batch--;
 	if (batch->do_batch) /* If stacked, do nothing. */
 		return;
 	if (!batch->memcg)
 		return;
 	/*
 	 * This "batch->memcg" is valid without any css_get/put etc...
 	 * bacause we hide charges behind us.
 	 */
 	if (batch->nr_pages)
 		res_counter_uncharge(&batch->memcg->res,
 				     batch->nr_pages * PAGE_SIZE);
 	if (batch->memsw_nr_pages)
 		res_counter_uncharge(&batch->memcg->memsw,
 				     batch->memsw_nr_pages * PAGE_SIZE);
 	memcg_oom_recover(batch->memcg);
 	/* forget this pointer (for sanity check) */
 	batch->memcg = NULL;
 }
 #ifdef CONFIG_SWAP
 /*
  * Called after __delete_from_swap_cache() to drop the "page" account.
  * On swapout the memcg information is recorded in the swap_cgroup of "ent".
  */
 void
 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 {
 	/* !swapout: this was a swap cache but the swap is unused ! */
 	int ctype = swapout ? MEM_CGROUP_CHARGE_TYPE_SWAPOUT
 			    : MEM_CGROUP_CHARGE_TYPE_DROP;
 	struct mem_cgroup *memcg;
 	memcg = __mem_cgroup_uncharge_common(page, ctype, false);
 	/*
 	 * Record memcg information: if swapout && memcg != NULL,
 	 * mem_cgroup_get() was called in uncharge().
 	 */
 	if (do_swap_account && swapout && memcg)
 		swap_cgroup_record(ent, css_id(&memcg->css));
 }
 #endif
 #ifdef CONFIG_MEMCG_SWAP
 /*
  * Called from swap_entry_free(). Removes the record in swap_cgroup and
  * uncharges the "memsw" account.
  */
 void mem_cgroup_uncharge_swap(swp_entry_t ent)
 {
 	struct mem_cgroup *owner;
 	unsigned short swap_id;
 	if (!do_swap_account)
 		return;
 	/* Clear the record; the previous owner ID is handed back. */
 	swap_id = swap_cgroup_record(ent, 0);
 	rcu_read_lock();
 	owner = mem_cgroup_lookup(swap_id);
 	if (!owner)
 		goto unlock;
 	/*
 	 * We uncharge this because swap is freed.
 	 * This memcg can be an obsolete one. We avoid calling css_tryget.
 	 */
 	if (!mem_cgroup_is_root(owner))
 		res_counter_uncharge(&owner->memsw, PAGE_SIZE);
 	mem_cgroup_swap_statistics(owner, false);
 	mem_cgroup_put(owner);
 unlock:
 	rcu_read_unlock();
 }
 /**
  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
  * @entry: swap entry to be moved
  * @from:  mem_cgroup which the entry is moved from
  * @to:  mem_cgroup which the entry is moved to
  *
  * It succeeds only when the swap_cgroup's record for this entry is the same
  * as the mem_cgroup's id of @from.
  *
  * Returns 0 on success, -EINVAL on failure.
  *
  * The caller must have charged to @to, IOW, called res_counter_charge() about
  * both res and memsw, and called css_get().
  */
 static int mem_cgroup_move_swap_account(swp_entry_t entry,
 				struct mem_cgroup *from, struct mem_cgroup *to)
 {
 	unsigned short from_id = css_id(&from->css);
 	unsigned short to_id = css_id(&to->css);
 	/* The record did not name @from: nothing was moved. */
 	if (swap_cgroup_cmpxchg(entry, from_id, to_id) != from_id)
 		return -EINVAL;
 	mem_cgroup_swap_statistics(from, false);
 	mem_cgroup_swap_statistics(to, true);
 	/*
 	 * This function is only called from task migration context now.
 	 * It postpones res_counter and refcount handling till the end
 	 * of task migration(mem_cgroup_clear_mc()) for performance
 	 * improvement. But we cannot postpone mem_cgroup_get(to)
 	 * because if the process that has been moved to @to does
 	 * swap-in, the refcount of @to might be decreased to 0.
 	 */
 	mem_cgroup_get(to);
 	return 0;
 }
 #else
 /* Stub used when CONFIG_MEMCG_SWAP is not set: always fails with -EINVAL. */
 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
 				struct mem_cgroup *from, struct mem_cgroup *to)
 {
 	return -EINVAL;
 }
 #endif
 /*
  * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
  * page belongs to.
  *
  * *memcgp is set to the memcg of the old page (with a css reference taken)
  * when that page is charged, and to NULL otherwise.
  */
 void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
 				  struct mem_cgroup **memcgp)
 {
 	struct mem_cgroup *memcg = NULL;
 	struct page_cgroup *pc;
 	enum charge_type ctype;
 	*memcgp = NULL;
 	VM_BUG_ON(PageTransHuge(page));
 	if (mem_cgroup_disabled())
 		return;
 	pc = lookup_page_cgroup(page);
 	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc)) {
 		memcg = pc->mem_cgroup;
 		css_get(&memcg->css);
 		/*
 		 * At migrating an anonymous page, its mapcount goes down
 		 * to 0 and uncharge() will be called. But, even if it's fully
 		 * unmapped, migration may fail and this page has to be
 		 * charged again. We set MIGRATION flag here and delay uncharge
 		 * until end_migration() is called
 		 *
 		 * Corner Case Thinking
 		 * A)
 		 * When the old page was mapped as Anon and it's unmap-and-freed
 		 * while migration was ongoing.
 		 * If unmap finds the old page, uncharge() of it will be delayed
 		 * until end_migration(). If unmap finds a new page, it's
 		 * uncharged when it make mapcount to be 1->0. If unmap code
 		 * finds swap_migration_entry, the new page will not be mapped
 		 * and end_migration() will find it(mapcount==0).
 		 *
 		 * B)
 		 * When the old page was mapped but migration fails, the kernel
 		 * remaps it. A charge for it is kept by MIGRATION flag even
 		 * if mapcount goes down to 0. We can do remap successfully
 		 * without charging it again.
 		 *
 		 * C)
 		 * The "old" page is under lock_page() until the end of
 		 * migration, so, the old page itself will not be swapped-out.
 		 * If the new page is swapped out before end_migration, our
 		 * hook to usual swap-out path will catch the event.
 		 */
 		if (PageAnon(page))
 			SetPageCgroupMigration(pc);
 	}
 	unlock_page_cgroup(pc);
 	/*
 	 * If the page is not charged at this point,
 	 * we return here.
 	 */
 	if (!memcg)
 		return;
 	*memcgp = memcg;
 	/*
 	 * We charge new page before it's used/mapped. So, even if unlock_page()
 	 * is called before end_migration, we can catch all events on this new
 	 * page. In the case new page is migrated but not remapped, new page's
 	 * mapcount will be finally 0 and we call uncharge in end_migration().
 	 */
 	if (PageAnon(page))
 		ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
 	else
 		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
 	/*
 	 * The page is committed to the memcg, but it's not actually
 	 * charged to the res_counter since we plan on replacing the
 	 * old one and only one page is going to be left afterwards.
 	 */
 	__mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
 }
 /*
  * Finish the memcg side of a page migration: remove the redundant charge
  * (the new page's if migration failed, the old page's if it succeeded).
  *
  * @memcg:        memcg returned through @memcgp by
  *                mem_cgroup_prepare_migration(); NULL means nothing to do
  * @oldpage:      page that was migrated from
  * @newpage:      page that was migrated to
  * @migration_ok: true when the migration succeeded
  */
 void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 	struct page *oldpage, struct page *newpage, bool migration_ok)
 {
 	struct page *used, *unused;
 	struct page_cgroup *pc;
 	bool anon;
 	if (!memcg)
 		return;
 	/* blocks rmdir() */
 	cgroup_exclude_rmdir(&memcg->css);
 	/* "used" is the page that survives, "unused" the one to uncharge. */
 	if (!migration_ok) {
 		used = oldpage;
 		unused = newpage;
 	} else {
 		used = newpage;
 		unused = oldpage;
 	}
 	anon = PageAnon(used);
 	__mem_cgroup_uncharge_common(unused,
 				     anon ? MEM_CGROUP_CHARGE_TYPE_ANON
 				     : MEM_CGROUP_CHARGE_TYPE_CACHE,
 				     true);
 	/* Pairs with the css_get() in mem_cgroup_prepare_migration(). */
 	css_put(&memcg->css);
 	/*
 	 * We disallowed uncharge of pages under migration because mapcount
 	 * of the page goes down to zero, temporarily.
 	 * Clear the flag and check the page should be charged.
 	 */
 	pc = lookup_page_cgroup(oldpage);
 	lock_page_cgroup(pc);
 	ClearPageCgroupMigration(pc);
 	unlock_page_cgroup(pc);
 	/*
 	 * If a page is a file cache, radix-tree replacement is very atomic
 	 * and we can skip this check. When it was an Anon page, its mapcount
 	 * goes down to 0. But because we added MIGRATION flag, it's not
 	 * uncharged yet. There are several cases but page->mapcount check
 	 * and USED bit check in mem_cgroup_uncharge_page() will do enough
 	 * check. (see prepare_charge() also)
 	 */
 	if (anon)
 		mem_cgroup_uncharge_page(used);
 	/*
 	 * At migration, we may charge account against cgroup which has no
 	 * tasks.
 	 * So, rmdir()->pre_destroy() can be called while we do this charge.
 	 * In that case, we need to call pre_destroy() again. check it here.
 	 */
 	cgroup_release_and_wakeup_rmdir(&memcg->css);
 }
 /*
  * Hand the page-cache charge of @oldpage over to @newpage.
  *
  * At replace page cache, newpage is not under any memcg but it's on
  * LRU. So, this function doesn't touch res_counter but handles LRU
  * in correct way. Both pages are locked so we cannot race with uncharge.
  */
 void mem_cgroup_replace_page_cache(struct page *oldpage,
 				  struct page *newpage)
 {
 	struct mem_cgroup *owner = NULL;
 	struct page_cgroup *old_pc;

 	if (mem_cgroup_disabled())
 		return;

 	/* Drop the old page's accounting and remember who owned it. */
 	old_pc = lookup_page_cgroup(oldpage);
 	lock_page_cgroup(old_pc);
 	if (PageCgroupUsed(old_pc)) {
 		owner = old_pc->mem_cgroup;
 		mem_cgroup_charge_statistics(owner, false, -1);
 		ClearPageCgroupUsed(old_pc);
 	}
 	unlock_page_cgroup(old_pc);

 	/*
 	 * When called from shmem_replace_page(), the old page may or may
 	 * not have been charged; only a charged page has something to
 	 * transfer.
 	 *
 	 * Even if newpage->mapping was NULL before starting replacement,
 	 * the newpage may be on LRU(or pagevec for LRU) already, so the
 	 * commit is done in its LRU-careful mode (last argument true).
 	 */
 	if (owner)
 		__mem_cgroup_commit_charge(owner, newpage, 1,
 					   MEM_CGROUP_CHARGE_TYPE_CACHE, true);
 }
 #ifdef CONFIG_DEBUG_VM
 /*
  * Return the page_cgroup of @page if it exists and has its USED bit set,
  * NULL otherwise.
  */
 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
 {
 	struct page_cgroup *desc = lookup_page_cgroup(page);

 	/*
 	 * Can be NULL while feeding pages into the page allocator for
 	 * the first time, i.e. during boot or memory hotplug;
 	 * or when mem_cgroup_disabled().
 	 */
 	if (!likely(desc))
 		return NULL;

 	return PageCgroupUsed(desc) ? desc : NULL;
 }
 /*
  * Debug helper: true when memcg is enabled and @page still has a
  * page_cgroup marked as used.
  */
 bool mem_cgroup_bad_page_check(struct page *page)
 {
 	return !mem_cgroup_disabled() &&
 	       lookup_page_cgroup_used(page) != NULL;
 }
 /*
  * Debug helper: log the page_cgroup pointer, flags and owning memcg of
  * @page, but only when its page_cgroup is still marked as used.
  */
 void mem_cgroup_print_bad_page(struct page *page)
 {
 	struct page_cgroup *desc = lookup_page_cgroup_used(page);

 	if (!desc)
 		return;

 	printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
 	       desc, desc->flags, desc->mem_cgroup);
 }
 #endif
 /* Held while the mem and mem+swap limits are read and updated together. */
 static DEFINE_MUTEX(set_limit_mutex);

 /*
  * Set the memory limit of @memcg to @val, reclaiming as needed.
  *
  * Returns 0 on success, -EINTR when a signal is pending, -EINVAL when
  * @val would exceed the mem+swap limit, or the last error reported by
  * res_counter_set_limit() once the retry budget is exhausted.
  */
 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 				unsigned long long val)
 {
 	/*
 	 * For keeping hierarchical_reclaim simple, how long we should retry
 	 * depends on callers. The budget is a function of the number of
 	 * children which we should visit in this loop.
 	 */
 	int budget = MEM_CGROUP_RECLAIM_RETRIES *
 		     mem_cgroup_count_children(memcg);
 	u64 prev_usage = res_counter_read_u64(&memcg->res, RES_USAGE);
 	int grew = 0;
 	int err = 0;

 	while (budget) {
 		u64 memsw_cap, cur_usage;

 		if (signal_pending(current)) {
 			err = -EINTR;
 			break;
 		}

 		/*
 		 * Open coded on purpose so the invariant is visible:
 		 * memcg->res.limit <= memcg->memsw.limit must always hold.
 		 */
 		mutex_lock(&set_limit_mutex);
 		memsw_cap = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
 		if (memsw_cap < val) {
 			err = -EINVAL;
 			mutex_unlock(&set_limit_mutex);
 			break;
 		}
 		if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val)
 			grew = 1;
 		err = res_counter_set_limit(&memcg->res, val);
 		if (!err)
 			memcg->memsw_is_minimum = (memsw_cap == val);
 		mutex_unlock(&set_limit_mutex);

 		if (!err)
 			break;

 		mem_cgroup_reclaim(memcg, GFP_KERNEL,
 				   MEM_CGROUP_RECLAIM_SHRINK);

 		/* Only burn a retry when reclaim did not lower the usage. */
 		cur_usage = res_counter_read_u64(&memcg->res, RES_USAGE);
 		if (cur_usage < prev_usage)
 			prev_usage = cur_usage;
 		else
 			budget--;
 	}

 	if (!err && grew)
 		memcg_oom_recover(memcg);
 	return err;
 }
 /*
  * Set the mem+swap limit of @memcg to @val, reclaiming as needed.
  *
  * Returns 0 on success, -EINTR when a signal is pending, -EINVAL when
  * @val would fall below the memory limit, or -EBUSY (or whatever
  * res_counter_set_limit() last reported) when the retry budget runs out.
  */
 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 					unsigned long long val)
 {
 	/* Same retry budget rule as mem_cgroup_resize_limit(). */
 	int budget = mem_cgroup_count_children(memcg) *
 		     MEM_CGROUP_RECLAIM_RETRIES;
 	u64 prev_usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
 	int err = -EBUSY;
 	int grew = 0;

 	while (budget) {
 		u64 mem_cap, cur_usage;

 		if (signal_pending(current)) {
 			err = -EINTR;
 			break;
 		}

 		/*
 		 * Open coded on purpose so the invariant is visible:
 		 * memcg->res.limit <= memcg->memsw.limit must always hold.
 		 */
 		mutex_lock(&set_limit_mutex);
 		mem_cap = res_counter_read_u64(&memcg->res, RES_LIMIT);
 		if (mem_cap > val) {
 			err = -EINVAL;
 			mutex_unlock(&set_limit_mutex);
 			break;
 		}
 		if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val)
 			grew = 1;
 		err = res_counter_set_limit(&memcg->memsw, val);
 		if (!err)
 			memcg->memsw_is_minimum = (mem_cap == val);
 		mutex_unlock(&set_limit_mutex);

 		if (!err)
 			break;

 		mem_cgroup_reclaim(memcg, GFP_KERNEL,
 				   MEM_CGROUP_RECLAIM_NOSWAP |
 				   MEM_CGROUP_RECLAIM_SHRINK);

 		/* Only burn a retry when reclaim did not lower the usage. */
 		cur_usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
 		if (cur_usage < prev_usage)
 			prev_usage = cur_usage;
 		else
 			budget--;
 	}

 	if (!err && grew)
 		memcg_oom_recover(memcg);
 	return err;
 }
 /*
  * Reclaim from the memcgs that exceed their soft limit in @zone.
  *
  * @zone:          zone to reclaim from
  * @order:         allocation order; anything above 0 makes this a no-op
  * @gfp_mask:      passed through to mem_cgroup_soft_reclaim()
  * @total_scanned: incremented by the number of pages scanned
  *
  * Returns the number of pages reclaimed.
  */
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 					    gfp_t gfp_mask,
 					    unsigned long *total_scanned)
 {
 	unsigned long nr_reclaimed = 0;
 	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
 	unsigned long reclaimed;
 	int loop = 0;
 	struct mem_cgroup_tree_per_zone *mctz;
 	unsigned long long excess;
 	unsigned long nr_scanned;
 	if (order > 0)
 		return 0;
 	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
 	/*
 	 * This loop can run a while, specially if mem_cgroup's continuously
 	 * keep exceeding their soft limit and putting the system under
 	 * pressure
 	 */
 	do {
 		/* Reuse the candidate picked at the end of the last pass. */
 		if (next_mz)
 			mz = next_mz;
 		else
 			mz = mem_cgroup_largest_soft_limit_node(mctz);
 		if (!mz)
 			break;
 		nr_scanned = 0;
 		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
 						    gfp_mask, &nr_scanned);
 		nr_reclaimed += reclaimed;
 		*total_scanned += nr_scanned;
 		spin_lock(&mctz->lock);
 		/*
 		 * If we failed to reclaim anything from this memory cgroup
 		 * it is time to move on to the next cgroup
 		 */
 		next_mz = NULL;
 		if (!reclaimed) {
 			do {
 				/*
 				 * Loop until we find yet another one.
 				 *
 				 * By the time we get the soft_limit lock
 				 * again, someone might have added the
 				 * group back on the RB tree. Iterate to
 				 * make sure we get a different mem.
 				 * mem_cgroup_largest_soft_limit_node returns
 				 * NULL if no other cgroup is present on
 				 * the tree
 				 */
 				next_mz =
 				__mem_cgroup_largest_soft_limit_node(mctz);
 				/*
 				 * Same node again: drop the extra css
 				 * reference (presumably taken by the lookup
 				 * above -- confirm) and retry.
 				 */
 				if (next_mz == mz)
 					css_put(&next_mz->memcg->css);
 				else /* next_mz == NULL or other memcg */
 					break;
 			} while (1);
 		}
 		/* Re-queue mz according to its current excess. */
 		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
 		excess = res_counter_soft_limit_excess(&mz->memcg->res);
 		/*
 		 * One school of thought says that we should not add
 		 * back the node to the tree if reclaim returns 0.
 		 * But our reclaim could return 0, simply because due
 		 * to priority we are exposing a smaller subset of
 		 * memory to reclaim from. Consider this as a longer
 		 * term TODO.
 		 */
 		/* If excess == 0, no tree ops */
 		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
 		spin_unlock(&mctz->lock);
 		css_put(&mz->memcg->css);
 		loop++;
 		/*
 		 * Could not reclaim anything and there are no more
 		 * mem cgroups to try or we seem to be looping without
 		 * reclaiming anything.
 		 */
 		if (!nr_reclaimed &&
 			(next_mz == NULL ||
 			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
 			break;
 	} while (!nr_reclaimed);
 	/* A candidate picked for a pass that never ran still holds a ref. */
 	if (next_mz)
 		css_put(&next_mz->memcg->css);
 	return nr_reclaimed;
 }
 /*
  * Traverse a specified page_cgroup list and try to drop them all.  This doesn't
  * reclaim the pages themselves - it just removes the page_cgroups.
  * Returns true if some page_cgroups were not freed, indicating that the caller
  * must retry this operation.
  *
  * @memcg: memcg whose list is emptied
  * @node:  node id of the per-zone LRU list
  * @zid:   zone index within @node
  * @lru:   which LRU list of that zone to walk
  */
 static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 				int node, int zid, enum lru_list lru)
 {
 	struct mem_cgroup_per_zone *mz;
 	unsigned long flags, loop;
 	struct list_head *list;
 	struct page *busy;
 	struct zone *zone;
 	zone = &NODE_DATA(node)->node_zones[zid];
 	mz = mem_cgroup_zoneinfo(memcg, node, zid);
 	list = &mz->lruvec.lists[lru];
 	/* Bound the walk by the list size sampled up front. */
 	loop = mz->lru_size[lru];
 	/* give some margin against EBUSY etc...*/
 	loop += 256;
 	busy = NULL;
 	while (loop--) {
 		struct page_cgroup *pc;
 		struct page *page;
 		spin_lock_irqsave(&zone->lru_lock, flags);
 		if (list_empty(list)) {
 			spin_unlock_irqrestore(&zone->lru_lock, flags);
 			break;
 		}
 		/* Always work on the tail of the list. */
 		page = list_entry(list->prev, struct page, lru);
 		/*
 		 * The page that just failed is at the tail again: rotate it
 		 * to the head so the next iteration sees a different page.
 		 */
 		if (busy == page) {
 			list_move(&page->lru, list);
 			busy = NULL;
 			spin_unlock_irqrestore(&zone->lru_lock, flags);
 			continue;
 		}
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 		pc = lookup_page_cgroup(page);
 		if (mem_cgroup_move_parent(page, pc, memcg)) {
 			/* found lock contention or "pc" is obsolete. */
 			busy = page;
 			cond_resched();
 		} else
 			busy = NULL;
 	}
 	return !list_empty(list);
 }
 /*
  * make mem_cgroup's charge to be 0 if there is no task.
  * This enables deleting this mem_cgroup.
  *
  * @memcg:    memcg to empty
  * @free_all: when true, first try to free the pages (try_to_free path)
  *            before falling back to moving the charges away
  *
  * Returns 0 on success, -EBUSY when the cgroup still has tasks or
  * children (or the try_to_free path is entered a second time), and
  * -EINTR when a signal is pending during the freeing phase.
  */
 static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
 {
 	int ret;
 	int node, zid, shrink;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct cgroup *cgrp = memcg->css.cgroup;
 	/* Hold a css reference for the whole operation; dropped at "out". */
 	css_get(&memcg->css);
 	shrink = 0;
 	/* should free all ? */
 	if (free_all)
 		goto try_to_free;
 move_account:
 	do {
 		ret = -EBUSY;
 		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
 			goto out;
 		/* This is for making all *used* pages to be on LRU. */
 		lru_add_drain_all();
 		drain_all_stock_sync(memcg);
 		ret = 0;
 		mem_cgroup_start_move(memcg);
 		/* Walk every LRU list of every zone; stop at the first failure. */
 		for_each_node_state(node, N_HIGH_MEMORY) {
 			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
 				enum lru_list lru;
 				for_each_lru(lru) {
 					ret = mem_cgroup_force_empty_list(memcg,
 							node, zid, lru);
 					if (ret)
 						break;
 				}
 			}
 			if (ret)
 				break;
 		}
 		mem_cgroup_end_move(memcg);
 		memcg_oom_recover(memcg);
 		cond_resched();
 	/* "ret" should also be checked to ensure all lists are empty. */
 	} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
 out:
 	css_put(&memcg->css);
 	return ret;
 try_to_free:
 	/* returns EBUSY if there is a task or if we come here twice. */
 	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
 		ret = -EBUSY;
 		goto out;
 	}
 	/* we call try-to-free pages for make this cgroup empty */
 	lru_add_drain_all();
 	/* try to free all pages in this cgroup */
 	shrink = 1;
 	while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
 		int progress;
 		if (signal_pending(current)) {
 			ret = -EINTR;
 			goto out;
 		}
 		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
 						false);
 		/* Only a pass without progress consumes a retry. */
 		if (!progress) {
 			nr_retries--;
 			/* maybe some writeback is necessary */
 			congestion_wait(BLK_RW_ASYNC, HZ/10);
 		}
 	}
 	lru_add_drain();
 	/* try move_account...there may be some *locked* pages. */
 	goto move_account;
 }
 /*
  * cgroup file handler: force-empty the memcg of @cont, freeing its
  * pages first. @event is unused.
  */
 static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

 	return mem_cgroup_force_empty(memcg, true);
 }
 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
 {
 	return mem_cgroup_from_cont(cont)->use_hierarchy;
 }
 /*
  * cgroup file handler: set the use_hierarchy flag of @cont's memcg.
  *
  * Returns 0 on success (including "already has that value"), -EINVAL
  * when the parent already uses hierarchy or @val is neither 0 nor 1,
  * and -EBUSY when the cgroup has children.
  */
 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
 					u64 val)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	struct cgroup *parent = cont->parent;
 	struct mem_cgroup *parent_memcg = NULL;
 	int retval = 0;

 	if (parent)
 		parent_memcg = mem_cgroup_from_cont(parent);

 	cgroup_lock();

 	if (memcg->use_hierarchy == val)
 		goto out;

 	/*
 	 * If parent's use_hierarchy is set, we can't make any modifications
 	 * in the child subtrees. For the root cgroup parent_memcg is NULL,
 	 * which does not block the change.
 	 */
 	if (parent_memcg && parent_memcg->use_hierarchy) {
 		retval = -EINVAL;
 		goto out;
 	}

 	/* Only the boolean values are accepted. */
 	if (val != 1 && val != 0) {
 		retval = -EINVAL;
 		goto out;
 	}

 	/* The change is allowed only while the cgroup has no children. */
 	if (!list_empty(&cont->children))
 		retval = -EBUSY;
 	else
 		memcg->use_hierarchy = val;
 out:
 	cgroup_unlock();
 	return retval;
 }
 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
 					       enum mem_cgroup_stat_index idx)
 {
 	struct mem_cgroup *iter;
 	long val = 0;
 	/* Per-cpu values can be negative, use a signed accumulator */
 	for_each_mem_cgroup_tree(iter, memcg)
 		val += mem_cgroup_read_stat(iter, idx);
 	if (val < 0) /* race ? */
 		val = 0;
 	return val;
 }
 /*
  * Current usage of @memcg in bytes; @swap selects mem+swap over mem.
  *
  * Non-root groups read the matching res_counter. The root group is
  * computed from the recursive cache/rss (and swap) statistics instead.
  */
 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 {
 	u64 pages;

 	if (!mem_cgroup_is_root(memcg))
 		return res_counter_read_u64(swap ? &memcg->memsw : &memcg->res,
 					    RES_USAGE);

 	pages = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
 	pages += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
 	if (swap)
 		pages += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);

 	/* Statistics are kept in pages; convert to bytes. */
 	return pages << PAGE_SHIFT;
 }
 /*
  * cgroup file handler: format one res_counter attribute (or the usage)
  * of the mem or mem+swap counter as a decimal line and copy it to the
  * user buffer. Returns -EOPNOTSUPP for mem+swap files when swap
  * accounting is off.
  */
 static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
 			       struct file *file, char __user *buf,
 			       size_t nbytes, loff_t *ppos)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	int type = MEMFILE_TYPE(cft->private);
 	int name = MEMFILE_ATTR(cft->private);
 	char text[64];
 	u64 val;
 	int len;

 	if (type == _MEMSWAP && !do_swap_account)
 		return -EOPNOTSUPP;

 	/* RES_USAGE goes through mem_cgroup_usage(); the rest is raw. */
 	switch (type) {
 	case _MEM:
 		val = (name == RES_USAGE) ?
 			mem_cgroup_usage(memcg, false) :
 			res_counter_read_u64(&memcg->res, name);
 		break;
 	case _MEMSWAP:
 		val = (name == RES_USAGE) ?
 			mem_cgroup_usage(memcg, true) :
 			res_counter_read_u64(&memcg->memsw, name);
 		break;
 	default:
 		BUG();
 	}

 	len = scnprintf(text, sizeof(text), "%llu\n", (unsigned long long)val);
 	return simple_read_from_buffer(buf, nbytes, ppos, text, len);
 }
 /*
  * cgroup file handler for the writable limit files.
  *
  * Handles RES_LIMIT (mem and mem+swap, never on the root group) and
  * RES_SOFT_LIMIT (mem only). Returns 0 on success, -EOPNOTSUPP for
  * mem+swap files when swap accounting is off, the parser's error for a
  * malformed @buffer, and -EINVAL for anything unsupported.
  */
 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 			    const char *buffer)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	int type = MEMFILE_TYPE(cft->private);
 	int name = MEMFILE_ATTR(cft->private);
 	unsigned long long val;
 	int ret;

 	if (type == _MEMSWAP && !do_swap_account)
 		return -EOPNOTSUPP;

 	switch (name) {
 	case RES_LIMIT:
 		/* Can't set limit on root */
 		if (mem_cgroup_is_root(memcg))
 			return -EINVAL;

 		/* This function does all necessary parse...reuse it */
 		ret = res_counter_memparse_write_strategy(buffer, &val);
 		if (ret)
 			return ret;

 		if (type == _MEM)
 			return mem_cgroup_resize_limit(memcg, val);
 		return mem_cgroup_resize_memsw_limit(memcg, val);

 	case RES_SOFT_LIMIT:
 		ret = res_counter_memparse_write_strategy(buffer, &val);
 		if (ret)
 			return ret;

 		/*
 		 * For memsw, soft limits are hard to implement in terms
 		 * of semantics, for now, we support soft limits for
 		 * control without swap
 		 */
 		if (type != _MEM)
 			return -EINVAL;
 		return res_counter_set_soft_limit(&memcg->res, val);

 	default:
 		return -EINVAL; /* should be BUG() ? */
 	}
 }
 /*
  * Compute the effective limits of @memcg: its own limits, lowered to
  * the smallest limit of any ancestor reached while use_hierarchy stays
  * set along the way. Results are stored in @mem_limit / @memsw_limit.
  */
 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
 		unsigned long long *mem_limit, unsigned long long *memsw_limit)
 {
 	unsigned long long lowest_mem, lowest_memsw, cur;
 	struct cgroup *cgroup;

 	lowest_mem = res_counter_read_u64(&memcg->res, RES_LIMIT);
 	lowest_memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

 	if (memcg->use_hierarchy) {
 		/* Climb until the root or a group outside the hierarchy. */
 		for (cgroup = memcg->css.cgroup; cgroup->parent; ) {
 			cgroup = cgroup->parent;
 			memcg = mem_cgroup_from_cont(cgroup);
 			if (!memcg->use_hierarchy)
 				break;

 			cur = res_counter_read_u64(&memcg->res, RES_LIMIT);
 			if (cur < lowest_mem)
 				lowest_mem = cur;

 			cur = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
 			if (cur < lowest_memsw)
 				lowest_memsw = cur;
 		}
 	}

 	*mem_limit = lowest_mem;
 	*memsw_limit = lowest_memsw;
 }
 /*
  * cgroup trigger handler: reset the max-usage watermark or the failure
  * counter of the mem or mem+swap counter selected by @event. Other
  * attributes are silently ignored. Returns -EOPNOTSUPP for mem+swap
  * files when swap accounting is off, 0 otherwise.
  */
 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	int type = MEMFILE_TYPE(event);
 	int name = MEMFILE_ATTR(event);
 	struct res_counter *counter;

 	if (type == _MEMSWAP && !do_swap_account)
 		return -EOPNOTSUPP;

 	/* Anything that is not _MEM operates on the mem+swap counter. */
 	counter = (type == _MEM) ? &memcg->res : &memcg->memsw;

 	switch (name) {
 	case RES_MAX_USAGE:
 		res_counter_reset_max(counter);
 		break;
 	case RES_FAILCNT:
 		res_counter_reset_failcnt(counter);
 		break;
 	}

 	return 0;
 }
 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
 					struct cftype *cft)
 {
 	return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
 }
 /*
  * cgroup file handler: set move_charge_at_immigrate of @cgrp's memcg.
  *
  * With CONFIG_MMU, @val must be a bitmask below (1 << NR_MOVE_TYPE),
  * otherwise -EINVAL is returned. Without CONFIG_MMU the file is not
  * supported and every write fails with -ENOSYS.
  */
 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
 					struct cftype *cft, u64 val)
 {
 #ifdef CONFIG_MMU
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

 	if (val >= (1 << NR_MOVE_TYPE))
 		return -EINVAL;

 	/*
 	 * We check this value several times in both in can_attach() and
 	 * attach(), so we need cgroup lock to prevent this value from being
 	 * inconsistent.
 	 */
 	cgroup_lock();
 	memcg->move_charge_at_immigrate = val;
 	cgroup_unlock();

 	return 0;
 #else
 	return -ENOSYS;
 #endif
 }
 #ifdef CONFIG_NUMA
 /*
  * Append " N<nid>=<pages>" for every N_HIGH_MEMORY node, counting the
  * LRU lists selected by @lru_mask, then terminate the line.
  */
 static void memcg_numa_stat_show_nodes(struct seq_file *m,
 				       struct mem_cgroup *memcg,
 				       unsigned int lru_mask)
 {
 	int nid;

 	for_each_node_state(nid, N_HIGH_MEMORY)
 		seq_printf(m, " N%d=%lu", nid,
 			   mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask));
 	seq_putc(m, '\n');
 }

 /*
  * seq_file handler printing four lines (total, file, anon, unevictable),
  * each with the memcg-wide LRU page count followed by per-node counts.
  */
 static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
 				      struct seq_file *m)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

 	seq_printf(m, "total=%lu", mem_cgroup_nr_lru_pages(memcg, LRU_ALL));
 	memcg_numa_stat_show_nodes(m, memcg, LRU_ALL);

 	seq_printf(m, "file=%lu",
 		   mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE));
 	memcg_numa_stat_show_nodes(m, memcg, LRU_ALL_FILE);

 	seq_printf(m, "anon=%lu",
 		   mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON));
 	memcg_numa_stat_show_nodes(m, memcg, LRU_ALL_ANON);

 	seq_printf(m, "unevictable=%lu",
 		   mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)));
 	memcg_numa_stat_show_nodes(m, memcg, BIT(LRU_UNEVICTABLE));

 	return 0;
 }
 #endif /* CONFIG_NUMA */
 /*
  * Names printed for each LRU list in the stat file. Indexed by LRU list
  * number (the stat code indexes it with 0..NR_LRU_LISTS-1), so the order
  * here must follow the LRU list numbering.
  */
 static const char * const mem_cgroup_lru_names[] = {
 	"inactive_anon",
 	"active_anon",
 	"inactive_file",
 	"active_file",
 	"unevictable",
 };
 /*
  * Never meant to do anything at run time: exists only to host a
  * build-time check that mem_cgroup_lru_names has exactly one entry per
  * LRU list.
  */
 static inline void mem_cgroup_lru_names_not_uptodate(void)
 {
 	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
 }
 /*
  * seq_file handler for the memcg stat file.
  *
  * Output order: local statistics (bytes), local event counters, local
  * LRU sizes (bytes), hierarchical limits, then the same three groups
  * summed over the subtree with a "total_" prefix. With CONFIG_DEBUG_VM
  * the reclaim rotate/scan counters are appended. Always returns 0.
  */
 static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
 				 struct seq_file *m)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	struct mem_cgroup *mi;
 	unsigned int i;
 	/* Local (this memcg only) statistics, converted from pages to bytes. */
 	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
 		/* The swap statistic is hidden when swap accounting is off. */
 		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
 			continue;
 		seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
 			   mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
 	}
 	/* Local event counters (plain counts, not scaled). */
 	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
 		seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
 			   mem_cgroup_read_events(memcg, i));
 	/* Local LRU list sizes in bytes. */
 	for (i = 0; i < NR_LRU_LISTS; i++)
 		seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
 			   mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
 	/* Hierarchical information */
 	{
 		unsigned long long limit, memsw_limit;
 		memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
 		seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
 		if (do_swap_account)
 			seq_printf(m, "hierarchical_memsw_limit %llu\n",
 				   memsw_limit);
 	}
 	/* Subtree totals: same statistics summed over the memcg tree. */
 	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
 		long long val = 0;
 		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
 			continue;
 		for_each_mem_cgroup_tree(mi, memcg)
 			val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
 		seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
 	}
 	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
 		unsigned long long val = 0;
 		for_each_mem_cgroup_tree(mi, memcg)
 			val += mem_cgroup_read_events(mi, i);
 		seq_printf(m, "total_%s %llu\n",
 			   mem_cgroup_events_names[i], val);
 	}
 	for (i = 0; i < NR_LRU_LISTS; i++) {
 		unsigned long long val = 0;
 		for_each_mem_cgroup_tree(mi, memcg)
 			val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
 		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
 	}
 #ifdef CONFIG_DEBUG_VM
 	{
 		int nid, zid;
 		struct mem_cgroup_per_zone *mz;
 		struct zone_reclaim_stat *rstat;
 		/* Index 0 is reported as "anon", index 1 as "file" below. */
 		unsigned long recent_rotated[2] = {0, 0};
 		unsigned long recent_scanned[2] = {0, 0};
 		/* Sum the per-zone reclaim statistics over all online nodes. */
 		for_each_online_node(nid)
 			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 				mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 				rstat = &mz->lruvec.reclaim_stat;
 				recent_rotated[0] += rstat->recent_rotated[0];
 				recent_rotated[1] += rstat->recent_rotated[1];
 				recent_scanned[0] += rstat->recent_scanned[0];
 				recent_scanned[1] += rstat->recent_scanned[1];
 			}
 		seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
 		seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
 		seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
 		seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
 	}
 #endif
 	return 0;
 }
 /* cgroup file handler: report the swappiness of @cgrp's memcg. */
 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
 {
 	return mem_cgroup_swappiness(mem_cgroup_from_cont(cgrp));
 }
 /*
  * cgroup file handler: set the swappiness of @cgrp's memcg.
  *
  * Returns -EINVAL when @val exceeds 100, when @cgrp is the root cgroup,
  * when the parent uses hierarchy, or when this memcg uses hierarchy and
  * already has children. Returns 0 once the value is stored.
  */
 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
 				       u64 val)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 	struct mem_cgroup *parent;
 	int ret = -EINVAL;

 	if (val > 100)
 		return -EINVAL;
 	if (cgrp->parent == NULL)
 		return -EINVAL;

 	parent = mem_cgroup_from_cont(cgrp->parent);

 	cgroup_lock();
 	/* If under hierarchy, only empty-root can set this value */
 	if (!parent->use_hierarchy &&
 	    (!memcg->use_hierarchy || list_empty(&cgrp->children))) {
 		memcg->swappiness = val;
 		ret = 0;
 	}
 	cgroup_unlock();

 	return ret;
 }
 /*
  * Signal the eventfd of every usage threshold of @memcg that was
  * crossed (in either direction) since the previous call, and update
  * current_threshold accordingly. @swap selects the mem+swap thresholds
  * and usage instead of the memory-only ones.
  *
  * The threshold array is read under rcu_read_lock().
  */
 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
 {
 	struct mem_cgroup_threshold_ary *t;
 	u64 usage;
 	int i;
 	rcu_read_lock();
 	if (!swap)
 		t = rcu_dereference(memcg->thresholds.primary);
 	else
 		t = rcu_dereference(memcg->memsw_thresholds.primary);
 	/* No thresholds registered. */
 	if (!t)
 		goto unlock;
 	usage = mem_cgroup_usage(memcg, swap);
 	/*
 	 * current_threshold points to threshold just below or equal to usage.
 	 * If it's not true, a threshold was crossed after last
 	 * call of __mem_cgroup_threshold().
 	 */
 	i = t->current_threshold;
 	/*
 	 * Iterate backward over array of thresholds starting from
 	 * current_threshold and check if a threshold is crossed.
 	 * If none of thresholds below usage is crossed, we read
 	 * only one element of the array here.
 	 */
 	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
 		eventfd_signal(t->entries[i].eventfd, 1);
 	/* i = current_threshold + 1 */
 	i++;
 	/*
 	 * Iterate forward over array of thresholds starting from
 	 * current_threshold+1 and check if a threshold is crossed.
 	 * If none of thresholds above usage is crossed, we read
 	 * only one element of the array here.
 	 */
 	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
 		eventfd_signal(t->entries[i].eventfd, 1);
 	/* Update current_threshold */
 	t->current_threshold = i - 1;
 unlock:
 	rcu_read_unlock();
 }
 /*
  * Run the threshold check for @memcg and each of its ancestors, for the
  * memory usage and, when swap accounting is on, the mem+swap usage.
  */
 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
 {
 	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
 		__mem_cgroup_threshold(memcg, false);
 		if (do_swap_account)
 			__mem_cgroup_threshold(memcg, true);
 	}
 }
 /*
  * sort() comparator ordering struct mem_cgroup_threshold entries by
  * ascending ->threshold.
  *
  * Thresholds in this file are filled from u64 values, so the ordering
  * must come from explicit comparisons: returning the difference would
  * truncate it to int and yield the wrong sign (or a false "equal") for
  * thresholds that are far apart, leaving the array mis-sorted.
  *
  * Returns a negative value, zero, or a positive value when @a is less
  * than, equal to, or greater than @b.
  */
 static int compare_thresholds(const void *a, const void *b)
 {
 	const struct mem_cgroup_threshold *_a = a;
 	const struct mem_cgroup_threshold *_b = b;

 	if (_a->threshold > _b->threshold)
 		return 1;
 	if (_a->threshold < _b->threshold)
 		return -1;
 	return 0;
 }
 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup_eventfd_list *ev;
 	list_for_each_entry(ev, &memcg->oom_notify, list)
 		eventfd_signal(ev->eventfd, 1);
 	return 0;
 }
 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;
 	for_each_mem_cgroup_tree(iter, memcg)
 		mem_cgroup_oom_notify_cb(iter);
 }
 /*
  * Register @eventfd to be signalled when the usage of @cgrp's memcg
  * crosses the threshold given in @args.
  *
  * @cft selects the memory or mem+swap threshold set; @args is parsed
  * with res_counter_memparse_write_strategy().
  *
  * A new, sorted array one entry larger than the current one is built
  * and published with rcu_assign_pointer(); the previous primary array
  * is kept as the spare.
  *
  * Returns 0 on success, the parser's error for a bad @args, or -ENOMEM.
  */
 static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
 	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 	struct mem_cgroup_thresholds *thresholds;
 	struct mem_cgroup_threshold_ary *new;
 	int type = MEMFILE_TYPE(cft->private);
 	u64 threshold, usage;
 	int i, size, ret;
 	ret = res_counter_memparse_write_strategy(args, &threshold);
 	if (ret)
 		return ret;
 	mutex_lock(&memcg->thresholds_lock);
 	if (type == _MEM)
 		thresholds = &memcg->thresholds;
 	else if (type == _MEMSWAP)
 		thresholds = &memcg->memsw_thresholds;
 	else
 		BUG();
 	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
 	/* Check if a threshold crossed before adding a new one */
 	if (thresholds->primary)
 		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
 	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
 	/* Allocate memory for new array of thresholds */
 	new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
 			GFP_KERNEL);
 	if (!new) {
 		ret = -ENOMEM;
 		goto unlock;
 	}
 	new->size = size;
 	/* Copy thresholds (if any) to new array */
 	if (thresholds->primary) {
 		memcpy(new->entries, thresholds->primary->entries, (size - 1) *
 				sizeof(struct mem_cgroup_threshold));
 	}
 	/* Add new threshold */
 	new->entries[size - 1].eventfd = eventfd;
 	new->entries[size - 1].threshold = threshold;
 	/* Sort thresholds. Registering of new threshold isn't time-critical */
 	sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
 			compare_thresholds, NULL);
 	/* Find current threshold */
 	new->current_threshold = -1;
 	for (i = 0; i < size; i++) {
 		if (new->entries[i].threshold <= usage) {
 			/*
 			 * new->current_threshold will not be used until
 			 * rcu_assign_pointer(), so it's safe to increment
 			 * it here.
 			 */
 			++new->current_threshold;
 		} else
 			break;
 	}
 	/* Free old spare buffer and save old primary buffer as spare */
 	kfree(thresholds->spare);
 	thresholds->spare = thresholds->primary;
 	rcu_assign_pointer(thresholds->primary, new);
 	/* To be sure that nobody uses thresholds */
 	synchronize_rcu();
 unlock:
 	mutex_unlock(&memcg->thresholds_lock);
 	return ret;
 }
 /*
  * Remove every usage threshold of @cgrp's memcg that signals @eventfd.
  *
  * @cft selects the memory or mem+swap threshold set. The surviving
  * entries are copied into the spare array, which is then published as
  * the new primary with rcu_assign_pointer(); the old primary becomes
  * the spare.
  *
  * Two hazards are handled explicitly:
  *  - If no entry matches @eventfd there is nothing to remove. The
  *    spare array is only guaranteed to be large enough for a strictly
  *    smaller set (and may be NULL), so copying all entries into it
  *    would overrun or dereference NULL. Bail out instead.
  *  - When the last threshold goes away, the old primary array may
  *    still be in use by RCU readers (__mem_cgroup_threshold()), so it
  *    is freed only after synchronize_rcu() has returned.
  */
 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
 	struct cftype *cft, struct eventfd_ctx *eventfd)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 	struct mem_cgroup_thresholds *thresholds;
 	struct mem_cgroup_threshold_ary *new;
 	int type = MEMFILE_TYPE(cft->private);
 	u64 usage;
 	int i, j, size;

 	mutex_lock(&memcg->thresholds_lock);
 	if (type == _MEM)
 		thresholds = &memcg->thresholds;
 	else if (type == _MEMSWAP)
 		thresholds = &memcg->memsw_thresholds;
 	else
 		BUG();

 	if (!thresholds->primary)
 		goto unlock;

 	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);

 	/* Check if a threshold crossed before removing */
 	__mem_cgroup_threshold(memcg, type == _MEMSWAP);

 	/* Calculate new number of threshold */
 	size = 0;
 	for (i = 0; i < thresholds->primary->size; i++) {
 		if (thresholds->primary->entries[i].eventfd != eventfd)
 			size++;
 	}

 	/* No entry belongs to this eventfd: nothing to remove. */
 	if (size == thresholds->primary->size)
 		goto unlock;

 	new = thresholds->spare;

 	/* Set thresholds array to NULL if we don't have thresholds */
 	if (!size) {
 		kfree(new);
 		new = NULL;
 		goto swap_buffers;
 	}

 	new->size = size;

 	/* Copy thresholds and find current threshold */
 	new->current_threshold = -1;
 	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
 		if (thresholds->primary->entries[i].eventfd == eventfd)
 			continue;

 		new->entries[j] = thresholds->primary->entries[i];
 		if (new->entries[j].threshold <= usage) {
 			/*
 			 * new->current_threshold will not be used
 			 * until rcu_assign_pointer(), so it's safe to increment
 			 * it here.
 			 */
 			++new->current_threshold;
 		}
 		j++;
 	}

 swap_buffers:
 	/* Swap primary and spare array */
 	thresholds->spare = thresholds->primary;

 	rcu_assign_pointer(thresholds->primary, new);

 	/* To be sure that nobody uses thresholds */
 	synchronize_rcu();

 	/*
 	 * If all events are unregistered, free the spare array. This must
 	 * come after the grace period: until then readers may still hold
 	 * the old primary, which is what ->spare points to now.
 	 */
 	if (!new) {
 		kfree(thresholds->spare);
 		thresholds->spare = NULL;
 	}
 unlock:
 	mutex_unlock(&memcg->thresholds_lock);
 }
 static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
 	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 	struct mem_cgroup_eventfd_list *event;
 	int type = MEMFILE_TYPE(cft->private);
 	BUG_ON(type != _OOM_TYPE);
 	event = kmalloc(sizeof(*event),	GFP_KERNEL);
 	if (!event)
 		return -ENOMEM;
 	spin_lock(&memcg_oom_lock);
 	event->eventfd = eventfd;
 	list_add(&event->list, &memcg->oom_notify);
 	/* already in OOM ? */
 	if (atomic_read(&memcg->under_oom))
 		eventfd_signal(eventfd, 1);
 	spin_unlock(&memcg_oom_lock);
 	return 0;
 }
 /*
  * Drop every OOM notification entry of @cgrp that refers to @eventfd.
  * The list is walked and edited under memcg_oom_lock.
  */
 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
 	struct cftype *cft, struct eventfd_ctx *eventfd)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 	struct mem_cgroup_eventfd_list *cur, *next;
 	BUG_ON(MEMFILE_TYPE(cft->private) != _OOM_TYPE);
 	spin_lock(&memcg_oom_lock);
 	/* _safe variant: entries are unlinked and freed while iterating. */
 	list_for_each_entry_safe(cur, next, &memcg->oom_notify, list) {
 		if (cur->eventfd != eventfd)
 			continue;
 		list_del(&cur->list);
 		kfree(cur);
 	}
 	spin_unlock(&memcg_oom_lock);
 }
 /*
  * Report the OOM control state of @cgrp through @cb: the
  * "oom_kill_disable" setting and whether the group is currently
  * "under_oom" (reported as 0 or 1).  Always returns 0.
  */
 static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
 	struct cftype *cft,  struct cgroup_map_cb *cb)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 	int in_oom = atomic_read(&memcg->under_oom) ? 1 : 0;
 	cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
 	cb->fill(cb, "under_oom", in_oom);
 	return 0;
 }
 /*
  * Set the oom_kill_disable flag of @cgrp to @val.
  *
  * Returns -EINVAL for the root cgroup, for any value other than 0 or 1,
  * when the parent uses hierarchy, or when this group uses hierarchy and
  * already has children.  Clearing the flag triggers memcg_oom_recover().
  */
 static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 	struct cftype *cft, u64 val)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 	struct mem_cgroup *parent;
 	int ret = -EINVAL;
 	/* The root cgroup cannot be configured. */
 	if (!cgrp->parent)
 		return -EINVAL;
 	/* Only 0 and 1 are accepted. */
 	if (val > 1)
 		return -EINVAL;
 	parent = mem_cgroup_from_cont(cgrp->parent);
 	cgroup_lock();
 	/* oom-kill-disable is a flag for a whole subhierarchy. */
 	if (parent->use_hierarchy)
 		goto out_unlock;
 	if (memcg->use_hierarchy && !list_empty(&cgrp->children))
 		goto out_unlock;
 	memcg->oom_kill_disable = val;
 	if (val == 0)
 		memcg_oom_recover(memcg);
 	ret = 0;
 out_unlock:
 	cgroup_unlock();
 	return ret;
 }
 #ifdef CONFIG_MEMCG_KMEM
 /*
  * Kernel-memory side of memcg setup: delegates to the socket accounting
  * initialisation and returns its result.
  */
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 	return mem_cgroup_sockets_init(memcg, ss);
 }
 /* Kernel-memory side of memcg teardown: releases socket accounting state. */
 static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
 {
 	mem_cgroup_sockets_destroy(memcg);
 }
 #else
 /* Without CONFIG_MEMCG_KMEM there is nothing to set up; always succeeds. */
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 	return 0;
 }
 /* Without CONFIG_MEMCG_KMEM there is nothing to tear down. */
 static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
 {
 }
 #endif
 /*
  * Control files of the memory controller; installed through
  * mem_cgroup_subsys.base_cftypes.  Where set, .private packs the counter
  * type (_MEM, _MEMSWAP or _OOM_TYPE) together with the attribute being
  * accessed.  The array is terminated by an empty entry.
  */
 static struct cftype mem_cgroup_files[] = {
 	/* Memory usage; also accepts eventfd threshold registrations. */
 	{
 		.name = "usage_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
 		.read = mem_cgroup_read,
 		.register_event = mem_cgroup_usage_register_event,
 		.unregister_event = mem_cgroup_usage_unregister_event,
 	},
 	{
 		.name = "max_usage_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
 		.trigger = mem_cgroup_reset,
 		.read = mem_cgroup_read,
 	},
 	{
 		.name = "limit_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
 		.write_string = mem_cgroup_write,
 		.read = mem_cgroup_read,
 	},
 	{
 		.name = "soft_limit_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
 		.write_string = mem_cgroup_write,
 		.read = mem_cgroup_read,
 	},
 	{
 		.name = "failcnt",
 		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
 		.trigger = mem_cgroup_reset,
 		.read = mem_cgroup_read,
 	},
 	{
 		.name = "stat",
 		.read_seq_string = memcg_stat_show,
 	},
 	{
 		.name = "force_empty",
 		.trigger = mem_cgroup_force_empty_write,
 	},
 	{
 		.name = "use_hierarchy",
 		.write_u64 = mem_cgroup_hierarchy_write,
 		.read_u64 = mem_cgroup_hierarchy_read,
 	},
 	{
 		.name = "swappiness",
 		.read_u64 = mem_cgroup_swappiness_read,
 		.write_u64 = mem_cgroup_swappiness_write,
 	},
 	{
 		.name = "move_charge_at_immigrate",
 		.read_u64 = mem_cgroup_move_charge_read,
 		.write_u64 = mem_cgroup_move_charge_write,
 	},
 	/* OOM state and oom_kill_disable switch; also an eventfd source. */
 	{
 		.name = "oom_control",
 		.read_map = mem_cgroup_oom_control_read,
 		.write_u64 = mem_cgroup_oom_control_write,
 		.register_event = mem_cgroup_oom_register_event,
 		.unregister_event = mem_cgroup_oom_unregister_event,
 		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
 	},
 #ifdef CONFIG_NUMA
 	{
 		.name = "numa_stat",
 		.read_seq_string = memcg_numa_stat_show,
 	},
 #endif
 #ifdef CONFIG_MEMCG_SWAP
 	/* memsw.* mirror the files above for the _MEMSWAP counter. */
 	{
 		.name = "memsw.usage_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
 		.read = mem_cgroup_read,
 		.register_event = mem_cgroup_usage_register_event,
 		.unregister_event = mem_cgroup_usage_unregister_event,
 	},
 	{
 		.name = "memsw.max_usage_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
 		.trigger = mem_cgroup_reset,
 		.read = mem_cgroup_read,
 	},
 	{
 		.name = "memsw.limit_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
 		.write_string = mem_cgroup_write,
 		.read = mem_cgroup_read,
 	},
 	{
 		.name = "memsw.failcnt",
 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
 		.trigger = mem_cgroup_reset,
 		.read = mem_cgroup_read,
 	},
 #endif
 	{ },	/* terminate */
 };
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 {
 	struct mem_cgroup_per_node *pn;
 	struct mem_cgroup_per_zone *mz;
 	int zone, tmp = node;
 	/*
 	 * This routine is called against possible nodes.
 	 * But it's BUG to call kmalloc() against offline node.
 	 *
 	 * TODO: this routine can waste much memory for nodes which will
 	 *       never be onlined. It's better to use memory hotplug callback
 	 *       function.
 	 */
 	if (!node_state(node, N_NORMAL_MEMORY))
 		tmp = -1;
 	pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
 	if (!pn)
 		return 1;
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
 		lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
 		mz->usage_in_excess = 0;
 		mz->on_tree = false;
 		mz->memcg = memcg;
 	}
 	memcg->info.nodeinfo[node] = pn;
 	return 0;
 }
 /* Release the per-node bookkeeping of @memcg for @node. */
 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 {
 	struct mem_cgroup_per_node *pernode = memcg->info.nodeinfo[node];
 	kfree(pernode);
 }
 /*
  * Allocate a zeroed struct mem_cgroup together with its per-cpu
  * statistics and initialise pcp_counter_lock.
  *
  * Returns the new object, or NULL if either allocation fails.
  */
 static struct mem_cgroup *mem_cgroup_alloc(void)
 {
 	struct mem_cgroup *memcg;
 	int size = sizeof(struct mem_cgroup);
 	/*
 	 * The structure can be very big if MAX_NUMNODES is very big, so
 	 * anything that does not fit below a page comes from vzalloc().
 	 */
 	memcg = (size < PAGE_SIZE) ? kzalloc(size, GFP_KERNEL) : vzalloc(size);
 	if (memcg == NULL)
 		return NULL;
 	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
 	if (memcg->stat) {
 		spin_lock_init(&memcg->pcp_counter_lock);
 		return memcg;
 	}
 	/* Per-cpu allocation failed: undo with the matching free routine. */
 	if (size < PAGE_SIZE)
 		kfree(memcg);
 	else
 		vfree(memcg);
 	return NULL;
 }
 /*
  * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
  * but in process context.  The work_freeing structure is overlaid
  * on the rcu_freeing structure, which itself is overlaid on memsw.
  */
 /* Work handler: final release of the mem_cgroup embedding @work. */
 static void free_work(struct work_struct *work)
 {
 	struct mem_cgroup *memcg =
 		container_of(work, struct mem_cgroup, work_freeing);
 	int size = sizeof(struct mem_cgroup);
 	/*
 	 * For now the jump label destruction code has to run outside of the
 	 * cgroup lock: the static_branch update calls get_online_cpus(),
 	 * which can't be called inside the cgroup_lock. cpusets are the ones
 	 * enforcing this dependency, so if they ever change, this may too.
 	 *
 	 * Running from schedule_work() guarantees that. Be careful when
 	 * moving this code around: it must stay outside the cgroup_lock.
 	 */
 	disarm_sock_keys(memcg);
 	/* Mirror of the allocation choice made in mem_cgroup_alloc(). */
 	if (size >= PAGE_SIZE)
 		vfree(memcg);
 	else
 		kfree(memcg);
 }
 /*
  * RCU callback: hand the mem_cgroup embedding @rcu_head over to a work
  * item so that free_work() performs the actual release.
  */
 static void free_rcu(struct rcu_head *rcu_head)
 {
 	struct mem_cgroup *memcg =
 		container_of(rcu_head, struct mem_cgroup, rcu_freeing);
 	INIT_WORK(&memcg->work_freeing, free_work);
 	schedule_work(&memcg->work_freeing);
 }
 /*
  * At destroying mem_cgroup, references from swap_cgroup can remain.
  * (scanning all at force_empty is too costly...)
  *
  * Instead of clearing all references at force_empty, we remember
  * the number of reference from swap_cgroup and free mem_cgroup when
  * it goes down to 0.
  *
  * Removal of cgroup itself succeeds regardless of refs from swap.
  */
 static void __mem_cgroup_free(struct mem_cgroup *memcg)
 {
 	int node;
 	mem_cgroup_remove_from_trees(memcg);
 	free_css_id(&mem_cgroup_subsys, &memcg->css);
 	for_each_node(node)
 		free_mem_cgroup_per_zone_info(memcg, node);
 	free_percpu(memcg->stat);
 	call_rcu(&memcg->rcu_freeing, free_rcu);
 }
 /* Take one reference on @memcg; dropped again with mem_cgroup_put(). */
 static void mem_cgroup_get(struct mem_cgroup *memcg)
 {
 	atomic_inc(&memcg->refcnt);
 }
 static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
 {
 	if (atomic_sub_and_test(count, &memcg->refcnt)) {
 		struct mem_cgroup *parent = parent_mem_cgroup(memcg);
 		__mem_cgroup_free(memcg);
 		if (parent)
 			mem_cgroup_put(parent);
 	}
 }
 /* Drop a single reference on @memcg (see __mem_cgroup_put()). */
 static void mem_cgroup_put(struct mem_cgroup *memcg)
 {
 	__mem_cgroup_put(memcg, 1);
 }
 /*
  * Return the parent of @memcg in the memcgroup hierarchy, derived from
  * the parent of its res counter, or NULL when that counter has no parent
  * (i.e. hierarchy is not enabled for this group).
  */
 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 {
 	struct res_counter *parent_res = memcg->res.parent;
 	if (parent_res)
 		return mem_cgroup_from_res_counter(parent_res, res);
 	return NULL;
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 #ifdef CONFIG_MEMCG_SWAP
 /*
  * Turn swap accounting on if the memory controller is enabled and
  * really_do_swap_account is set.
  */
 static void __init enable_swap_cgroup(void)
 {
 	if (mem_cgroup_disabled())
 		return;
 	if (really_do_swap_account)
 		do_swap_account = 1;
 }
 #else
 /* Swap accounting is not built in: nothing to enable. */
 static void __init enable_swap_cgroup(void)
 {
 }
 #endif
 /*
  * Allocate the per-node soft limit trees and initialise the rb root and
  * lock of every zone in them.
  *
  * Returns 0 on success.  On allocation failure the trees set up so far
  * are freed again and 1 is returned.
  */
 static int mem_cgroup_soft_limit_tree_init(void)
 {
 	struct mem_cgroup_tree_per_node *pernode;
 	int node, zid;
 	for_each_node(node) {
 		/* Nodes without normal memory get no node preference. */
 		int alloc_nid = node_state(node, N_NORMAL_MEMORY) ? node : -1;
 		pernode = kzalloc_node(sizeof(*pernode), GFP_KERNEL, alloc_nid);
 		if (pernode == NULL)
 			goto err_cleanup;
 		soft_limit_tree.rb_tree_per_node[node] = pernode;
 		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 			struct mem_cgroup_tree_per_zone *perzone =
 				&pernode->rb_tree_per_zone[zid];
 			perzone->rb_root = RB_ROOT;
 			spin_lock_init(&perzone->lock);
 		}
 	}
 	return 0;
 err_cleanup:
 	/* Entries were filled in node order; stop at the first empty slot. */
 	for_each_node(node) {
 		pernode = soft_limit_tree.rb_tree_per_node[node];
 		if (pernode == NULL)
 			break;
 		kfree(pernode);
 		soft_limit_tree.rb_tree_per_node[node] = NULL;
 	}
 	return 1;
 }
 /*
  * cgroup "create" callback: allocate and initialise the mem_cgroup that
  * backs @cont.
  *
  * For the root cgroup (no parent) this also performs one-time setup: swap
  * accounting, the soft limit trees, root_mem_cgroup, the per-cpu stock
  * work items and the CPU hotplug notifier.  A child inherits
  * use_hierarchy, oom_kill_disable and swappiness from its parent and,
  * when the parent uses hierarchy, chains its res counters to the
  * parent's and takes a reference on it.
  *
  * Returns the embedded css on success or an ERR_PTR() on failure.
  */
 static struct cgroup_subsys_state * __ref
 mem_cgroup_create(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg, *parent;
 	long error = -ENOMEM;
 	int node;
 	memcg = mem_cgroup_alloc();
 	if (!memcg)
 		return ERR_PTR(error);
 	for_each_node(node)
 		if (alloc_mem_cgroup_per_zone_info(memcg, node))
 			goto free_out;
 	/* root ? */
 	if (cont->parent == NULL) {
 		int cpu;
 		enable_swap_cgroup();
 		parent = NULL;
 		if (mem_cgroup_soft_limit_tree_init())
 			goto free_out;
 		root_mem_cgroup = memcg;
 		for_each_possible_cpu(cpu) {
 			struct memcg_stock_pcp *stock =
 						&per_cpu(memcg_stock, cpu);
 			INIT_WORK(&stock->work, drain_local_stock);
 		}
 		hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
 		memcg->use_hierarchy = parent->use_hierarchy;
 		memcg->oom_kill_disable = parent->oom_kill_disable;
 	}
 	if (parent && parent->use_hierarchy) {
 		res_counter_init(&memcg->res, &parent->res);
 		res_counter_init(&memcg->memsw, &parent->memsw);
 		/*
 		 * We increment refcnt of the parent to ensure that we can
 		 * safely access it on res_counter_charge/uncharge.
 		 * This refcnt will be decremented when freeing this
 		 * mem_cgroup(see mem_cgroup_put).
 		 */
 		mem_cgroup_get(parent);
 	} else {
 		/* Stand-alone counters: no parent is charged along with us. */
 		res_counter_init(&memcg->res, NULL);
 		res_counter_init(&memcg->memsw, NULL);
 	}
 	memcg->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&memcg->oom_notify);
 	if (parent)
 		memcg->swappiness = mem_cgroup_swappiness(parent);
 	/* Initial reference; dropped in mem_cgroup_destroy(). */
 	atomic_set(&memcg->refcnt, 1);
 	memcg->move_charge_at_immigrate = 0;
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
 	error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
 	if (error) {
 		/*
 		 * We call put now because our (and parent's) refcnts
 		 * are already in place. mem_cgroup_put() will internally
 		 * call __mem_cgroup_free, so return directly
 		 */
 		mem_cgroup_put(memcg);
 		return ERR_PTR(error);
 	}
 	return &memcg->css;
 free_out:
 	/* Early failure: refcnt is not set up yet, so free directly. */
 	__mem_cgroup_free(memcg);
 	return ERR_PTR(error);
 }
 /*
  * cgroup "pre_destroy" callback: force the mem_cgroup behind @cont empty
  * and return the result of that attempt.
  */
 static int mem_cgroup_pre_destroy(struct cgroup *cont)
 {
 	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), false);
 }
 /*
  * cgroup "destroy" callback: tear down the kmem side of the mem_cgroup
  * behind @cont, then drop one reference on it.
  */
 static void mem_cgroup_destroy(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	kmem_cgroup_destroy(memcg);
 	mem_cgroup_put(memcg);
 }
 #ifdef CONFIG_MMU
 /* Handlers for move charge at task migration. */
 #define PRECHARGE_COUNT_AT_ONCE	256
 /*
  * Charge @count pages to mc.to in advance and record them in
  * mc.precharge.
  *
  * The root group is only counted, never charged.  For more than one page
  * a single bulk charge is tried first; if that fails the pages are
  * charged one at a time.  Returns 0 on success, -EINTR if a signal is
  * pending, or the error from __mem_cgroup_try_charge().
  */
 static int mem_cgroup_do_precharge(unsigned long count)
 {
 	struct mem_cgroup *memcg = mc.to;
 	int batch_count = PRECHARGE_COUNT_AT_ONCE;
 	int ret = 0;
 	if (mem_cgroup_is_root(memcg)) {
 		/* we don't need css_get for root */
 		mc.precharge += count;
 		return 0;
 	}
 	/* try to charge everything at once */
 	if (count > 1) {
 		struct res_counter *fail_res;
 		unsigned long bytes = PAGE_SIZE * count;
 		/*
 		 * "memcg" cannot be under rmdir() because we've already checked
 		 * by cgroup_lock_live_cgroup() that it is not removed and we
 		 * are still under the same cgroup_mutex. So we can postpone
 		 * css_get().
 		 */
 		if (!res_counter_charge(&memcg->res, bytes, &fail_res)) {
 			if (!do_swap_account ||
 			    !res_counter_charge(&memcg->memsw, bytes,
 						&fail_res)) {
 				mc.precharge += count;
 				return 0;
 			}
 			/* memsw refused: roll the res charge back */
 			res_counter_uncharge(&memcg->res, bytes);
 		}
 	}
 	/* fall back to charging one page at a time */
 	while (count--) {
 		if (signal_pending(current))
 			return -EINTR;
 		if (!batch_count--) {
 			batch_count = PRECHARGE_COUNT_AT_ONCE;
 			cond_resched();
 		}
 		ret = __mem_cgroup_try_charge(NULL,
 					GFP_KERNEL, 1, &memcg, false);
 		/* on failure mem_cgroup_clear_mc() will do uncharge later */
 		if (ret)
 			return ret;
 		mc.precharge++;
 	}
 	return ret;
 }
 /**
  * get_mctgt_type - get target type of moving charge
  * @vma: the vma the pte to be checked belongs to
  * @addr: the address corresponding to the pte to be checked
  * @ptent: the pte to be checked
  * @target: where the target page or swap entry is stored (can be NULL)
  *
  * Returns
  *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
  *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
  *     move charge. If @target is not NULL, the page is stored in target->page
  *     with an extra reference taken (callers must drop it).
  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  *     target for charge migration. If @target is not NULL, the entry is
  *     stored in target->ent.
  *
  * Called with pte lock held.
  *
  * NOTE(review): this comment documents get_mctgt_type(), which is defined
  * further down in the file; the union and enum that follow are the types
  * it uses.
  */
 /* Result slot of get_mctgt_type(): a page or a swap entry, per the type. */
 union mc_target {
 	struct page	*page;
 	swp_entry_t	ent;
 };
 /* Classification of a pte with respect to charge moving. */
 enum mc_target_type {
 	MC_TARGET_NONE = 0,
 	MC_TARGET_PAGE,
 	MC_TARGET_SWAP,
 };
 /*
  * Resolve a present pte to a page that is eligible for charge moving.
  *
  * Returns the page with an extra reference, or NULL if there is no
  * normal mapped page, moving is not enabled for its kind (anon/file), or
  * the reference could not be taken.
  */
 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
 						unsigned long addr, pte_t ptent)
 {
 	struct page *pg = vm_normal_page(vma, addr, ptent);
 	bool movable;
 	if (!pg)
 		return NULL;
 	if (!page_mapped(pg))
 		return NULL;
 	/*
 	 * Shared anon pages are not moved; for file pages the mapcount is
 	 * ignored.
 	 */
 	movable = PageAnon(pg) ? move_anon() : move_file();
 	if (!movable)
 		return NULL;
 	return get_page_unless_zero(pg) ? pg : NULL;
 }
 #ifdef CONFIG_SWAP
 /*
  * Handle a swap pte: look the entry up in the swap cache and, when swap
  * accounting is on, report the entry through @entry.
  *
  * Returns the swap cache page (with a reference) or NULL.
  */
 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
 			unsigned long addr, pte_t ptent, swp_entry_t *entry)
 {
 	struct page *cached;
 	swp_entry_t swp = pte_to_swp_entry(ptent);
 	if (!move_anon())
 		return NULL;
 	if (non_swap_entry(swp))
 		return NULL;
 	/*
 	 * lookup_swap_cache() would update some statistics counters, so
 	 * find_get_page() is called on swapper_space directly instead.
 	 */
 	cached = find_get_page(&swapper_space, swp.val);
 	if (do_swap_account)
 		entry->val = swp.val;
 	return cached;
 }
 #else
 /* Without CONFIG_SWAP a swap pte is never a move target. */
 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
 			unsigned long addr, pte_t ptent, swp_entry_t *entry)
 {
 	return NULL;
 }
 #endif
 /*
  * Handle an empty or file pte of a file-backed vma: look the page up in
  * the file's page cache.
  *
  * Returns the page (with a reference) or NULL.  With CONFIG_SWAP, when
  * the page cache holds a swap entry instead of a page, the swap cache is
  * consulted and, if swap accounting is on, the entry is reported through
  * @entry.
  */
 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 			unsigned long addr, pte_t ptent, swp_entry_t *entry)
 {
 	struct address_space *mapping;
 	struct page *found;
 	pgoff_t index;
 	/* anonymous vma, or moving file pages is not enabled */
 	if (!vma->vm_file || !move_file())
 		return NULL;
 	mapping = vma->vm_file->f_mapping;
 	/* if the pte is not none, pte_file(ptent) is true */
 	index = pte_none(ptent) ? linear_page_index(vma, addr)
 				: pte_to_pgoff(ptent);
 	/* page is moved even if it's not RSS of this task(page-faulted). */
 	found = find_get_page(mapping, index);
 #ifdef CONFIG_SWAP
 	/* shmem/tmpfs may report page out on swap: account for that too. */
 	if (radix_tree_exceptional_entry(found)) {
 		swp_entry_t swp = radix_to_swp_entry(found);
 		if (do_swap_account)
 			*entry = swp;
 		found = find_get_page(&swapper_space, swp.val);
 	}
 #endif
 	return found;
 }
 /*
  * Classify @ptent for charge moving and optionally report the target.
  *
  * Returns MC_TARGET_PAGE when the pte resolves to a page charged to
  * mc.from, MC_TARGET_SWAP when it resolves to a swap entry recorded
  * against mc.from, and MC_TARGET_NONE otherwise.  With a non-NULL
  * @target the page (reference kept for the caller) or the swap entry is
  * stored there; otherwise any page reference taken is dropped again.
  */
 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		unsigned long addr, pte_t ptent, union mc_target *target)
 {
 	enum mc_target_type type = MC_TARGET_NONE;
 	struct page *pg = NULL;
 	swp_entry_t swp = { .val = 0 };
 	if (pte_present(ptent))
 		pg = mc_handle_present_pte(vma, addr, ptent);
 	else if (is_swap_pte(ptent))
 		pg = mc_handle_swap_pte(vma, addr, ptent, &swp);
 	else if (pte_none(ptent) || pte_file(ptent))
 		pg = mc_handle_file_pte(vma, addr, ptent, &swp);
 	if (!pg && !swp.val)
 		return MC_TARGET_NONE;
 	if (pg) {
 		struct page_cgroup *pc = lookup_page_cgroup(pg);
 		/*
 		 * Only a loose check without the page_cgroup lock;
 		 * mem_cgroup_move_account() re-validates pc under the lock.
 		 */
 		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
 			type = MC_TARGET_PAGE;
 			if (target)
 				target->page = pg;
 		}
 		/* Keep the reference only when the page is handed out. */
 		if (type == MC_TARGET_NONE || !target)
 			put_page(pg);
 	}
 	if (type != MC_TARGET_NONE)
 		return type;
 	/* There is a swap entry and a page doesn't exist or isn't charged */
 	if (swp.val &&
 	    css_id(&mc.from->css) == lookup_swap_cgroup_id(swp)) {
 		type = MC_TARGET_SWAP;
 		if (target)
 			target->ent = swp;
 	}
 	return type;
 }
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /*
  * Huge-pmd counterpart of get_mctgt_type().  Swapping and file mapped
  * pages are not considered because THP does not support them for now.
  * The caller must make sure that pmd_trans_huge(pmd) is true.
  *
  * Returns MC_TARGET_PAGE if the huge page is charged to mc.from and anon
  * moving is enabled, MC_TARGET_NONE otherwise.  With a non-NULL @target a
  * reference is taken on the page and it is stored in target->page.
  */
 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
 		unsigned long addr, pmd_t pmd, union mc_target *target)
 {
 	struct page *head = pmd_page(pmd);
 	struct page_cgroup *pc;
 	VM_BUG_ON(!head || !PageHead(head));
 	if (!move_anon())
 		return MC_TARGET_NONE;
 	pc = lookup_page_cgroup(head);
 	if (!PageCgroupUsed(pc) || pc->mem_cgroup != mc.from)
 		return MC_TARGET_NONE;
 	if (target) {
 		get_page(head);
 		target->page = head;
 	}
 	return MC_TARGET_PAGE;
 }
 #else
 /* Without THP support a pmd is never a move target. */
 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
 		unsigned long addr, pmd_t pmd, union mc_target *target)
 {
 	return MC_TARGET_NONE;
 }
 #endif
 /*
  * Page walk callback: count the move targets below @pmd in [@addr, @end)
  * into mc.precharge.  A huge pmd that is a page target counts as
  * HPAGE_PMD_NR pages.  Always returns 0.
  */
 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 					unsigned long addr, unsigned long end,
 					struct mm_walk *walk)
 {
 	struct vm_area_struct *vma = walk->private;
 	spinlock_t *ptl;
 	pte_t *pte;
 	if (pmd_trans_huge_lock(pmd, vma) == 1) {
 		enum mc_target_type thp_type =
 			get_mctgt_type_thp(vma, addr, *pmd, NULL);
 		if (thp_type == MC_TARGET_PAGE)
 			mc.precharge += HPAGE_PMD_NR;
 		spin_unlock(&vma->vm_mm->page_table_lock);
 		return 0;
 	}
 	if (pmd_trans_unstable(pmd))
 		return 0;
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	while (addr != end) {
 		/* increment precharge temporarily */
 		if (get_mctgt_type(vma, addr, *pte, NULL) != MC_TARGET_NONE)
 			mc.precharge++;
 		pte++;
 		addr += PAGE_SIZE;
 	}
 	/* pte has advanced past the range; unmap via the last entry. */
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();
 	return 0;
 }
 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 {
 	unsigned long precharge;
 	struct vm_area_struct *vma;
 	down_read(&mm->mmap_sem);
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		struct mm_walk mem_cgroup_count_precharge_walk = {
 			.pmd_entry = mem_cgroup_count_precharge_pte_range,
 			.mm = mm,
 			.private = vma,
 		};
 		if (is_vm_hugetlb_page(vma))
 			continue;
 		walk_page_range(vma->vm_start, vma->vm_end,
 					&mem_cgroup_count_precharge_walk);
 	}
 	up_read(&mm->mmap_sem);
 	precharge = mc.precharge;
 	mc.precharge = 0;
 	return precharge;
 }
 /*
  * Count the move targets of @mm, record the current task as
  * mc.moving_task and precharge that many pages.  Returns the result of
  * mem_cgroup_do_precharge().
  */
 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
 {
 	unsigned long nr_targets;
 	nr_targets = mem_cgroup_count_precharge(mm);
 	VM_BUG_ON(mc.moving_task);
 	mc.moving_task = current;
 	return mem_cgroup_do_precharge(nr_targets);
 }
 /*
  * Cancels all extra charges on mc.from and mc.to, and wakes up all waiters.
  *
  * Resets mc.precharge, mc.moved_charge and mc.moved_swap to 0 after
  * settling the corresponding charges, but leaves mc.from and mc.to set.
  */
 static void __mem_cgroup_clear_mc(void)
 {
 	struct mem_cgroup *from = mc.from;
 	struct mem_cgroup *to = mc.to;
 	/* we must uncharge all the leftover precharges from mc.to */
 	if (mc.precharge) {
 		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
 		mc.precharge = 0;
 	}
 	/*
 	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
 	 * we must uncharge here.
 	 */
 	if (mc.moved_charge) {
 		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
 		mc.moved_charge = 0;
 	}
 	/* we must fixup refcnts and charges */
 	if (mc.moved_swap) {
 		/* uncharge swap account from the old cgroup */
 		if (!mem_cgroup_is_root(mc.from))
 			res_counter_uncharge(&mc.from->memsw,
 						PAGE_SIZE * mc.moved_swap);
 		/* one reference per moved swap entry is dropped from mc.from */
 		__mem_cgroup_put(mc.from, mc.moved_swap);
 		if (!mem_cgroup_is_root(mc.to)) {
 			/*
 			 * we charged both to->res and to->memsw, so we should
 			 * uncharge to->res.
 			 */
 			res_counter_uncharge(&mc.to->res,
 						PAGE_SIZE * mc.moved_swap);
 		}
 		/* we've already done mem_cgroup_get(mc.to) */
 		mc.moved_swap = 0;
 	}
 	memcg_oom_recover(from);
 	memcg_oom_recover(to);
 	wake_up_all(&mc.waitq);
 }
 /*
  * Finish a charge-moving session: cancel leftover charges, wake waiters,
  * clear mc.from/mc.to under mc.lock and end the move on the source group.
  */
 static void mem_cgroup_clear_mc(void)
 {
 	/* mc.from is cleared below, so remember the source group first. */
 	struct mem_cgroup *src = mc.from;
 	/*
 	 * moving_task has to be cleared before the waiters are woken up at
 	 * the end of task migration.
 	 */
 	mc.moving_task = NULL;
 	__mem_cgroup_clear_mc();
 	spin_lock(&mc.lock);
 	mc.from = NULL;
 	mc.to = NULL;
 	spin_unlock(&mc.lock);
 	mem_cgroup_end_move(src);
 }
 static int mem_cgroup_can_attach(struct cgroup *cgroup,
 				 struct cgroup_taskset *tset)
 {
 	struct task_struct *p = cgroup_taskset_first(tset);
 	int ret = 0;
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
 	if (memcg->move_charge_at_immigrate) {
 		struct mm_struct *mm;
 		struct mem_cgroup *from = mem_cgroup_from_task(p);
 		VM_BUG_ON(from == memcg);
 		mm = get_task_mm(p);
 		if (!mm)
 			return 0;
 		/* We move charges only when we move a owner of the mm */
 		if (mm->owner == p) {
 			VM_BUG_ON(mc.from);
 			VM_BUG_ON(mc.to);
 			VM_BUG_ON(mc.precharge);
 			VM_BUG_ON(mc.moved_charge);
 			VM_BUG_ON(mc.moved_swap);
 			mem_cgroup_start_move(from);
 			spin_lock(&mc.lock);
 			mc.from = from;
 			mc.to = memcg;
 			spin_unlock(&mc.lock);
 			/* We set mc.moving_task later */
 			ret = mem_cgroup_precharge_mc(mm);
 			if (ret)
 				mem_cgroup_clear_mc();
 		}
 		mmput(mm);
 	}
 	return ret;
 }
 /*
  * cgroup "cancel_attach" callback: tear down the charge-moving session
  * state via mem_cgroup_clear_mc().
  */
 static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
 				     struct cgroup_taskset *tset)
 {
 	mem_cgroup_clear_mc();
 }
 /*
  * Page walk callback: move the charges of the pages and swap entries
  * mapped below @pmd in [@addr, @end) from mc.from to mc.to.
  *
  * Each successful move consumes precharge (mc.precharge) and is recorded
  * in mc.moved_charge or mc.moved_swap.  When the precharge runs out in
  * the middle of a pte range, one more page is precharged and the walk
  * resumes where it stopped.  Returns 0, or the error from that extra
  * precharge.
  */
 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 				unsigned long addr, unsigned long end,
 				struct mm_walk *walk)
 {
 	int ret = 0;
 	struct vm_area_struct *vma = walk->private;
 	pte_t *pte;
 	spinlock_t *ptl;
 	enum mc_target_type target_type;
 	union mc_target target;
 	struct page *page;
 	struct page_cgroup *pc;
 	/*
 	 * We don't take compound_lock() here but no race with splitting thp
 	 * happens because:
 	 *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
 	 *    under splitting, which means there's no concurrent thp split,
 	 *  - if another thread runs into split_huge_page() just after we
 	 *    entered this if-block, the thread must wait for page table lock
 	 *    to be unlocked in __split_huge_page_splitting(), where the main
 	 *    part of thp split is not executed yet.
 	 */
 	if (pmd_trans_huge_lock(pmd, vma) == 1) {
 		/* not enough precharge left to cover a whole huge page */
 		if (mc.precharge < HPAGE_PMD_NR) {
 			spin_unlock(&vma->vm_mm->page_table_lock);
 			return 0;
 		}
 		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
 		if (target_type == MC_TARGET_PAGE) {
 			page = target.page;
 			if (!isolate_lru_page(page)) {
 				pc = lookup_page_cgroup(page);
 				if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
 							pc, mc.from, mc.to)) {
 					mc.precharge -= HPAGE_PMD_NR;
 					mc.moved_charge += HPAGE_PMD_NR;
 				}
 				putback_lru_page(page);
 			}
 			/* drop the reference taken by get_mctgt_type_thp() */
 			put_page(page);
 		}
 		spin_unlock(&vma->vm_mm->page_table_lock);
 		return 0;
 	}
 	if (pmd_trans_unstable(pmd))
 		return 0;
 retry:
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; addr += PAGE_SIZE) {
 		pte_t ptent = *(pte++);
 		swp_entry_t ent;
 		/* out of precharge: leave the loop with addr != end */
 		if (!mc.precharge)
 			break;
 		switch (get_mctgt_type(vma, addr, ptent, &target)) {
 		case MC_TARGET_PAGE:
 			page = target.page;
 			if (isolate_lru_page(page))
 				goto put;
 			pc = lookup_page_cgroup(page);
 			if (!mem_cgroup_move_account(page, 1, pc,
 						     mc.from, mc.to)) {
 				mc.precharge--;
 				/* we uncharge from mc.from later. */
 				mc.moved_charge++;
 			}
 			putback_lru_page(page);
 put:			/* get_mctgt_type() gets the page */
 			put_page(page);
 			break;
 		case MC_TARGET_SWAP:
 			ent = target.ent;
 			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
 				mc.precharge--;
 				/* we fixup refcnts and charges later. */
 				mc.moved_swap++;
 			}
 			break;
 		default:
 			break;
 		}
 	}
 	/* pte was post-incremented, so pte - 1 is the last entry read */
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();
 	if (addr != end) {
 		/*
 		 * We have consumed all precharges we got in can_attach().
 		 * We try charge one by one, but don't do any additional
 		 * charges to mc.to if we have failed in charge once in attach()
 		 * phase.
 		 */
 		ret = mem_cgroup_do_precharge(1);
 		if (!ret)
 			goto retry;
 	}
 	return ret;
 }
 static void mem_cgroup_move_charge(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
 	lru_add_drain_all();
 retry:
 	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
 		/*
 		 * Someone who are holding the mmap_sem might be waiting in
 		 * waitq. So we cancel all extra charges, wake up all waiters,
 		 * and retry. Because we cancel precharges, we might not be able
 		 * to move enough charges, but moving charge is a best-effort
 		 * feature anyway, so it wouldn't be a big problem.
 		 */
 		__mem_cgroup_clear_mc();
 		cond_resched();
 		goto retry;
 	}
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		int ret;
 		struct mm_walk mem_cgroup_move_charge_walk = {
 			.pmd_entry = mem_cgroup_move_charge_pte_range,
 			.mm = mm,
 			.private = vma,
 		};
 		if (is_vm_hugetlb_page(vma))
 			continue;
 		ret = walk_page_range(vma->vm_start, vma->vm_end,
 						&mem_cgroup_move_charge_walk);
 		if (ret)
 			/*
 			 * means we have consumed all precharges and failed in
 			 * doing additional charge. Just abandon here.
 			 */
 			break;
 	}
 	up_read(&mm->mmap_sem);
 }
 static void mem_cgroup_move_task(struct cgroup *cont,
 				 struct cgroup_taskset *tset)
 {
 	struct task_struct *p = cgroup_taskset_first(tset);
 	struct mm_struct *mm = get_task_mm(p);
 	if (mm) {
 		if (mc.to)
 			mem_cgroup_move_charge(mm);
 		mmput(mm);
 	}
 	if (mc.to)
 		mem_cgroup_clear_mc();
 }
 #else	/* !CONFIG_MMU */
 /* !CONFIG_MMU stub: charges are never moved, so attaching always succeeds. */
 static int mem_cgroup_can_attach(struct cgroup *cgroup,
 				 struct cgroup_taskset *tset)
 {
 	return 0;
 }
 /* !CONFIG_MMU stub: there is no move state to cancel. */
 static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
 				     struct cgroup_taskset *tset)
 {
 }
 /* !CONFIG_MMU stub: no charges are moved on attach. */
 static void mem_cgroup_move_task(struct cgroup *cont,
 				 struct cgroup_taskset *tset)
 {
 }
 #endif
 /*
  * The "memory" cgroup subsystem descriptor: lifecycle callbacks
  * (create/pre_destroy/destroy), task migration callbacks
  * (can_attach/cancel_attach/attach) and the base control files.
  */
 struct cgroup_subsys mem_cgroup_subsys = {
 	.name = "memory",
 	.subsys_id = mem_cgroup_subsys_id,
 	.create = mem_cgroup_create,
 	.pre_destroy = mem_cgroup_pre_destroy,
 	.destroy = mem_cgroup_destroy,
 	.can_attach = mem_cgroup_can_attach,
 	.cancel_attach = mem_cgroup_cancel_attach,
 	.attach = mem_cgroup_move_task,
 	.base_cftypes = mem_cgroup_files,
 	.early_init = 0,
 	.use_id = 1,
 	.__DEPRECATED_clear_css_refs = true,
 };
 #ifdef CONFIG_MEMCG_SWAP
 static int __init enable_swap_account(char *s)
 {
 	/* consider enabled if no parameter or 1 is given */
 	if (!strcmp(s, "1"))
 		really_do_swap_account = 1;