Doug / smarc-fsl-linux-kernel

1

/* memcontrol.c - Memory Controller

1

/* memcontrol.c - Memory Controller

2

*

2

*

3

* Copyright IBM Corporation, 2007

3

* Copyright IBM Corporation, 2007

4

* Author Balbir Singh <balbir@linux.vnet.ibm.com>

4

* Author Balbir Singh <balbir@linux.vnet.ibm.com>

5

*

5

*

6

7

* Author: Pavel Emelianov <xemul@openvz.org>

7

* Author: Pavel Emelianov <xemul@openvz.org>

8

*

8

*

9

* Memory thresholds

9

* Memory thresholds

10

11

* Author: Kirill A. Shutemov

11

* Author: Kirill A. Shutemov

12

*

12

*

13

* This program is free software; you can redistribute it and/or modify

13

* This program is free software; you can redistribute it and/or modify

14

* it under the terms of the GNU General Public License as published by

14

* it under the terms of the GNU General Public License as published by

15

* the Free Software Foundation; either version 2 of the License, or

15

* the Free Software Foundation; either version 2 of the License, or

16

* (at your option) any later version.

16

* (at your option) any later version.

17

*

17

*

18

* This program is distributed in the hope that it will be useful,

18

* This program is distributed in the hope that it will be useful,

19

* but WITHOUT ANY WARRANTY; without even the implied warranty of

19

* but WITHOUT ANY WARRANTY; without even the implied warranty of

20

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

20

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

21

* GNU General Public License for more details.

21

* GNU General Public License for more details.

22

*/

22

*/

23

24

#include <linux/res_counter.h>

24

#include <linux/res_counter.h>

25

#include <linux/memcontrol.h>

25

#include <linux/memcontrol.h>

26

#include <linux/cgroup.h>

26

#include <linux/cgroup.h>

27

#include <linux/mm.h>

27

#include <linux/mm.h>

28

#include <linux/hugetlb.h>

28

#include <linux/hugetlb.h>

29

#include <linux/pagemap.h>

29

#include <linux/pagemap.h>

30

#include <linux/smp.h>

30

#include <linux/smp.h>

31

#include <linux/page-flags.h>

31

#include <linux/page-flags.h>

32

#include <linux/backing-dev.h>

32

#include <linux/backing-dev.h>

33

#include <linux/bit_spinlock.h>

33

#include <linux/bit_spinlock.h>

34

#include <linux/rcupdate.h>

34

#include <linux/rcupdate.h>

35

#include <linux/limits.h>

35

#include <linux/limits.h>

36

#include <linux/export.h>

36

#include <linux/export.h>

37

#include <linux/mutex.h>

37

#include <linux/mutex.h>

38

#include <linux/rbtree.h>

38

#include <linux/rbtree.h>

39

#include <linux/slab.h>

39

#include <linux/slab.h>

40

#include <linux/swap.h>

40

#include <linux/swap.h>

41

#include <linux/swapops.h>

41

#include <linux/swapops.h>

42

#include <linux/spinlock.h>

42

#include <linux/spinlock.h>

43

#include <linux/eventfd.h>

43

#include <linux/eventfd.h>

44

#include <linux/sort.h>

44

#include <linux/sort.h>

45

#include <linux/fs.h>

45

#include <linux/fs.h>

46

#include <linux/seq_file.h>

46

#include <linux/seq_file.h>

47

#include <linux/vmalloc.h>

47

#include <linux/vmalloc.h>

48

#include <linux/mm_inline.h>

48

#include <linux/mm_inline.h>

49

#include <linux/page_cgroup.h>

49

#include <linux/page_cgroup.h>

50

#include <linux/cpu.h>

50

#include <linux/cpu.h>

51

#include <linux/oom.h>

51

#include <linux/oom.h>

52

#include "internal.h"

52

#include "internal.h"

53

#include <net/sock.h>

53

#include <net/sock.h>

54

#include <net/tcp_memcontrol.h>

54

#include <net/tcp_memcontrol.h>

55

56

#include <asm/uaccess.h>

56

#include <asm/uaccess.h>

57

58

#include <trace/events/vmscan.h>

58

#include <trace/events/vmscan.h>

59

60

struct cgroup_subsys mem_cgroup_subsys __read_mostly;

60

struct cgroup_subsys mem_cgroup_subsys __read_mostly;

61

#define MEM_CGROUP_RECLAIM_RETRIES 5

61

#define MEM_CGROUP_RECLAIM_RETRIES 5

62

static struct mem_cgroup *root_mem_cgroup __read_mostly;

62

static struct mem_cgroup *root_mem_cgroup __read_mostly;

63

64

#ifdef CONFIG_MEMCG_SWAP

64

#ifdef CONFIG_MEMCG_SWAP

65

/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */

65

/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */

66

int do_swap_account __read_mostly;

66

int do_swap_account __read_mostly;

67

68

/* remembers the boot option */

68

/* remembers the boot option */

69

#ifdef CONFIG_MEMCG_SWAP_ENABLED

69

#ifdef CONFIG_MEMCG_SWAP_ENABLED

70

static int really_do_swap_account __initdata = 1;

70

static int really_do_swap_account __initdata = 1;

71

#else

71

#else

72

static int really_do_swap_account __initdata = 0;

72

static int really_do_swap_account __initdata = 0;

73

#endif

73

#endif

74

75

#else

75

#else

76

#define do_swap_account 0

76

#define do_swap_account 0

77

#endif

77

#endif

78

79

80

/*

80

/*

81

* Statistics for memory cgroup.

81

* Statistics for memory cgroup.

82

*/

82

*/

83

/*
 * Identifiers for the per-cgroup statistics counters.
 * MEM_CGROUP_STAT_NSTATS is the number of counters and must stay last.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE = 0,	 /* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS = 1,	 /* # of pages charged as anon rss */
	MEM_CGROUP_STAT_FILE_MAPPED = 2, /* # of pages charged as file rss */
	MEM_CGROUP_STAT_SWAP = 3,	 /* # of pages, swapped out */
	MEM_CGROUP_STAT_NSTATS,
};

93

94

/*
 * Printable names of the statistics counters, listed in the same order
 * as the entries of enum mem_cgroup_stat_index.
 */
static const char * const mem_cgroup_stat_names[] = {
	"cache",	/* MEM_CGROUP_STAT_CACHE */
	"rss",		/* MEM_CGROUP_STAT_RSS */
	"mapped_file",	/* MEM_CGROUP_STAT_FILE_MAPPED */
	"swap",		/* MEM_CGROUP_STAT_SWAP */
};

100

101

/*
 * Identifiers for the per-cgroup event counters.
 * MEM_CGROUP_EVENTS_NSTATS is the number of counters and must stay last.
 */
enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN = 0,	  /* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT = 1,	  /* # of pages paged out */
	MEM_CGROUP_EVENTS_PGFAULT = 2,	  /* # of page-faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT = 3, /* # of major page-faults */
	MEM_CGROUP_EVENTS_NSTATS,
};

108

109

/*
 * Printable names of the event counters, listed in the same order as
 * the entries of enum mem_cgroup_events_index.
 */
static const char * const mem_cgroup_events_names[] = {
	"pgpgin",	/* MEM_CGROUP_EVENTS_PGPGIN */
	"pgpgout",	/* MEM_CGROUP_EVENTS_PGPGOUT */
	"pgfault",	/* MEM_CGROUP_EVENTS_PGFAULT */
	"pgmajfault",	/* MEM_CGROUP_EVENTS_PGMAJFAULT */
};

115

116

/*

116

/*

117

* Per memcg event counter is incremented at every pagein/pageout. With THP,

117

* Per memcg event counter is incremented at every pagein/pageout. With THP,

118

* it will be incremented by the number of pages. This counter is used for

118

* it will be incremented by the number of pages. This counter is used for

119

* triggering some periodic events. This is straightforward and better

119

* triggering some periodic events. This is straightforward and better

120

* than using jiffies etc. to handle periodic memcg event.

120

* than using jiffies etc. to handle periodic memcg event.

121

*/

121

*/

122

/*
 * Kinds of periodic event targets tracked per memcg.
 * MEM_CGROUP_NTARGETS is the number of targets and must stay last.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH = 0,
	MEM_CGROUP_TARGET_SOFTLIMIT = 1,
	MEM_CGROUP_TARGET_NUMAINFO = 2,
	MEM_CGROUP_NTARGETS,
};

128

#define THRESHOLDS_EVENTS_TARGET 128

128

#define THRESHOLDS_EVENTS_TARGET 128

129

#define SOFTLIMIT_EVENTS_TARGET 1024

129

#define SOFTLIMIT_EVENTS_TARGET 1024

130

#define NUMAINFO_EVENTS_TARGET 1024

130

#define NUMAINFO_EVENTS_TARGET 1024

131

132

struct mem_cgroup_stat_cpu {

132

struct mem_cgroup_stat_cpu {

133

long count[MEM_CGROUP_STAT_NSTATS];

133

long count[MEM_CGROUP_STAT_NSTATS];

134

unsigned long events[MEM_CGROUP_EVENTS_NSTATS];

134

unsigned long events[MEM_CGROUP_EVENTS_NSTATS];

135

unsigned long nr_page_events;

135

unsigned long nr_page_events;

136

unsigned long targets[MEM_CGROUP_NTARGETS];

136

unsigned long targets[MEM_CGROUP_NTARGETS];

137

};

137

};

138

139

/* Iterator state for reclaim; kept per priority in mem_cgroup_per_zone. */
struct mem_cgroup_reclaim_iter {
	/* css_id of the hierarchy member that was scanned last */
	int position;
	/* scan generation; bumped on every round-trip */
	unsigned int generation;
};

145

146

/*

146

/*

147

* per-zone information in memory controller.

147

* per-zone information in memory controller.

148

*/

148

*/

149

struct mem_cgroup_per_zone {

149

struct mem_cgroup_per_zone {

150

struct lruvec lruvec;

150

struct lruvec lruvec;

151

unsigned long lru_size[NR_LRU_LISTS];

151

unsigned long lru_size[NR_LRU_LISTS];

152

153

struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

153

struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

154

155

struct rb_node tree_node; /* RB tree node */

155

struct rb_node tree_node; /* RB tree node */

156

unsigned long long usage_in_excess;/* Set to the value by which */

156

unsigned long long usage_in_excess;/* Set to the value by which */

157

/* the soft limit is exceeded*/

157

/* the soft limit is exceeded*/

158

bool on_tree;

158

bool on_tree;

159

struct mem_cgroup *memcg; /* Back pointer, we cannot */

159

struct mem_cgroup *memcg; /* Back pointer, we cannot */

160

/* use container_of */

160

/* use container_of */

161

};

161

};

162

163

struct mem_cgroup_per_node {

163

struct mem_cgroup_per_node {

164

struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];

164

struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];

165

};

165

};

166

167

struct mem_cgroup_lru_info {

167

struct mem_cgroup_lru_info {

168

struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];

168

struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];

169

};

169

};

170

171

/*

171

/*

172

* Cgroups above their limits are maintained in a RB-Tree, independent of

172

* Cgroups above their limits are maintained in a RB-Tree, independent of

173

* their hierarchy representation

173

* their hierarchy representation

174

*/

174

*/

175

176

struct mem_cgroup_tree_per_zone {

176

struct mem_cgroup_tree_per_zone {

177

struct rb_root rb_root;

177

struct rb_root rb_root;

178

spinlock_t lock;

178

spinlock_t lock;

179

};

179

};

180

181

struct mem_cgroup_tree_per_node {

181

struct mem_cgroup_tree_per_node {

182

struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];

182

struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];

183

};

183

};

184

185

struct mem_cgroup_tree {

185

struct mem_cgroup_tree {

186

struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];

186

struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];

187

};

187

};

188

189

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

189

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

190

191

struct mem_cgroup_threshold {

191

struct mem_cgroup_threshold {

192

struct eventfd_ctx *eventfd;

192

struct eventfd_ctx *eventfd;

193

u64 threshold;

193

u64 threshold;

194

};

194

};

195

196

/* For threshold */

196

/* For threshold */

197

struct mem_cgroup_threshold_ary {

197

struct mem_cgroup_threshold_ary {

198

/* An array index points to threshold just below or equal to usage. */

198

/* An array index points to threshold just below or equal to usage. */

199

int current_threshold;

199

int current_threshold;

200

/* Size of entries[] */

200

/* Size of entries[] */

201

unsigned int size;

201

unsigned int size;

202

/* Array of thresholds */

202

/* Array of thresholds */

203

struct mem_cgroup_threshold entries[0];

203

struct mem_cgroup_threshold entries[0];

204

};

204

};

205

206

/* A primary threshold array plus a pre-allocated spare. */
struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare threshold array.
	 * Needed so that mem_cgroup_unregister_event() "never fails".
	 * It must have room for at least primary->size - 1 entries.
	 */
	struct mem_cgroup_threshold_ary *spare;
};

216

217

/* for OOM */

217

/* for OOM */

218

struct mem_cgroup_eventfd_list {

218

struct mem_cgroup_eventfd_list {

219

struct list_head list;

219

struct list_head list;

220

struct eventfd_ctx *eventfd;

220

struct eventfd_ctx *eventfd;

221

};

221

};

222

223

static void mem_cgroup_threshold(struct mem_cgroup *memcg);

223

static void mem_cgroup_threshold(struct mem_cgroup *memcg);

224

static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

224

static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

225

226

/*

226

/*

227

* The memory controller data structure. The memory controller controls both

227

* The memory controller data structure. The memory controller controls both

228

* page cache and RSS per cgroup. We would eventually like to provide

228

* page cache and RSS per cgroup. We would eventually like to provide

229

* statistics based on the statistics developed by Rik Van Riel for clock-pro,

229

* statistics based on the statistics developed by Rik Van Riel for clock-pro,

230

* to help the administrator determine what knobs to tune.

230

* to help the administrator determine what knobs to tune.

231

*

231

*

232

* TODO: Add a water mark for the memory controller. Reclaim will begin when

232

* TODO: Add a water mark for the memory controller. Reclaim will begin when

233

* we hit the water mark. May be even add a low water mark, such that

233

* we hit the water mark. May be even add a low water mark, such that

234

* no reclaim occurs from a cgroup at its low water mark, this is

234

* no reclaim occurs from a cgroup at its low water mark, this is

235

* a feature that will be implemented much later in the future.

235

* a feature that will be implemented much later in the future.

236

*/

236

*/

237

struct mem_cgroup {

237

struct mem_cgroup {

238

struct cgroup_subsys_state css;

238

struct cgroup_subsys_state css;

239

/*

239

/*

240

* the counter to account for memory usage

240

* the counter to account for memory usage

241

*/

241

*/

242

struct res_counter res;

242

struct res_counter res;

243

244

union {

244

union {

245

/*

245

/*

246

* the counter to account for mem+swap usage.

246

* the counter to account for mem+swap usage.

247

*/

247

*/

248

struct res_counter memsw;

248

struct res_counter memsw;

249

250

/*

250

/*

251

* rcu_freeing is used only when freeing struct mem_cgroup,

251

* rcu_freeing is used only when freeing struct mem_cgroup,

252

* so put it into a union to avoid wasting more memory.

252

* so put it into a union to avoid wasting more memory.

253

* It must be disjoint from the css field. It could be

253

* It must be disjoint from the css field. It could be

254

* in a union with the res field, but res plays a much

254

* in a union with the res field, but res plays a much

255

* larger part in mem_cgroup life than memsw, and might

255

* larger part in mem_cgroup life than memsw, and might

256

* be of interest, even at time of free, when debugging.

256

* be of interest, even at time of free, when debugging.

257

* So share rcu_head with the less interesting memsw.

257

* So share rcu_head with the less interesting memsw.

258

*/

258

*/

259

struct rcu_head rcu_freeing;

259

struct rcu_head rcu_freeing;

260

/*

260

/*

261

* We also need some space for a worker in deferred freeing.

261

* We also need some space for a worker in deferred freeing.

262

* By the time we call it, rcu_freeing is no longer in use.

262

* By the time we call it, rcu_freeing is no longer in use.

263

*/

263

*/

264

struct work_struct work_freeing;

264

struct work_struct work_freeing;

265

};

265

};

266

267

/*

267

/*

268

* Per cgroup active and inactive list, similar to the

268

* Per cgroup active and inactive list, similar to the

269

* per zone LRU lists.

269

* per zone LRU lists.

270

*/

270

*/

271

struct mem_cgroup_lru_info info;

271

struct mem_cgroup_lru_info info;

272

int last_scanned_node;

272

int last_scanned_node;

273

#if MAX_NUMNODES > 1

273

#if MAX_NUMNODES > 1

274

nodemask_t scan_nodes;

274

nodemask_t scan_nodes;

275

atomic_t numainfo_events;

275

atomic_t numainfo_events;

276

atomic_t numainfo_updating;

276

atomic_t numainfo_updating;

277

#endif

277

#endif

278

/*

278

/*

279

* Should the accounting and control be hierarchical, per subtree?

279

* Should the accounting and control be hierarchical, per subtree?

280

*/

280

*/

281

bool use_hierarchy;

281

bool use_hierarchy;

282

283

bool oom_lock;

283

bool oom_lock;

284

atomic_t under_oom;

284

atomic_t under_oom;

285

286

atomic_t refcnt;

286

atomic_t refcnt;

287

288

int swappiness;

288

int swappiness;

289

/* OOM-Killer disable */

289

/* OOM-Killer disable */

290

int oom_kill_disable;

290

int oom_kill_disable;

291

292

/* set when res.limit == memsw.limit */

292

/* set when res.limit == memsw.limit */

293

bool memsw_is_minimum;

293

bool memsw_is_minimum;

294

295

/* protect arrays of thresholds */

295

/* protect arrays of thresholds */

296

struct mutex thresholds_lock;

296

struct mutex thresholds_lock;

297

298

/* thresholds for memory usage. RCU-protected */

298

/* thresholds for memory usage. RCU-protected */

299

struct mem_cgroup_thresholds thresholds;

299

struct mem_cgroup_thresholds thresholds;

300

301

/* thresholds for mem+swap usage. RCU-protected */

301

/* thresholds for mem+swap usage. RCU-protected */

302

struct mem_cgroup_thresholds memsw_thresholds;

302

struct mem_cgroup_thresholds memsw_thresholds;

303

304

/* For oom notifier event fd */

304

/* For oom notifier event fd */

305

struct list_head oom_notify;

305

struct list_head oom_notify;

306

307

/*

307

/*

308

* Should we move charges of a task when a task is moved into this

308

* Should we move charges of a task when a task is moved into this

309

* mem_cgroup ? And what type of charges should we move ?

309

* mem_cgroup ? And what type of charges should we move ?

310

*/

310

*/

311

unsigned long move_charge_at_immigrate;

311

unsigned long move_charge_at_immigrate;

312

/*

312

/*

313

* set > 0 if pages under this cgroup are moving to other cgroup.

313

* set > 0 if pages under this cgroup are moving to other cgroup.

314

*/

314

*/

315

atomic_t moving_account;

315

atomic_t moving_account;

316

/* taken only while moving_account > 0 */

316

/* taken only while moving_account > 0 */

317

spinlock_t move_lock;

317

spinlock_t move_lock;

318

/*

318

/*

319

* percpu counter.

319

* percpu counter.

320

*/

320

*/

321

struct mem_cgroup_stat_cpu __percpu *stat;

321

struct mem_cgroup_stat_cpu __percpu *stat;

322

/*

322

/*

323

* used when a cpu is offlined or other synchronizations

323

* used when a cpu is offlined or other synchronizations

324

* See mem_cgroup_read_stat().

324

* See mem_cgroup_read_stat().

325

*/

325

*/

326

struct mem_cgroup_stat_cpu nocpu_base;

326

struct mem_cgroup_stat_cpu nocpu_base;

327

spinlock_t pcp_counter_lock;

327

spinlock_t pcp_counter_lock;

328

329

#ifdef CONFIG_INET

329

#ifdef CONFIG_INET

330

struct tcp_memcontrol tcp_mem;

330

struct tcp_memcontrol tcp_mem;

331

#endif

331

#endif

332

};

332

};

333

334

/* Stuff for moving charges at task migration. */

334

/* Stuff for moving charges at task migration. */

335

/*

335

/*

336

* Types of charges to be moved. "move_charge_at_immigrate" is treated as a

336

* Types of charges to be moved. "move_charge_at_immigrate" is treated as a

337

* left-shifted bitmap of these types.

337

* left-shifted bitmap of these types.

338

*/

338

*/

339

/*
 * Bit numbers tested in move_charge_at_immigrate.
 * NR_MOVE_TYPE is the number of types and must stay last.
 */
enum move_type {
	MOVE_CHARGE_TYPE_ANON = 0, /* private anonymous page and swap of it */
	MOVE_CHARGE_TYPE_FILE = 1, /* file page(including tmpfs) and swap of it */
	NR_MOVE_TYPE,
};

344

345

/* "mc" and its members are protected by cgroup_mutex */

345

/* "mc" and its members are protected by cgroup_mutex */

346

static struct move_charge_struct {

346

static struct move_charge_struct {

347

spinlock_t lock; /* for from, to */

347

spinlock_t lock; /* for from, to */

348

struct mem_cgroup *from;

348

struct mem_cgroup *from;

349

struct mem_cgroup *to;

349

struct mem_cgroup *to;

350

unsigned long precharge;

350

unsigned long precharge;

351

unsigned long moved_charge;

351

unsigned long moved_charge;

352

unsigned long moved_swap;

352

unsigned long moved_swap;

353

struct task_struct *moving_task; /* a task moving charges */

353

struct task_struct *moving_task; /* a task moving charges */

354

wait_queue_head_t waitq; /* a waitq for other context */

354

wait_queue_head_t waitq; /* a waitq for other context */

355

} mc = {

355

} mc = {

356

.lock = __SPIN_LOCK_UNLOCKED(mc.lock),

356

.lock = __SPIN_LOCK_UNLOCKED(mc.lock),

357

.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),

357

.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),

358

};

358

};

359

360

static bool move_anon(void)

360

static bool move_anon(void)

361

{

361

{

362

return test_bit(MOVE_CHARGE_TYPE_ANON,

362

return test_bit(MOVE_CHARGE_TYPE_ANON,

363

&mc.to->move_charge_at_immigrate);

363

&mc.to->move_charge_at_immigrate);

364

}

364

}

365

366

static bool move_file(void)

366

static bool move_file(void)

367

{

367

{

368

return test_bit(MOVE_CHARGE_TYPE_FILE,

368

return test_bit(MOVE_CHARGE_TYPE_FILE,

369

&mc.to->move_charge_at_immigrate);

369

&mc.to->move_charge_at_immigrate);

370

}

370

}

371

372

/*

372

/*

373

* Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft

373

* Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft

374

* limit reclaim to prevent infinite loops, if they ever occur.

374

* limit reclaim to prevent infinite loops, if they ever occur.

375

*/

375

*/

376

#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100

376

#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100

377

#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2

377

#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2

378

379

/*
 * Charge types.
 * NR_CHARGE_TYPE is the number of types and must stay last.
 */
enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON = 1,
	MEM_CGROUP_CHARGE_TYPE_SHMEM = 2,	/* used by page migration of shmem */
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT = 3,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP = 4,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

387

388

/* for encoding cft->private value on file */

388

/* for encoding cft->private value on file */

389

#define _MEM (0)

389

#define _MEM (0)

390

#define _MEMSWAP (1)

390

#define _MEMSWAP (1)

391

#define _OOM_TYPE (2)

391

#define _OOM_TYPE (2)

392

#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))

392

#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))

393

#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)

393

#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)

394

#define MEMFILE_ATTR(val) ((val) & 0xffff)

394

#define MEMFILE_ATTR(val) ((val) & 0xffff)

395

/* Used for OOM notifier */

395

/* Used for OOM notifier */

396

#define OOM_CONTROL (0)

396

#define OOM_CONTROL (0)

397

398

/*

398

/*

399

* Reclaim flags for mem_cgroup_hierarchical_reclaim

399

* Reclaim flags for mem_cgroup_hierarchical_reclaim

400

*/

400

*/

401

#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0

401

#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0

402

#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)

402

#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)

403

#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1

403

#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1

404

#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)

404

#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)

405

406

static void mem_cgroup_get(struct mem_cgroup *memcg);

406

static void mem_cgroup_get(struct mem_cgroup *memcg);

407

static void mem_cgroup_put(struct mem_cgroup *memcg);

407

static void mem_cgroup_put(struct mem_cgroup *memcg);

408

409

/* Writing them here to avoid exposing memcg's inner layout */

409

/* Writing them here to avoid exposing memcg's inner layout */

410

#ifdef CONFIG_MEMCG_KMEM

410

#ifdef CONFIG_MEMCG_KMEM

411

#include <net/sock.h>

411

#include <net/sock.h>

412

#include <net/ip.h>

412

#include <net/ip.h>

413

414

static bool mem_cgroup_is_root(struct mem_cgroup *memcg);

414

static bool mem_cgroup_is_root(struct mem_cgroup *memcg);

415

void sock_update_memcg(struct sock *sk)

415

void sock_update_memcg(struct sock *sk)

416

{

416

{

417

if (mem_cgroup_sockets_enabled) {

417

if (mem_cgroup_sockets_enabled) {

418

struct mem_cgroup *memcg;

418

struct mem_cgroup *memcg;

419

struct cg_proto *cg_proto;

419

struct cg_proto *cg_proto;

420

421

BUG_ON(!sk->sk_prot->proto_cgroup);

421

BUG_ON(!sk->sk_prot->proto_cgroup);

422

423

/* Socket cloning can throw us here with sk_cgrp already

423

/* Socket cloning can throw us here with sk_cgrp already

424

* filled. It won't however, necessarily happen from

424

* filled. It won't however, necessarily happen from

425

* process context. So the test for root memcg given

425

* process context. So the test for root memcg given

426

* the current task's memcg won't help us in this case.

426

* the current task's memcg won't help us in this case.

427

*

427

*

428

* Respecting the original socket's memcg is a better

428

* Respecting the original socket's memcg is a better

429

* decision in this case.

429

* decision in this case.

430

*/

430

*/

431

if (sk->sk_cgrp) {

431

if (sk->sk_cgrp) {

432

BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));

432

BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));

433

mem_cgroup_get(sk->sk_cgrp->memcg);

433

mem_cgroup_get(sk->sk_cgrp->memcg);

434

return;

434

return;

435

}

435

}

436

437

rcu_read_lock();

437

rcu_read_lock();

438

memcg = mem_cgroup_from_task(current);

438

memcg = mem_cgroup_from_task(current);

439

cg_proto = sk->sk_prot->proto_cgroup(memcg);

439

cg_proto = sk->sk_prot->proto_cgroup(memcg);

440

if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {

440

if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {

441

mem_cgroup_get(memcg);

441

mem_cgroup_get(memcg);

442

sk->sk_cgrp = cg_proto;

442

sk->sk_cgrp = cg_proto;

443

}

443

}

444

rcu_read_unlock();

444

rcu_read_unlock();

445

}

445

}

446

}

446

}

447

EXPORT_SYMBOL(sock_update_memcg);

447

EXPORT_SYMBOL(sock_update_memcg);

448

449

void sock_release_memcg(struct sock *sk)

449

void sock_release_memcg(struct sock *sk)

450

{

450

{

451

if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {

451

if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {

452

struct mem_cgroup *memcg;

452

struct mem_cgroup *memcg;

453

WARN_ON(!sk->sk_cgrp->memcg);

453

WARN_ON(!sk->sk_cgrp->memcg);

454

memcg = sk->sk_cgrp->memcg;

454

memcg = sk->sk_cgrp->memcg;

455

mem_cgroup_put(memcg);

455

mem_cgroup_put(memcg);

456

}

456

}

457

}

457

}

458

459

#ifdef CONFIG_INET

459

#ifdef CONFIG_INET

460

struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)

460

struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)

461

{

461

{

462

if (!memcg || mem_cgroup_is_root(memcg))

462

if (!memcg || mem_cgroup_is_root(memcg))

463

return NULL;

463

return NULL;

464

465

return &memcg->tcp_mem.cg_proto;

465

return &memcg->tcp_mem.cg_proto;

466

}

466

}

467

EXPORT_SYMBOL(tcp_proto_cgroup);

467

EXPORT_SYMBOL(tcp_proto_cgroup);

468

#endif /* CONFIG_INET */

468

#endif /* CONFIG_INET */

469

#endif /* CONFIG_MEMCG_KMEM */

469

#endif /* CONFIG_MEMCG_KMEM */

470

471

#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
/*
 * Decrement the memcg_socket_limit_enabled static key, but only when
 * @memcg's TCP protocol context reports itself as activated.
 */
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
	if (memcg_proto_activated(&memcg->tcp_mem.cg_proto))
		static_key_slow_dec(&memcg_socket_limit_enabled);
}
#else
/* Built without CONFIG_INET or CONFIG_MEMCG_KMEM: nothing to do. */
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
}
#endif

483

484

static void drain_all_stock_async(struct mem_cgroup *memcg);

484

static void drain_all_stock_async(struct mem_cgroup *memcg);

485

486

static struct mem_cgroup_per_zone *

486

static struct mem_cgroup_per_zone *

487

mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)

487

mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)

488

{

488

{

489

return &memcg->info.nodeinfo[nid]->zoneinfo[zid];

489

return &memcg->info.nodeinfo[nid]->zoneinfo[zid];

490

}

490

}

491

492

struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)

492

struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)

493

{

493

{

494

return &memcg->css;

494

return &memcg->css;

495

}

495

}

496

497

/*
 * Return the per-zone info of @memcg for the node and zone that
 * @page belongs to.
 */
static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
	return mem_cgroup_zoneinfo(memcg, page_to_nid(page),
				   page_zonenum(page));
}

505

506

static struct mem_cgroup_tree_per_zone *

506

static struct mem_cgroup_tree_per_zone *

507

soft_limit_tree_node_zone(int nid, int zid)

507

soft_limit_tree_node_zone(int nid, int zid)

508

{

508

{

509

return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];

509

return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];

510

}

510

}

511

512

/* Return the global soft-limit tree for the node and zone of @page. */
static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
	return soft_limit_tree_node_zone(page_to_nid(page),
					 page_zonenum(page));
}

520

521

static void

521

static void

522

__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,

522

__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,

523

struct mem_cgroup_per_zone *mz,

523

struct mem_cgroup_per_zone *mz,

524

struct mem_cgroup_tree_per_zone *mctz,

524

struct mem_cgroup_tree_per_zone *mctz,

525

unsigned long long new_usage_in_excess)

525

unsigned long long new_usage_in_excess)

526

{

526

{

527

struct rb_node **p = &mctz->rb_root.rb_node;

527

struct rb_node **p = &mctz->rb_root.rb_node;

528

struct rb_node *parent = NULL;

528

struct rb_node *parent = NULL;

529

struct mem_cgroup_per_zone *mz_node;

529

struct mem_cgroup_per_zone *mz_node;

530

531

if (mz->on_tree)

531

if (mz->on_tree)

532

return;

532

return;

533

534

mz->usage_in_excess = new_usage_in_excess;

534

mz->usage_in_excess = new_usage_in_excess;

535

if (!mz->usage_in_excess)

535

if (!mz->usage_in_excess)

536

return;

536

return;

537

while (*p) {

537

while (*p) {

538

parent = *p;

538

parent = *p;

539

mz_node = rb_entry(parent, struct mem_cgroup_per_zone,

539

mz_node = rb_entry(parent, struct mem_cgroup_per_zone,

540

tree_node);

540

tree_node);

541

if (mz->usage_in_excess < mz_node->usage_in_excess)

541

if (mz->usage_in_excess < mz_node->usage_in_excess)

542

p = &(*p)->rb_left;

542

p = &(*p)->rb_left;

543

/*

543

/*

544

* We can't avoid mem cgroups that are over their soft

544

* We can't avoid mem cgroups that are over their soft

545

* limit by the same amount

545

* limit by the same amount

546

*/

546

*/

547

else if (mz->usage_in_excess >= mz_node->usage_in_excess)

547

else if (mz->usage_in_excess >= mz_node->usage_in_excess)

548

p = &(*p)->rb_right;

548

p = &(*p)->rb_right;

549

}

549

}

550

rb_link_node(&mz->tree_node, parent, p);

550

rb_link_node(&mz->tree_node, parent, p);

551

rb_insert_color(&mz->tree_node, &mctz->rb_root);

551

rb_insert_color(&mz->tree_node, &mctz->rb_root);

552

mz->on_tree = true;

552

mz->on_tree = true;

553

}

553

}

554

555

static void

555

static void

556

__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,

556

__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,

557

struct mem_cgroup_per_zone *mz,

557

struct mem_cgroup_per_zone *mz,

558

struct mem_cgroup_tree_per_zone *mctz)

558

struct mem_cgroup_tree_per_zone *mctz)

559

{

559

{

560

if (!mz->on_tree)

560

if (!mz->on_tree)

561

return;

561

return;

562

rb_erase(&mz->tree_node, &mctz->rb_root);

562

rb_erase(&mz->tree_node, &mctz->rb_root);

563

mz->on_tree = false;

563

mz->on_tree = false;

564

}

564

}

565

566

static void

566

static void

567

mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,

567

mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,

568

struct mem_cgroup_per_zone *mz,

568

struct mem_cgroup_per_zone *mz,

569

struct mem_cgroup_tree_per_zone *mctz)

569

struct mem_cgroup_tree_per_zone *mctz)

570

{

570

{

571

spin_lock(&mctz->lock);

571

spin_lock(&mctz->lock);

572

__mem_cgroup_remove_exceeded(memcg, mz, mctz);

572

__mem_cgroup_remove_exceeded(memcg, mz, mctz);

573

spin_unlock(&mctz->lock);

573

spin_unlock(&mctz->lock);

574

}

574

}

575

576

577

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)

577

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)

578

{

578

{

579

unsigned long long excess;

579

unsigned long long excess;

580

struct mem_cgroup_per_zone *mz;

580

struct mem_cgroup_per_zone *mz;

581

struct mem_cgroup_tree_per_zone *mctz;

581

struct mem_cgroup_tree_per_zone *mctz;

582

int nid = page_to_nid(page);

582

int nid = page_to_nid(page);

583

int zid = page_zonenum(page);

583

int zid = page_zonenum(page);

584

mctz = soft_limit_tree_from_page(page);

584

mctz = soft_limit_tree_from_page(page);

585

586

/*

586

/*

587

* Necessary to update all ancestors when hierarchy is used.

587

* Necessary to update all ancestors when hierarchy is used.

588

* because their event counter is not touched.

588

* because their event counter is not touched.

589

*/

589

*/

590

for (; memcg; memcg = parent_mem_cgroup(memcg)) {

590

for (; memcg; memcg = parent_mem_cgroup(memcg)) {

591

mz = mem_cgroup_zoneinfo(memcg, nid, zid);

591

mz = mem_cgroup_zoneinfo(memcg, nid, zid);

592

excess = res_counter_soft_limit_excess(&memcg->res);

592

excess = res_counter_soft_limit_excess(&memcg->res);

593

/*

593

/*

594

* We have to update the tree if mz is on RB-tree or

594

* We have to update the tree if mz is on RB-tree or

595

* mem is over its softlimit.

595

* mem is over its softlimit.

596

*/

596

*/

597

if (excess || mz->on_tree) {

597

if (excess || mz->on_tree) {

598

spin_lock(&mctz->lock);

598

spin_lock(&mctz->lock);

599

/* if on-tree, remove it */

599

/* if on-tree, remove it */

600

if (mz->on_tree)

600

if (mz->on_tree)

601

__mem_cgroup_remove_exceeded(memcg, mz, mctz);

601

__mem_cgroup_remove_exceeded(memcg, mz, mctz);

602

/*

602

/*

603

* Insert again. mz->usage_in_excess will be updated.

603

* Insert again. mz->usage_in_excess will be updated.

604

* If excess is 0, no tree ops.

604

* If excess is 0, no tree ops.

605

*/

605

*/

606

__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);

606

__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);

607

spin_unlock(&mctz->lock);

607

spin_unlock(&mctz->lock);

608

}

608

}

609

}

609

}

610

}

610

}

611

612

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)

612

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)

613

{

613

{

614

int node, zone;

614

int node, zone;

615

struct mem_cgroup_per_zone *mz;

615

struct mem_cgroup_per_zone *mz;

616

struct mem_cgroup_tree_per_zone *mctz;

616

struct mem_cgroup_tree_per_zone *mctz;

617

618

for_each_node(node) {

618

for_each_node(node) {

619

for (zone = 0; zone < MAX_NR_ZONES; zone++) {

619

for (zone = 0; zone < MAX_NR_ZONES; zone++) {

620

mz = mem_cgroup_zoneinfo(memcg, node, zone);

620

mz = mem_cgroup_zoneinfo(memcg, node, zone);

621

mctz = soft_limit_tree_node_zone(node, zone);

621

mctz = soft_limit_tree_node_zone(node, zone);

622

mem_cgroup_remove_exceeded(memcg, mz, mctz);

622

mem_cgroup_remove_exceeded(memcg, mz, mctz);

623

}

623

}

624

}

624

}

625

}

625

}

626

627

static struct mem_cgroup_per_zone *

627

static struct mem_cgroup_per_zone *

628

__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)

628

__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)

629

{

629

{

630

struct rb_node *rightmost = NULL;

630

struct rb_node *rightmost = NULL;

631

struct mem_cgroup_per_zone *mz;

631

struct mem_cgroup_per_zone *mz;

632

633

retry:

633

retry:

634

mz = NULL;

634

mz = NULL;

635

rightmost = rb_last(&mctz->rb_root);

635

rightmost = rb_last(&mctz->rb_root);

636

if (!rightmost)

636

if (!rightmost)

637

goto done; /* Nothing to reclaim from */

637

goto done; /* Nothing to reclaim from */

638

639

mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);

639

mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);

640

/*

640

/*

641

* Remove the node now but someone else can add it back,

641

* Remove the node now but someone else can add it back,

642

* we will to add it back at the end of reclaim to its correct

642

* we will to add it back at the end of reclaim to its correct

643

* position in the tree.

643

* position in the tree.

644

*/

644

*/

645

__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);

645

__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);

646

if (!res_counter_soft_limit_excess(&mz->memcg->res) ||

646

if (!res_counter_soft_limit_excess(&mz->memcg->res) ||

647

!css_tryget(&mz->memcg->css))

647

!css_tryget(&mz->memcg->css))

648

goto retry;

648

goto retry;

649

done:

649

done:

650

return mz;

650

return mz;

651

}

651

}

652

653

static struct mem_cgroup_per_zone *

653

static struct mem_cgroup_per_zone *

654

mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)

654

mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)

655

{

655

{

656

struct mem_cgroup_per_zone *mz;

656

struct mem_cgroup_per_zone *mz;

657

658

spin_lock(&mctz->lock);

658

spin_lock(&mctz->lock);

659

mz = __mem_cgroup_largest_soft_limit_node(mctz);

659

mz = __mem_cgroup_largest_soft_limit_node(mctz);

660

spin_unlock(&mctz->lock);

660

spin_unlock(&mctz->lock);

661

return mz;

661

return mz;

662

}

662

}

663

664

/*

664

/*

665

* Implementation Note: reading percpu statistics for memcg.

665

* Implementation Note: reading percpu statistics for memcg.

666

*

666

*

667

* Both of vmstat[] and percpu_counter has threshold and do periodic

667

* Both of vmstat[] and percpu_counter has threshold and do periodic

668

* synchronization to implement "quick" read. There are trade-off between

668

* synchronization to implement "quick" read. There are trade-off between

669

* reading cost and precision of value. Then, we may have a chance to implement

669

* reading cost and precision of value. Then, we may have a chance to implement

670

* a periodic synchronizion of counter in memcg's counter.

670

* a periodic synchronizion of counter in memcg's counter.

671

*

671

*

672

* But this _read() function is used for user interface now. The user accounts

672

* But this _read() function is used for user interface now. The user accounts

673

* memory usage by memory cgroup and he _always_ requires exact value because

673

* memory usage by memory cgroup and he _always_ requires exact value because

674

* he accounts memory. Even if we provide quick-and-fuzzy read, we always

674

* he accounts memory. Even if we provide quick-and-fuzzy read, we always

675

* have to visit all online cpus and make sum. So, for now, unnecessary

675

* have to visit all online cpus and make sum. So, for now, unnecessary

676

* synchronization is not implemented. (just implemented for cpu hotplug)

676

* synchronization is not implemented. (just implemented for cpu hotplug)

677

*

677

*

678

* If there are kernel internal actions which can make use of some not-exact

678

* If there are kernel internal actions which can make use of some not-exact

679

* value, and reading all cpu value can be performance bottleneck in some

679

* value, and reading all cpu value can be performance bottleneck in some

680

* common workload, threashold and synchonization as vmstat[] should be

680

* common workload, threashold and synchonization as vmstat[] should be

681

* implemented.

681

* implemented.

682

*/

682

*/

683

static long mem_cgroup_read_stat(struct mem_cgroup *memcg,

683

static long mem_cgroup_read_stat(struct mem_cgroup *memcg,

684

enum mem_cgroup_stat_index idx)

684

enum mem_cgroup_stat_index idx)

685

{

685

{

686

long val = 0;

686

long val = 0;

687

int cpu;

687

int cpu;

688

689

get_online_cpus();

689

get_online_cpus();

690

for_each_online_cpu(cpu)

690

for_each_online_cpu(cpu)

691

val += per_cpu(memcg->stat->count[idx], cpu);

691

val += per_cpu(memcg->stat->count[idx], cpu);

692

#ifdef CONFIG_HOTPLUG_CPU

692

#ifdef CONFIG_HOTPLUG_CPU

693

spin_lock(&memcg->pcp_counter_lock);

693

spin_lock(&memcg->pcp_counter_lock);

694

val += memcg->nocpu_base.count[idx];

694

val += memcg->nocpu_base.count[idx];

695

spin_unlock(&memcg->pcp_counter_lock);

695

spin_unlock(&memcg->pcp_counter_lock);

696

#endif

696

#endif

697

put_online_cpus();

697

put_online_cpus();

698

return val;

698

return val;

699

}

699

}

700

701

static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,

701

static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,

702

bool charge)

702

bool charge)

703

{

703

{

704

int val = (charge) ? 1 : -1;

704

int val = (charge) ? 1 : -1;

705

this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);

705

this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);

706

}

706

}

707

708

static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,

708

static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,

709

enum mem_cgroup_events_index idx)

709

enum mem_cgroup_events_index idx)

710

{

710

{

711

unsigned long val = 0;

711

unsigned long val = 0;

712

int cpu;

712

int cpu;

713

714

for_each_online_cpu(cpu)

714

for_each_online_cpu(cpu)

715

val += per_cpu(memcg->stat->events[idx], cpu);

715

val += per_cpu(memcg->stat->events[idx], cpu);

716

#ifdef CONFIG_HOTPLUG_CPU

716

#ifdef CONFIG_HOTPLUG_CPU

717

spin_lock(&memcg->pcp_counter_lock);

717

spin_lock(&memcg->pcp_counter_lock);

718

val += memcg->nocpu_base.events[idx];

718

val += memcg->nocpu_base.events[idx];

719

spin_unlock(&memcg->pcp_counter_lock);

719

spin_unlock(&memcg->pcp_counter_lock);

720

#endif

720

#endif

721

return val;

721

return val;

722

}

722

}

723

724

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,

724

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,

725

bool anon, int nr_pages)

725

bool anon, int nr_pages)

726

{

726

{

727

preempt_disable();

727

preempt_disable();

728

729

/*

729

/*

730

* Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is

730

* Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is

731

* counted as CACHE even if it's on ANON LRU.

731

* counted as CACHE even if it's on ANON LRU.

732

*/

732

*/

733

if (anon)

733

if (anon)

734

__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],

734

__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],

735

nr_pages);

735

nr_pages);

736

else

736

else

737

__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],

737

__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],

738

nr_pages);

738

nr_pages);

739

740

/* pagein of a big page is an event. So, ignore page size */

740

/* pagein of a big page is an event. So, ignore page size */

741

if (nr_pages > 0)

741

if (nr_pages > 0)

742

__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);

742

__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);

743

else {

743

else {

744

__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);

744

__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);

745

nr_pages = -nr_pages; /* for event */

745

nr_pages = -nr_pages; /* for event */

746

}

746

}

747

748

__this_cpu_add(memcg->stat->nr_page_events, nr_pages);

748

__this_cpu_add(memcg->stat->nr_page_events, nr_pages);

749

750

preempt_enable();

750

preempt_enable();

751

}

751

}

752

753

unsigned long

753

unsigned long

754

mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)

754

mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)

755

{

755

{

756

struct mem_cgroup_per_zone *mz;

756

struct mem_cgroup_per_zone *mz;

757

758

mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);

758

mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);

759

return mz->lru_size[lru];

759

return mz->lru_size[lru];

760

}

760

}

761

762

static unsigned long

762

static unsigned long

763

mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,

763

mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,

764

unsigned int lru_mask)

764

unsigned int lru_mask)

765

{

765

{

766

struct mem_cgroup_per_zone *mz;

766

struct mem_cgroup_per_zone *mz;

767

enum lru_list lru;

767

enum lru_list lru;

768

unsigned long ret = 0;

768

unsigned long ret = 0;

769

770

mz = mem_cgroup_zoneinfo(memcg, nid, zid);

770

mz = mem_cgroup_zoneinfo(memcg, nid, zid);

771

772

for_each_lru(lru) {

772

for_each_lru(lru) {

773

if (BIT(lru) & lru_mask)

773

if (BIT(lru) & lru_mask)

774

ret += mz->lru_size[lru];

774

ret += mz->lru_size[lru];

775

}

775

}

776

return ret;

776

return ret;

777

}

777

}

778

779

static unsigned long

779

static unsigned long

780

mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,

780

mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,

781

int nid, unsigned int lru_mask)

781

int nid, unsigned int lru_mask)

782

{

782

{

783

u64 total = 0;

783

u64 total = 0;

784

int zid;

784

int zid;

785

786

for (zid = 0; zid < MAX_NR_ZONES; zid++)

786

for (zid = 0; zid < MAX_NR_ZONES; zid++)

787

total += mem_cgroup_zone_nr_lru_pages(memcg,

787

total += mem_cgroup_zone_nr_lru_pages(memcg,

788

nid, zid, lru_mask);

788

nid, zid, lru_mask);

789

790

return total;

790

return total;

791

}

791

}

792

793

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,

793

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,

794

unsigned int lru_mask)

794

unsigned int lru_mask)

795

{

795

{

796

int nid;

796

int nid;

797

u64 total = 0;

797

u64 total = 0;

798

799

for_each_node_state(nid, N_HIGH_MEMORY)

799

for_each_node_state(nid, N_HIGH_MEMORY)

800

total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);

800

total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);

801

return total;

801

return total;

802

}

802

}

803

804

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,

804

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,

805

enum mem_cgroup_events_target target)

805

enum mem_cgroup_events_target target)

806

{

806

{

807

unsigned long val, next;

807

unsigned long val, next;

808

809

val = __this_cpu_read(memcg->stat->nr_page_events);

809

val = __this_cpu_read(memcg->stat->nr_page_events);

810

next = __this_cpu_read(memcg->stat->targets[target]);

810

next = __this_cpu_read(memcg->stat->targets[target]);

811

/* from time_after() in jiffies.h */

811

/* from time_after() in jiffies.h */

812

if ((long)next - (long)val < 0) {

812

if ((long)next - (long)val < 0) {

813

switch (target) {

813

switch (target) {

814

case MEM_CGROUP_TARGET_THRESH:

814

case MEM_CGROUP_TARGET_THRESH:

815

next = val + THRESHOLDS_EVENTS_TARGET;

815

next = val + THRESHOLDS_EVENTS_TARGET;

816

break;

816

break;

817

case MEM_CGROUP_TARGET_SOFTLIMIT:

817

case MEM_CGROUP_TARGET_SOFTLIMIT:

818

next = val + SOFTLIMIT_EVENTS_TARGET;

818

next = val + SOFTLIMIT_EVENTS_TARGET;

819

break;

819

break;

820

case MEM_CGROUP_TARGET_NUMAINFO:

820

case MEM_CGROUP_TARGET_NUMAINFO:

821

next = val + NUMAINFO_EVENTS_TARGET;

821

next = val + NUMAINFO_EVENTS_TARGET;

822

break;

822

break;

823

default:

823

default:

824

break;

824

break;

825

}

825

}

826

__this_cpu_write(memcg->stat->targets[target], next);

826

__this_cpu_write(memcg->stat->targets[target], next);

827

return true;

827

return true;

828

}

828

}

829

return false;

829

return false;

830

}

830

}

831

832

/*

832

/*

833

* Check events in order.

833

* Check events in order.

834

*

834

*

835

*/

835

*/

836

static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)

836

static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)

837

{

837

{

838

preempt_disable();

838

preempt_disable();

839

/* threshold event is triggered in finer grain than soft limit */

839

/* threshold event is triggered in finer grain than soft limit */

840

if (unlikely(mem_cgroup_event_ratelimit(memcg,

840

if (unlikely(mem_cgroup_event_ratelimit(memcg,

841

MEM_CGROUP_TARGET_THRESH))) {

841

MEM_CGROUP_TARGET_THRESH))) {

842

bool do_softlimit;

842

bool do_softlimit;

843

bool do_numainfo __maybe_unused;

843

bool do_numainfo __maybe_unused;

844

845

do_softlimit = mem_cgroup_event_ratelimit(memcg,

845

do_softlimit = mem_cgroup_event_ratelimit(memcg,

846

MEM_CGROUP_TARGET_SOFTLIMIT);

846

MEM_CGROUP_TARGET_SOFTLIMIT);

847

#if MAX_NUMNODES > 1

847

#if MAX_NUMNODES > 1

848

do_numainfo = mem_cgroup_event_ratelimit(memcg,

848

do_numainfo = mem_cgroup_event_ratelimit(memcg,

849

MEM_CGROUP_TARGET_NUMAINFO);

849

MEM_CGROUP_TARGET_NUMAINFO);

850

#endif

850

#endif

851

preempt_enable();

851

preempt_enable();

852

853

mem_cgroup_threshold(memcg);

853

mem_cgroup_threshold(memcg);

854

if (unlikely(do_softlimit))

854

if (unlikely(do_softlimit))

855

mem_cgroup_update_tree(memcg, page);

855

mem_cgroup_update_tree(memcg, page);

856

#if MAX_NUMNODES > 1

856

#if MAX_NUMNODES > 1

857

if (unlikely(do_numainfo))

857

if (unlikely(do_numainfo))

858

atomic_inc(&memcg->numainfo_events);

858

atomic_inc(&memcg->numainfo_events);

859

#endif

859

#endif

860

} else

860

} else

861

preempt_enable();

861

preempt_enable();

862

}

862

}

863

864

struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)

864

struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)

865

{

865

{

866

return container_of(cgroup_subsys_state(cont,

866

return container_of(cgroup_subsys_state(cont,

867

mem_cgroup_subsys_id), struct mem_cgroup,

867

mem_cgroup_subsys_id), struct mem_cgroup,

868

css);

868

css);

869

}

869

}

870

871

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)

871

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)

872

{

872

{

873

/*

873

/*

874

* mm_update_next_owner() may clear mm->owner to NULL

874

* mm_update_next_owner() may clear mm->owner to NULL

875

* if it races with swapoff, page migration, etc.

875

* if it races with swapoff, page migration, etc.

876

* So this can be called with p == NULL.

876

* So this can be called with p == NULL.

877

*/

877

*/

878

if (unlikely(!p))

878

if (unlikely(!p))

879

return NULL;

879

return NULL;

880

881

return container_of(task_subsys_state(p, mem_cgroup_subsys_id),

881

return container_of(task_subsys_state(p, mem_cgroup_subsys_id),

882

struct mem_cgroup, css);

882

struct mem_cgroup, css);

883

}

883

}

884

885

struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)

885

struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)

886

{

886

{

887

struct mem_cgroup *memcg = NULL;

887

struct mem_cgroup *memcg = NULL;

888

889

if (!mm)

889

if (!mm)

890

return NULL;

890

return NULL;

891

/*

891

/*

892

* Because we have no locks, mm->owner's may be being moved to other

892

* Because we have no locks, mm->owner's may be being moved to other

893

* cgroup. We use css_tryget() here even if this looks

893

* cgroup. We use css_tryget() here even if this looks

894

* pessimistic (rather than adding locks here).

894

* pessimistic (rather than adding locks here).

895

*/

895

*/

896

rcu_read_lock();

896

rcu_read_lock();

897

do {

897

do {

898

memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));

898

memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));

899

if (unlikely(!memcg))

899

if (unlikely(!memcg))

900

break;

900

break;

901

} while (!css_tryget(&memcg->css));

901

} while (!css_tryget(&memcg->css));

902

rcu_read_unlock();

902

rcu_read_unlock();

903

return memcg;

903

return memcg;

904

}

904

}

905

906

/**

906

/**

907

* mem_cgroup_iter - iterate over memory cgroup hierarchy

907

* mem_cgroup_iter - iterate over memory cgroup hierarchy

908

* @root: hierarchy root

908

* @root: hierarchy root

909

* @prev: previously returned memcg, NULL on first invocation

909

* @prev: previously returned memcg, NULL on first invocation

910

* @reclaim: cookie for shared reclaim walks, NULL for full walks

910

* @reclaim: cookie for shared reclaim walks, NULL for full walks

911

*

911

*

912

* Returns references to children of the hierarchy below @root, or

912

* Returns references to children of the hierarchy below @root, or

913

* @root itself, or %NULL after a full round-trip.

913

* @root itself, or %NULL after a full round-trip.

914

*

914

*

915

* Caller must pass the return value in @prev on subsequent

915

* Caller must pass the return value in @prev on subsequent

916

* invocations for reference counting, or use mem_cgroup_iter_break()

916

* invocations for reference counting, or use mem_cgroup_iter_break()

917

* to cancel a hierarchy walk before the round-trip is complete.

917

* to cancel a hierarchy walk before the round-trip is complete.

918

*

918

*

919

* Reclaimers can specify a zone and a priority level in @reclaim to

919

* Reclaimers can specify a zone and a priority level in @reclaim to

920

* divide up the memcgs in the hierarchy among all concurrent

920

* divide up the memcgs in the hierarchy among all concurrent

921

* reclaimers operating on the same zone and priority.

921

* reclaimers operating on the same zone and priority.

922

*/

922

*/

923

struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,

923

struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,

924

struct mem_cgroup *prev,

924

struct mem_cgroup *prev,

925

struct mem_cgroup_reclaim_cookie *reclaim)

925

struct mem_cgroup_reclaim_cookie *reclaim)

926

{

926

{

927

struct mem_cgroup *memcg = NULL;

927

struct mem_cgroup *memcg = NULL;

928

int id = 0;

928

int id = 0;

929

930

if (mem_cgroup_disabled())

930

if (mem_cgroup_disabled())

931

return NULL;

931

return NULL;

932

933

if (!root)

933

if (!root)

934

root = root_mem_cgroup;

934

root = root_mem_cgroup;

935

936

if (prev && !reclaim)

936

if (prev && !reclaim)

937

id = css_id(&prev->css);

937

id = css_id(&prev->css);

938

939

if (prev && prev != root)

939

if (prev && prev != root)

940

css_put(&prev->css);

940

css_put(&prev->css);

941

942

if (!root->use_hierarchy && root != root_mem_cgroup) {

942

if (!root->use_hierarchy && root != root_mem_cgroup) {

943

if (prev)

943

if (prev)

944

return NULL;

944

return NULL;

945

return root;

945

return root;

946

}

946

}

947

948

while (!memcg) {

948

while (!memcg) {

949

struct mem_cgroup_reclaim_iter *uninitialized_var(iter);

949

struct mem_cgroup_reclaim_iter *uninitialized_var(iter);

950

struct cgroup_subsys_state *css;

950

struct cgroup_subsys_state *css;

951

952

if (reclaim) {

952

if (reclaim) {

953

int nid = zone_to_nid(reclaim->zone);

953

int nid = zone_to_nid(reclaim->zone);

954

int zid = zone_idx(reclaim->zone);

954

int zid = zone_idx(reclaim->zone);

955

struct mem_cgroup_per_zone *mz;

955

struct mem_cgroup_per_zone *mz;

956

957

mz = mem_cgroup_zoneinfo(root, nid, zid);

957

mz = mem_cgroup_zoneinfo(root, nid, zid);

958

iter = &mz->reclaim_iter[reclaim->priority];

958

iter = &mz->reclaim_iter[reclaim->priority];

959

if (prev && reclaim->generation != iter->generation)

959

if (prev && reclaim->generation != iter->generation)

960

return NULL;

960

return NULL;

961

id = iter->position;

961

id = iter->position;

962

}

962

}

963

964

rcu_read_lock();

964

rcu_read_lock();

965

css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);

965

css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);

966

if (css) {

966

if (css) {

967

if (css == &root->css || css_tryget(css))

967

if (css == &root->css || css_tryget(css))

968

memcg = container_of(css,

968

memcg = container_of(css,

969

struct mem_cgroup, css);

969

struct mem_cgroup, css);

970

} else

970

} else

971

id = 0;

971

id = 0;

972

rcu_read_unlock();

972

rcu_read_unlock();

973

974

if (reclaim) {

974

if (reclaim) {

975

iter->position = id;

975

iter->position = id;

976

if (!css)

976

if (!css)

977

iter->generation++;

977

iter->generation++;

978

else if (!prev && memcg)

978

else if (!prev && memcg)

979

reclaim->generation = iter->generation;

979

reclaim->generation = iter->generation;

980

}

980

}

981

982

if (prev && !css)

982

if (prev && !css)

983

return NULL;

983

return NULL;

984

}

984

}

985

return memcg;

985

return memcg;

986

}

986

}

987

988

/**

988

/**

989

* mem_cgroup_iter_break - abort a hierarchy walk prematurely

989

* mem_cgroup_iter_break - abort a hierarchy walk prematurely

990

* @root: hierarchy root

990

* @root: hierarchy root

991

* @prev: last visited hierarchy member as returned by mem_cgroup_iter()

991

* @prev: last visited hierarchy member as returned by mem_cgroup_iter()

992

*/

992

*/

993

void mem_cgroup_iter_break(struct mem_cgroup *root,

993

void mem_cgroup_iter_break(struct mem_cgroup *root,

994

struct mem_cgroup *prev)

994

struct mem_cgroup *prev)

995

{

995

{

996

if (!root)

996

if (!root)

997

root = root_mem_cgroup;

997

root = root_mem_cgroup;

998

if (prev && prev != root)

998

if (prev && prev != root)

999

css_put(&prev->css);

999

css_put(&prev->css);

1000

}

1000

}

1001

1002

/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 *
 * for_each_mem_cgroup() is the same walk with a NULL root, which
 * mem_cgroup_iter() treats as root_mem_cgroup.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for_each_mem_cgroup_tree(iter, NULL)

1016

1017

static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)

1017

static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)

1018

{

1018

{

1019

return (memcg == root_mem_cgroup);

1019

return (memcg == root_mem_cgroup);

1020

}

1020

}

1021

1022

void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)

1022

void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)

1023

{

1023

{

1024

struct mem_cgroup *memcg;

1024

struct mem_cgroup *memcg;

1025

1026

if (!mm)

1026

if (!mm)

1027

return;

1027

return;

1028

1029

rcu_read_lock();

1029

rcu_read_lock();

1030

memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));

1030

memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));

1031

if (unlikely(!memcg))

1031

if (unlikely(!memcg))

1032

goto out;

1032

goto out;

1033

1034

switch (idx) {

1034

switch (idx) {

1035

case PGFAULT:

1035

case PGFAULT:

1036

this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);

1036

this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);

1037

break;

1037

break;

1038

case PGMAJFAULT:

1038

case PGMAJFAULT:

1039

this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);

1039

this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);

1040

break;

1040

break;

1041

default:

1041

default:

1042

BUG();

1042

BUG();

1043

}

1043

}

1044

out:

1044

out:

1045

rcu_read_unlock();

1045

rcu_read_unlock();

1046

}

1046

}

1047

EXPORT_SYMBOL(mem_cgroup_count_vm_event);

1047

EXPORT_SYMBOL(mem_cgroup_count_vm_event);

1048

1049

/**

1049

/**

1050

* mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg

1050

* mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg

1051

* @zone: zone of the wanted lruvec

1051

* @zone: zone of the wanted lruvec

1052

* @memcg: memcg of the wanted lruvec

1052

* @memcg: memcg of the wanted lruvec

1053

*

1053

*

1054

* Returns the lru list vector holding pages for the given @zone and

1054

* Returns the lru list vector holding pages for the given @zone and

1055

* @mem. This can be the global zone lruvec, if the memory controller

1055

* @mem. This can be the global zone lruvec, if the memory controller

1056

* is disabled.

1056

* is disabled.

1057

*/

1057

*/

1058

struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,

1058

struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,

1059

struct mem_cgroup *memcg)

1059

struct mem_cgroup *memcg)

1060

{

1060

{

1061

struct mem_cgroup_per_zone *mz;

1061

struct mem_cgroup_per_zone *mz;

1062

1063

if (mem_cgroup_disabled())

1063

if (mem_cgroup_disabled())

1064

return &zone->lruvec;

1064

return &zone->lruvec;

1065

1066

mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));

1066

mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));

1067

return &mz->lruvec;

1067

return &mz->lruvec;

1068

}

1068

}

1069

1070

/*

1070

/*

1071

* Following LRU functions are allowed to be used without PCG_LOCK.

1071

* Following LRU functions are allowed to be used without PCG_LOCK.

1072

* Operations are called by routine of global LRU independently from memcg.

1072

* Operations are called by routine of global LRU independently from memcg.

1073

* What we have to take care of here is validity of pc->mem_cgroup.

1073

* What we have to take care of here is validity of pc->mem_cgroup.

1074

*

1074

*

1075

* Changes to pc->mem_cgroup happens when

1075

* Changes to pc->mem_cgroup happens when

1076

* 1. charge

1076

* 1. charge

1077

* 2. moving account

1077

* 2. moving account

1078

* In typical case, "charge" is done before add-to-lru. Exception is SwapCache.

1078

* In typical case, "charge" is done before add-to-lru. Exception is SwapCache.

1079

* It is added to LRU before charge.

1079

* It is added to LRU before charge.

1080

* If PCG_USED bit is not set, page_cgroup is not added to this private LRU.

1080

* If PCG_USED bit is not set, page_cgroup is not added to this private LRU.

1081

* When moving account, the page is not on LRU. It's isolated.

1081

* When moving account, the page is not on LRU. It's isolated.

1082

*/

1082

*/

1083

1084

/**

1084

/**

1085

* mem_cgroup_page_lruvec - return lruvec for adding an lru page

1085

* mem_cgroup_page_lruvec - return lruvec for adding an lru page

1086

* @page: the page

1086

* @page: the page

1087

* @zone: zone of the page

1087

* @zone: zone of the page

1088

*/

1088

*/

1089

struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)

1089

struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)

1090

{

1090

{

1091

struct mem_cgroup_per_zone *mz;

1091

struct mem_cgroup_per_zone *mz;

1092

struct mem_cgroup *memcg;

1092

struct mem_cgroup *memcg;

1093

struct page_cgroup *pc;

1093

struct page_cgroup *pc;

1094

1095

if (mem_cgroup_disabled())

1095

if (mem_cgroup_disabled())

1096

return &zone->lruvec;

1096

return &zone->lruvec;

1097

1098

pc = lookup_page_cgroup(page);

1098

pc = lookup_page_cgroup(page);

1099

memcg = pc->mem_cgroup;

1099

memcg = pc->mem_cgroup;

1100

1101

/*

1101

/*

1102

* Surreptitiously switch any uncharged offlist page to root:

1102

* Surreptitiously switch any uncharged offlist page to root:

1103

* an uncharged page off lru does nothing to secure

1103

* an uncharged page off lru does nothing to secure

1104

* its former mem_cgroup from sudden removal.

1104

* its former mem_cgroup from sudden removal.

1105

*

1105

*

1106

* Our caller holds lru_lock, and PageCgroupUsed is updated

1106

* Our caller holds lru_lock, and PageCgroupUsed is updated

1107

* under page_cgroup lock: between them, they make all uses

1107

* under page_cgroup lock: between them, they make all uses

1108

* of pc->mem_cgroup safe.

1108

* of pc->mem_cgroup safe.

1109

*/

1109

*/

1110

if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)

1110

if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)

1111

pc->mem_cgroup = memcg = root_mem_cgroup;

1111

pc->mem_cgroup = memcg = root_mem_cgroup;

1112

1113

mz = page_cgroup_zoneinfo(memcg, page);

1113

mz = page_cgroup_zoneinfo(memcg, page);

1114

return &mz->lruvec;

1114

return &mz->lruvec;

1115

}

1115

}

1116

1117

/**

1117

/**

1118

* mem_cgroup_update_lru_size - account for adding or removing an lru page

1118

* mem_cgroup_update_lru_size - account for adding or removing an lru page

1119

* @lruvec: mem_cgroup per zone lru vector

1119

* @lruvec: mem_cgroup per zone lru vector

1120

* @lru: index of lru list the page is sitting on

1120

* @lru: index of lru list the page is sitting on

1121

* @nr_pages: positive when adding or negative when removing

1121

* @nr_pages: positive when adding or negative when removing

1122

*

1122

*

1123

* This function must be called when a page is added to or removed from an

1123

* This function must be called when a page is added to or removed from an

1124

* lru list.

1124

* lru list.

1125

*/

1125

*/

1126

void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,

1126

void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,

1127

int nr_pages)

1127

int nr_pages)

1128

{

1128

{

1129

struct mem_cgroup_per_zone *mz;

1129

struct mem_cgroup_per_zone *mz;

1130

unsigned long *lru_size;

1130

unsigned long *lru_size;

1131

1132

if (mem_cgroup_disabled())

1132

if (mem_cgroup_disabled())

1133

return;

1133

return;

1134

1135

mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);

1135

mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);

1136

lru_size = mz->lru_size + lru;

1136

lru_size = mz->lru_size + lru;

1137

*lru_size += nr_pages;

1137

*lru_size += nr_pages;

1138

VM_BUG_ON((long)(*lru_size) < 0);

1138

VM_BUG_ON((long)(*lru_size) < 0);

1139

}

1139

}

1140

1141

/*

1141

/*

1142

* Checks whether given mem is same or in the root_mem_cgroup's

1142

* Checks whether given mem is same or in the root_mem_cgroup's

1143

* hierarchy subtree

1143

* hierarchy subtree

1144

*/

1144

*/

1145

bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,

1145

bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,

1146

struct mem_cgroup *memcg)

1146

struct mem_cgroup *memcg)

1147

{

1147

{

1148

if (root_memcg == memcg)

1148

if (root_memcg == memcg)

1149

return true;

1149

return true;

1150

if (!root_memcg->use_hierarchy || !memcg)

1150

if (!root_memcg->use_hierarchy || !memcg)

1151

return false;

1151

return false;

1152

return css_is_ancestor(&memcg->css, &root_memcg->css);

1152

return css_is_ancestor(&memcg->css, &root_memcg->css);

1153

}

1153

}

1154

1155

/*
 * Same test as __mem_cgroup_same_or_subtree(), performed inside an RCU
 * read-side critical section taken here.
 */
static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
				       struct mem_cgroup *memcg)
{
	bool related;

	rcu_read_lock();
	related = __mem_cgroup_same_or_subtree(root_memcg, memcg);
	rcu_read_unlock();

	return related;
}

1165

1166

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)

1166

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)

1167

{

1167

{

1168

int ret;

1168

int ret;

1169

struct mem_cgroup *curr = NULL;

1169

struct mem_cgroup *curr = NULL;

1170

struct task_struct *p;

1170

struct task_struct *p;

1171

1172

p = find_lock_task_mm(task);

1172

p = find_lock_task_mm(task);

1173

if (p) {

1173

if (p) {

1174

curr = try_get_mem_cgroup_from_mm(p->mm);

1174

curr = try_get_mem_cgroup_from_mm(p->mm);

1175

task_unlock(p);

1175

task_unlock(p);

1176

} else {

1176

} else {

1177

/*

1177

/*

1178

* All threads may have already detached their mm's, but the oom

1178

* All threads may have already detached their mm's, but the oom

1179

* killer still needs to detect if they have already been oom

1179

* killer still needs to detect if they have already been oom

1180

* killed to prevent needlessly killing additional tasks.

1180

* killed to prevent needlessly killing additional tasks.

1181

*/

1181

*/

1182

task_lock(task);

1182

task_lock(task);

1183

curr = mem_cgroup_from_task(task);

1183

curr = mem_cgroup_from_task(task);

1184

if (curr)

1184

if (curr)

1185

css_get(&curr->css);

1185

css_get(&curr->css);

1186

task_unlock(task);

1186

task_unlock(task);

1187

}

1187

}

1188

if (!curr)

1188

if (!curr)

1189

return 0;

1189

return 0;

1190

/*

1190

/*

1191

* We should check use_hierarchy of "memcg" not "curr". Because checking

1191

* We should check use_hierarchy of "memcg" not "curr". Because checking

1192

* use_hierarchy of "curr" here make this function true if hierarchy is

1192

* use_hierarchy of "curr" here make this function true if hierarchy is

1193

* enabled in "curr" and "curr" is a child of "memcg" in *cgroup*

1193

* enabled in "curr" and "curr" is a child of "memcg" in *cgroup*

1194

* hierarchy(even if use_hierarchy is disabled in "memcg").

1194

* hierarchy(even if use_hierarchy is disabled in "memcg").

1195

*/

1195

*/

1196

ret = mem_cgroup_same_or_subtree(memcg, curr);

1196

ret = mem_cgroup_same_or_subtree(memcg, curr);

1197

css_put(&curr->css);

1197

css_put(&curr->css);

1198

return ret;

1198

return ret;

1199

}

1199

}

1200

1201

int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)

1201

int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)

1202

{

1202

{

1203

unsigned long inactive_ratio;

1203

unsigned long inactive_ratio;

1204

unsigned long inactive;

1204

unsigned long inactive;

1205

unsigned long active;

1205

unsigned long active;

1206

unsigned long gb;

1206

unsigned long gb;

1207

1208

inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);

1208

inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);

1209

active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);

1209

active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);

1210

1211

gb = (inactive + active) >> (30 - PAGE_SHIFT);

1211

gb = (inactive + active) >> (30 - PAGE_SHIFT);

1212

if (gb)

1212

if (gb)

1213

inactive_ratio = int_sqrt(10 * gb);

1213

inactive_ratio = int_sqrt(10 * gb);

1214

else

1214

else

1215

inactive_ratio = 1;

1215

inactive_ratio = 1;

1216

1217

return inactive * inactive_ratio < active;

1217

return inactive * inactive_ratio < active;

1218

}

1218

}

1219

1220

int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)

1220

int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)

1221

{

1221

{

1222

unsigned long active;

1222

unsigned long active;

1223

unsigned long inactive;

1223

unsigned long inactive;

1224

1225

inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);

1225

inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);

1226

active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);

1226

active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);

1227

1228

return (active > inactive);

1228

return (active > inactive);

1229

}

1229

}

1230

1231

/*
 * Map a pointer to a res_counter embedded in a struct mem_cgroup back to
 * the enclosing mem_cgroup; @member names the embedded field.
 */
#define mem_cgroup_from_res_counter(counter, member)	\
	container_of(counter, struct mem_cgroup, member)

1233

1234

/**

1234

/**

1235

* mem_cgroup_margin - calculate chargeable space of a memory cgroup

1235

* mem_cgroup_margin - calculate chargeable space of a memory cgroup

1236

* @memcg: the memory cgroup

1236

* @memcg: the memory cgroup

1237

*

1237

*

1238

* Returns the maximum amount of memory @mem can be charged with, in

1238

* Returns the maximum amount of memory @mem can be charged with, in

1239

* pages.

1239

* pages.

1240

*/

1240

*/

1241

static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)

1241

static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)

1242

{

1242

{

1243

unsigned long long margin;

1243

unsigned long long margin;

1244

1245

margin = res_counter_margin(&memcg->res);

1245

margin = res_counter_margin(&memcg->res);

1246

if (do_swap_account)

1246

if (do_swap_account)

1247

margin = min(margin, res_counter_margin(&memcg->memsw));

1247

margin = min(margin, res_counter_margin(&memcg->memsw));

1248

return margin >> PAGE_SHIFT;

1248

return margin >> PAGE_SHIFT;

1249

}

1249

}

1250

1251

int mem_cgroup_swappiness(struct mem_cgroup *memcg)

1251

int mem_cgroup_swappiness(struct mem_cgroup *memcg)

1252

{

1252

{

1253

struct cgroup *cgrp = memcg->css.cgroup;

1253

struct cgroup *cgrp = memcg->css.cgroup;

1254

1255

/* root ? */

1255

/* root ? */

1256

if (cgrp->parent == NULL)

1256

if (cgrp->parent == NULL)

1257

return vm_swappiness;

1257

return vm_swappiness;

1258

1259

return memcg->swappiness;

1259

return memcg->swappiness;

1260

}

1260

}

1261

1262

/*

1262

/*

1263

* memcg->moving_account is used for checking possibility that some thread is

1263

* memcg->moving_account is used for checking possibility that some thread is

1264

* calling move_account(). When a thread on CPU-A starts moving pages under

1264

* calling move_account(). When a thread on CPU-A starts moving pages under

1265

* a memcg, other threads should check memcg->moving_account under

1265

* a memcg, other threads should check memcg->moving_account under

1266

* rcu_read_lock(), like this:

1266

* rcu_read_lock(), like this:

1267

*

1267

*

1268

* CPU-A CPU-B

1268

* CPU-A CPU-B

1269

* rcu_read_lock()

1269

* rcu_read_lock()

1270

* memcg->moving_account+1 if (memcg->moving_account)

1270

* memcg->moving_account+1 if (memcg->moving_account)

1271

* take heavy locks.

1271

* take heavy locks.

1272

* synchronize_rcu() update something.

1272

* synchronize_rcu() update something.

1273

* rcu_read_unlock()

1273

* rcu_read_unlock()

1274

* start move here.

1274

* start move here.

1275

*/

1275

*/

1276

1277

/*
 * For quick checking without looking up a memcg: incremented by
 * mem_cgroup_start_move() and decremented by mem_cgroup_end_move().
 */
atomic_t memcg_moving __read_mostly;

1279

1280

/*
 * mem_cgroup_start_move - announce that an account move on @memcg starts
 *
 * Raises the global memcg_moving count and @memcg's own moving_account
 * count, then waits for an RCU grace period before returning.
 */
static void mem_cgroup_start_move(struct mem_cgroup *memcg)
{
	atomic_inc(&memcg_moving);
	atomic_inc(&memcg->moving_account);
	/*
	 * Readers test moving_account under rcu_read_lock(); wait until
	 * every reader that may have missed the increment has left its
	 * read-side section.
	 */
	synchronize_rcu();
}

1286

1287

static void mem_cgroup_end_move(struct mem_cgroup *memcg)

1287

static void mem_cgroup_end_move(struct mem_cgroup *memcg)

1288

{

1288

{

1289

/*

1289

/*

1290

* Now, mem_cgroup_clear_mc() may call this function with NULL.

1290

* Now, mem_cgroup_clear_mc() may call this function with NULL.

1291

* We check NULL in callee rather than caller.

1291

* We check NULL in callee rather than caller.

1292

*/

1292

*/

1293

if (memcg) {

1293

if (memcg) {

1294

atomic_dec(&memcg_moving);

1294

atomic_dec(&memcg_moving);

1295

atomic_dec(&memcg->moving_account);

1295

atomic_dec(&memcg->moving_account);

1296

}

1296

}

1297

}

1297

}

1298

1299

/*

1299

/*

1300

* 2 routines for checking "mem" is under move_account() or not.

1300

* 2 routines for checking "mem" is under move_account() or not.

1301

*

1301

*

1302

* mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This

1302

* mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This

1303

* is used for avoiding races in accounting. If true,

1303

* is used for avoiding races in accounting. If true,

1304

* pc->mem_cgroup may be overwritten.

1304

* pc->mem_cgroup may be overwritten.

1305

*

1305

*

1306

* mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or

1306

* mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or

1307

* under hierarchy of moving cgroups. This is for

1307

* under hierarchy of moving cgroups. This is for

1308

* waiting at high memory pressure caused by "move".

1308

* waiting at high memory pressure caused by "move".

1309

*/

1309

*/

1310

1311

static bool mem_cgroup_stolen(struct mem_cgroup *memcg)

1311

static bool mem_cgroup_stolen(struct mem_cgroup *memcg)

1312

{

1312

{

1313

VM_BUG_ON(!rcu_read_lock_held());

1313

VM_BUG_ON(!rcu_read_lock_held());

1314

return atomic_read(&memcg->moving_account) > 0;

1314

return atomic_read(&memcg->moving_account) > 0;

1315

}

1315

}

1316

1317

static bool mem_cgroup_under_move(struct mem_cgroup *memcg)

1317

static bool mem_cgroup_under_move(struct mem_cgroup *memcg)

1318

{

1318

{

1319

struct mem_cgroup *from;

1319

struct mem_cgroup *from;

1320

struct mem_cgroup *to;

1320

struct mem_cgroup *to;

1321

bool ret = false;

1321

bool ret = false;

1322

/*

1322

/*

1323

* Unlike task_move routines, we access mc.to, mc.from not under

1323

* Unlike task_move routines, we access mc.to, mc.from not under

1324

* mutual exclusion by cgroup_mutex. Here, we take spinlock instead.

1324

* mutual exclusion by cgroup_mutex. Here, we take spinlock instead.

1325

*/

1325

*/

1326

spin_lock(&mc.lock);

1326

spin_lock(&mc.lock);

1327

from = mc.from;

1327

from = mc.from;

1328

to = mc.to;

1328

to = mc.to;

1329

if (!from)

1329

if (!from)

1330

goto unlock;

1330

goto unlock;

1331

1332

ret = mem_cgroup_same_or_subtree(memcg, from)

1332

ret = mem_cgroup_same_or_subtree(memcg, from)

1333

|| mem_cgroup_same_or_subtree(memcg, to);

1333

|| mem_cgroup_same_or_subtree(memcg, to);

1334

unlock:

1334

unlock:

1335

spin_unlock(&mc.lock);

1335

spin_unlock(&mc.lock);

1336

return ret;

1336

return ret;

1337

}

1337

}

1338

1339

static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)

1339

static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)

1340

{

1340

{

1341

if (mc.moving_task && current != mc.moving_task) {

1341

if (mc.moving_task && current != mc.moving_task) {

1342

if (mem_cgroup_under_move(memcg)) {

1342

if (mem_cgroup_under_move(memcg)) {

1343

DEFINE_WAIT(wait);

1343

DEFINE_WAIT(wait);

1344

prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);

1344

prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);

1345

/* moving charge context might have finished. */

1345

/* moving charge context might have finished. */

1346

if (mc.moving_task)

1346

if (mc.moving_task)

1347

schedule();

1347

schedule();

1348

finish_wait(&mc.waitq, &wait);

1348

finish_wait(&mc.waitq, &wait);

1349

return true;

1349

return true;

1350

}

1350

}

1351

}

1351

}

1352

return false;

1352

return false;

1353

}

1353

}

1354

1355

/*
 * Take this lock when
 * - code modifies a page's memcg while the page_cgroup is USED, or
 * - code modifies page state accounting in a memcg.
 * See mem_cgroup_stolen(), too.
 *
 * Acquires memcg->move_lock with interrupts disabled; the previous
 * interrupt state is stored in *@flags for move_unlock_mem_cgroup().
 */
static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
				 unsigned long *flags)
{
	spin_lock_irqsave(&memcg->move_lock, *flags);
}

1366

1367

/*
 * Release memcg->move_lock and restore the interrupt state that
 * move_lock_mem_cgroup() saved in *@flags.
 */
static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
				   unsigned long *flags)
{
	spin_unlock_irqrestore(&memcg->move_lock, *flags);
}

1372

1373

/**

1373

/**

1374

* mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.

1374

* mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.

1375

* @memcg: The memory cgroup that went over limit

1375

* @memcg: The memory cgroup that went over limit

1376

* @p: Task that is going to be killed

1376

* @p: Task that is going to be killed

1377

*

1377

*

1378

* NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is

1378

* NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is

1379

* enabled

1379

* enabled

1380

*/

1380

*/

1381

void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)

1381

void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)

1382

{

1382

{

1383

struct cgroup *task_cgrp;

1383

struct cgroup *task_cgrp;

1384

struct cgroup *mem_cgrp;

1384

struct cgroup *mem_cgrp;

1385

/*

1385

/*

1386

* Need a buffer in BSS, can't rely on allocations. The code relies

1386

* Need a buffer in BSS, can't rely on allocations. The code relies

1387

* on the assumption that OOM is serialized for memory controller.

1387

* on the assumption that OOM is serialized for memory controller.

1388

* If this assumption is broken, revisit this code.

1388

* If this assumption is broken, revisit this code.

1389

*/

1389

*/

1390

static char memcg_name[PATH_MAX];

1390

static char memcg_name[PATH_MAX];

1391

int ret;

1391

int ret;

1392

1393

if (!memcg || !p)

1393

if (!memcg || !p)

1394

return;

1394

return;

1395

1396

rcu_read_lock();

1396

rcu_read_lock();

1397

1398

mem_cgrp = memcg->css.cgroup;

1398

mem_cgrp = memcg->css.cgroup;

1399

task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);

1399

task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);

1400

1401

ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);

1401

ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);

1402

if (ret < 0) {

1402

if (ret < 0) {

1403

/*

1403

/*

1404

* Unfortunately, we are unable to convert to a useful name

1404

* Unfortunately, we are unable to convert to a useful name

1405

* But we'll still print out the usage information

1405

* But we'll still print out the usage information

1406

*/

1406

*/

1407

rcu_read_unlock();

1407

rcu_read_unlock();

1408

goto done;

1408

goto done;

1409

}

1409

}

1410

rcu_read_unlock();

1410

rcu_read_unlock();

1411

1412

printk(KERN_INFO "Task in %s killed", memcg_name);

1412

printk(KERN_INFO "Task in %s killed", memcg_name);

1413

1414

rcu_read_lock();

1414

rcu_read_lock();

1415

ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);

1415

ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);

1416

if (ret < 0) {

1416

if (ret < 0) {

1417

rcu_read_unlock();

1417

rcu_read_unlock();

1418

goto done;

1418

goto done;

1419

}

1419

}

1420

rcu_read_unlock();

1420

rcu_read_unlock();

1421

1422

/*

1422

/*

1423

* Continues from above, so we don't need an KERN_ level

1423

* Continues from above, so we don't need an KERN_ level

1424

*/

1424

*/

1425

printk(KERN_CONT " as a result of limit of %s\n", memcg_name);

1425

printk(KERN_CONT " as a result of limit of %s\n", memcg_name);

1426

done:

1426

done:

1427

1428

printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",

1428

printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",

1429

res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,

1429

res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,

1430

res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,

1430

res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,

1431

res_counter_read_u64(&memcg->res, RES_FAILCNT));

1431

res_counter_read_u64(&memcg->res, RES_FAILCNT));

1432

printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "

1432

printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "

1433

"failcnt %llu\n",

1433

"failcnt %llu\n",

1434

res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,

1434

res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,

1435

res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,

1435

res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,

1436

res_counter_read_u64(&memcg->memsw, RES_FAILCNT));

1436

res_counter_read_u64(&memcg->memsw, RES_FAILCNT));

1437

}

1437

}

1438

1439

/*

1439

/*

1440

* This function returns the number of memcg under hierarchy tree. Returns

1440

* This function returns the number of memcg under hierarchy tree. Returns

1441

* 1(self count) if no children.

1441

* 1(self count) if no children.

1442

*/

1442

*/

1443

static int mem_cgroup_count_children(struct mem_cgroup *memcg)

1443

static int mem_cgroup_count_children(struct mem_cgroup *memcg)

1444

{

1444

{

1445

int num = 0;

1445

int num = 0;

1446

struct mem_cgroup *iter;

1446

struct mem_cgroup *iter;

1447

1448

for_each_mem_cgroup_tree(iter, memcg)

1448

for_each_mem_cgroup_tree(iter, memcg)

1449

num++;

1449

num++;

1450

return num;

1450

return num;

1451

}

1451

}

1452

1453

/*

1453

/*

1454

* Return the memory (and swap, if configured) limit for a memcg.

1454

* Return the memory (and swap, if configured) limit for a memcg.

1455

*/

1455

*/

1456

static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)

1456

static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)

1457

{

1457

{

1458

u64 limit;

1458

u64 limit;

1459

u64 memsw;

1459

u64 memsw;

1460

1461

limit = res_counter_read_u64(&memcg->res, RES_LIMIT);

1461

limit = res_counter_read_u64(&memcg->res, RES_LIMIT);

1462

limit += total_swap_pages << PAGE_SHIFT;

1462

limit += total_swap_pages << PAGE_SHIFT;

1463

1464

memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

1464

memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

1465

/*

1465

/*

1466

* If memsw is finite and limits the amount of swap space available

1466

* If memsw is finite and limits the amount of swap space available

1467

* to this memcg, return that limit.

1467

* to this memcg, return that limit.

1468

*/

1468

*/

1469

return min(limit, memsw);

1469

return min(limit, memsw);

1470

}

1470

}

1471

1472

/*
 * mem_cgroup_out_of_memory - pick and kill a task for a memcg OOM
 * @memcg: memcg whose hierarchy is searched for a victim
 * @gfp_mask: passed through to check_panic_on_oom() and oom_kill_process()
 * @order: passed through to check_panic_on_oom() and oom_kill_process()
 *
 * Walks every task of every memcg under @memcg, keeps a task reference
 * on the best candidate seen so far and finally hands it to
 * oom_kill_process().  Returns without killing when current already has
 * a fatal signal pending, when the scan reports OOM_SCAN_ABORT, or when
 * no candidate was found.
 */
void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
			      int order)
{
	struct mem_cgroup *iter;
	unsigned long chosen_points = 0;
	unsigned long totalpages;
	unsigned int points = 0;
	struct task_struct *chosen = NULL;

	/*
	 * If current has a pending SIGKILL, then automatically select it. The
	 * goal is to allow it to allocate so that it may quickly exit and free
	 * its memory.
	 */
	if (fatal_signal_pending(current)) {
		set_thread_flag(TIF_MEMDIE);
		return;
	}

	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
	/* Limit in pages; a zero result is replaced by 1 (GNU "?:"). */
	totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
	for_each_mem_cgroup_tree(iter, memcg) {
		struct cgroup *cgroup = iter->css.cgroup;
		struct cgroup_iter it;
		struct task_struct *task;

		cgroup_iter_start(cgroup, &it);
		while ((task = cgroup_iter_next(cgroup, &it))) {
			switch (oom_scan_process_thread(task, totalpages, NULL,
							false)) {
			case OOM_SCAN_SELECT:
				/*
				 * Take this task unconditionally: the maximal
				 * score keeps any later oom_badness() result
				 * from replacing it.
				 */
				if (chosen)
					put_task_struct(chosen);
				chosen = task;
				chosen_points = ULONG_MAX;
				get_task_struct(chosen);
				/* fall through */
			case OOM_SCAN_CONTINUE:
				continue;
			case OOM_SCAN_ABORT:
				/*
				 * Leaving both walks early: end the task
				 * iteration, drop the memcg iterator reference
				 * and the candidate reference.
				 */
				cgroup_iter_end(cgroup, &it);
				mem_cgroup_iter_break(memcg, iter);
				if (chosen)
					put_task_struct(chosen);
				return;
			case OOM_SCAN_OK:
				break;
			};
			/* Score the task; only a strictly higher score wins. */
			points = oom_badness(task, memcg, NULL, totalpages);
			if (points > chosen_points) {
				if (chosen)
					put_task_struct(chosen);
				chosen = task;
				chosen_points = points;
				get_task_struct(chosen);
			}
		}
		cgroup_iter_end(cgroup, &it);
	}

	if (!chosen)
		return;
	/* Rescale the winning score to a 0..1000 fraction of totalpages. */
	points = chosen_points * 1000 / totalpages;
	/*
	 * NOTE(review): the reference taken on chosen is not dropped here;
	 * presumably oom_kill_process() consumes it - confirm.
	 */
	oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
			 NULL, "Memory cgroup out of memory");
}

1538

1539

static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,

1539

static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,

1540

gfp_t gfp_mask,

1540

gfp_t gfp_mask,

1541

unsigned long flags)

1541

unsigned long flags)

1542

{

1542

{

1543

unsigned long total = 0;

1543

unsigned long total = 0;

1544

bool noswap = false;

1544

bool noswap = false;

1545

int loop;

1545

int loop;

1546

1547

if (flags & MEM_CGROUP_RECLAIM_NOSWAP)

1547

if (flags & MEM_CGROUP_RECLAIM_NOSWAP)

1548

noswap = true;

1548

noswap = true;

1549

if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)

1549

if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)

1550

noswap = true;

1550

noswap = true;

1551

1552

for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {

1552

for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {

1553

if (loop)

1553

if (loop)

1554

drain_all_stock_async(memcg);

1554

drain_all_stock_async(memcg);

1555

total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);

1555

total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);

1556

/*

1556

/*

1557

* Allow limit shrinkers, which are triggered directly

1557

* Allow limit shrinkers, which are triggered directly

1558

* by userspace, to catch signals and stop reclaim

1558

* by userspace, to catch signals and stop reclaim

1559

* after minimal progress, regardless of the margin.

1559

* after minimal progress, regardless of the margin.

1560

*/

1560

*/

1561

if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))

1561

if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))

1562

break;

1562

break;

1563

if (mem_cgroup_margin(memcg))

1563

if (mem_cgroup_margin(memcg))

1564

break;

1564

break;

1565

/*

1565

/*

1566

* If nothing was reclaimed after two attempts, there

1566

* If nothing was reclaimed after two attempts, there

1567

* may be no reclaimable pages in this hierarchy.

1567

* may be no reclaimable pages in this hierarchy.

1568

*/

1568

*/

1569

if (loop && !total)

1569

if (loop && !total)

1570

break;

1570

break;

1571

}

1571

}

1572

return total;

1572

return total;

1573

}

1573

}

1574

1575

/**

1575

/**

1576

* test_mem_cgroup_node_reclaimable

1576

* test_mem_cgroup_node_reclaimable

1577

* @memcg: the target memcg

1577

* @memcg: the target memcg

1578

* @nid: the node ID to be checked.

1578

* @nid: the node ID to be checked.

1579

* @noswap : specify true here if the user wants flle only information.

1579

* @noswap : specify true here if the user wants flle only information.

1580

*

1580

*

1581

* This function returns whether the specified memcg contains any

1581

* This function returns whether the specified memcg contains any

1582

* reclaimable pages on a node. Returns true if there are any reclaimable

1582

* reclaimable pages on a node. Returns true if there are any reclaimable

1583

* pages in the node.

1583

* pages in the node.

1584

*/

1584

*/

1585

static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,

1585

static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,

1586

int nid, bool noswap)

1586

int nid, bool noswap)

1587

{

1587

{

1588

if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))

1588

if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))

1589

return true;

1589

return true;

1590

if (noswap || !total_swap_pages)

1590

if (noswap || !total_swap_pages)

1591

return false;

1591

return false;

1592

if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))

1592

if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))

1593

return true;

1593

return true;

1594

return false;

1594

return false;

1595

1596

}

1596

}

1597

#if MAX_NUMNODES > 1

1597

#if MAX_NUMNODES > 1

1598

1599

/*

1599

/*

1600

* Always updating the nodemask is not very good - even if we have an empty

1600

* Always updating the nodemask is not very good - even if we have an empty

1601

* list or the wrong list here, we can start from some node and traverse all

1601

* list or the wrong list here, we can start from some node and traverse all

1602

* nodes based on the zonelist. So update the list loosely once per 10 secs.

1602

* nodes based on the zonelist. So update the list loosely once per 10 secs.

1603

*

1603

*

1604

*/

1604

*/

1605

static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)

1605

static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)

1606

{

1606

{

1607

int nid;

1607

int nid;

1608

/*

1608

/*

1609

* numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET

1609

* numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET

1610

* pagein/pageout changes since the last update.

1610

* pagein/pageout changes since the last update.

1611

*/

1611

*/

1612

if (!atomic_read(&memcg->numainfo_events))

1612

if (!atomic_read(&memcg->numainfo_events))

1613

return;

1613

return;

1614

if (atomic_inc_return(&memcg->numainfo_updating) > 1)

1614

if (atomic_inc_return(&memcg->numainfo_updating) > 1)

1615

return;

1615

return;

1616

1617

/* make a nodemask where this memcg uses memory from */

1617

/* make a nodemask where this memcg uses memory from */

1618

memcg->scan_nodes = node_states[N_HIGH_MEMORY];

1618

memcg->scan_nodes = node_states[N_HIGH_MEMORY];

1619

1620

for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {

1620

for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {

1621

1622

if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))

1622

if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))

1623

node_clear(nid, memcg->scan_nodes);

1623

node_clear(nid, memcg->scan_nodes);

1624

}

1624

}

1625

1626

atomic_set(&memcg->numainfo_events, 0);

1626

atomic_set(&memcg->numainfo_events, 0);

1627

atomic_set(&memcg->numainfo_updating, 0);

1627

atomic_set(&memcg->numainfo_updating, 0);

1628

}

1628

}

1629

1630

/*

1630

/*

1631

* Selecting a node where we start reclaim from. Because what we need is just

1631

* Selecting a node where we start reclaim from. Because what we need is just

1632

* reducing usage counter, start from anywhere is O,K. Considering

1632

* reducing usage counter, start from anywhere is O,K. Considering

1633

* memory reclaim from current node, there are pros. and cons.

1633

* memory reclaim from current node, there are pros. and cons.

1634

*

1634

*

1635

* Freeing memory from current node means freeing memory from a node which

1635

* Freeing memory from current node means freeing memory from a node which

1636

* we'll use or we've used. So, it may make LRU bad. And if several threads

1636

* we'll use or we've used. So, it may make LRU bad. And if several threads

1637

* hit limits, it will see a contention on a node. But freeing from remote

1637

* hit limits, it will see a contention on a node. But freeing from remote

1638

* node means more costs for memory reclaim because of memory latency.

1638

* node means more costs for memory reclaim because of memory latency.

1639

*

1639

*

1640

* Now, we use round-robin. Better algorithm is welcomed.

1640

* Now, we use round-robin. Better algorithm is welcomed.

1641

*/

1641

*/

1642

int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)

1642

int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)

1643

{

1643

{

1644

int node;

1644

int node;

1645

1646

mem_cgroup_may_update_nodemask(memcg);

1646

mem_cgroup_may_update_nodemask(memcg);

1647

node = memcg->last_scanned_node;

1647

node = memcg->last_scanned_node;

1648

1649

node = next_node(node, memcg->scan_nodes);

1649

node = next_node(node, memcg->scan_nodes);

1650

if (node == MAX_NUMNODES)

1650

if (node == MAX_NUMNODES)

1651

node = first_node(memcg->scan_nodes);

1651

node = first_node(memcg->scan_nodes);

1652

/*

1652

/*

1653

* We call this when we hit limit, not when pages are added to LRU.

1653

* We call this when we hit limit, not when pages are added to LRU.

1654

* No LRU may hold pages because all pages are UNEVICTABLE or

1654

* No LRU may hold pages because all pages are UNEVICTABLE or

1655

* memcg is too small and all pages are not on LRU. In that case,

1655

* memcg is too small and all pages are not on LRU. In that case,

1656

* we use curret node.

1656

* we use curret node.

1657

*/

1657

*/

1658

if (unlikely(node == MAX_NUMNODES))

1658

if (unlikely(node == MAX_NUMNODES))

1659

node = numa_node_id();

1659

node = numa_node_id();

1660

1661

memcg->last_scanned_node = node;

1661

memcg->last_scanned_node = node;

1662

return node;

1662

return node;

1663

}

1663

}

1664

1665

/*

1665

/*

1666

* Check all nodes whether it contains reclaimable pages or not.

1666

* Check all nodes whether it contains reclaimable pages or not.

1667

* For quick scan, we make use of scan_nodes. This will allow us to skip

1667

* For quick scan, we make use of scan_nodes. This will allow us to skip

1668

* unused nodes. But scan_nodes is lazily updated and may not cotain

1668

* unused nodes. But scan_nodes is lazily updated and may not cotain

1669

* enough new information. We need to do double check.

1669

* enough new information. We need to do double check.

1670

*/

1670

*/

1671

static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)

1671

static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)

1672

{

1672

{

1673

int nid;

1673

int nid;

1674

1675

/*

1675

/*

1676

* quick check...making use of scan_node.

1676

* quick check...making use of scan_node.

1677

* We can skip unused nodes.

1677

* We can skip unused nodes.

1678

*/

1678

*/

1679

if (!nodes_empty(memcg->scan_nodes)) {

1679

if (!nodes_empty(memcg->scan_nodes)) {

1680

for (nid = first_node(memcg->scan_nodes);

1680

for (nid = first_node(memcg->scan_nodes);

1681

nid < MAX_NUMNODES;

1681

nid < MAX_NUMNODES;

1682

nid = next_node(nid, memcg->scan_nodes)) {

1682

nid = next_node(nid, memcg->scan_nodes)) {

1683

1684

if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))

1684

if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))

1685

return true;

1685

return true;

1686

}

1686

}

1687

}

1687

}

1688

/*

1688

/*

1689

* Check rest of nodes.

1689

* Check rest of nodes.

1690

*/

1690

*/

1691

for_each_node_state(nid, N_HIGH_MEMORY) {

1691

for_each_node_state(nid, N_HIGH_MEMORY) {

1692

if (node_isset(nid, memcg->scan_nodes))

1692

if (node_isset(nid, memcg->scan_nodes))

1693

continue;

1693

continue;

1694

if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))

1694

if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))

1695

return true;

1695

return true;

1696

}

1696

}

1697

return false;

1697

return false;

1698

}

1698

}

1699

1700

#else

1700

#else

1701

/*
 * Variant for builds where MAX_NUMNODES is not greater than 1:
 * there is nothing to choose between, so node 0 is always returned.
 */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	return 0;
}

1705

1706

/*
 * Variant for builds where MAX_NUMNODES is not greater than 1:
 * only node 0 needs to be tested for reclaimable pages.
 */
static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
{
	const int only_node = 0;

	return test_mem_cgroup_node_reclaimable(memcg, only_node, noswap);
}

1710

#endif

1710

#endif

1711

1712

static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,

1712

static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,

1713

struct zone *zone,

1713

struct zone *zone,

1714

gfp_t gfp_mask,

1714

gfp_t gfp_mask,

1715

unsigned long *total_scanned)

1715

unsigned long *total_scanned)

1716

{

1716

{

1717

struct mem_cgroup *victim = NULL;

1717

struct mem_cgroup *victim = NULL;

1718

int total = 0;

1718

int total = 0;

1719

int loop = 0;

1719

int loop = 0;

1720

unsigned long excess;

1720

unsigned long excess;

1721

unsigned long nr_scanned;

1721

unsigned long nr_scanned;

1722

struct mem_cgroup_reclaim_cookie reclaim = {

1722

struct mem_cgroup_reclaim_cookie reclaim = {

1723

.zone = zone,

1723

.zone = zone,

1724

.priority = 0,

1724

.priority = 0,

1725

};

1725

};

1726

1727

excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;

1727

excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;

1728

1729

while (1) {

1729

while (1) {

1730

victim = mem_cgroup_iter(root_memcg, victim, &reclaim);

1730

victim = mem_cgroup_iter(root_memcg, victim, &reclaim);

1731

if (!victim) {

1731

if (!victim) {

1732

loop++;

1732

loop++;

1733

if (loop >= 2) {

1733

if (loop >= 2) {

1734

/*

1734

/*

1735

* If we have not been able to reclaim

1735

* If we have not been able to reclaim

1736

* anything, it might because there are

1736

* anything, it might because there are

1737

* no reclaimable pages under this hierarchy

1737

* no reclaimable pages under this hierarchy

1738

*/

1738

*/

1739

if (!total)

1739

if (!total)

1740

break;

1740

break;

1741

/*

1741

/*

1742

* We want to do more targeted reclaim.

1742

* We want to do more targeted reclaim.

1743

* excess >> 2 is not to excessive so as to

1743

* excess >> 2 is not to excessive so as to

1744

* reclaim too much, nor too less that we keep

1744

* reclaim too much, nor too less that we keep

1745

* coming back to reclaim from this cgroup

1745

* coming back to reclaim from this cgroup

1746

*/

1746

*/

1747

if (total >= (excess >> 2) ||

1747

if (total >= (excess >> 2) ||

1748

(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))

1748

(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))

1749

break;

1749

break;

1750

}

1750

}

1751

continue;

1751

continue;

1752

}

1752

}

1753

if (!mem_cgroup_reclaimable(victim, false))

1753

if (!mem_cgroup_reclaimable(victim, false))

1754

continue;

1754

continue;

1755

total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,

1755

total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,

1756

zone, &nr_scanned);

1756

zone, &nr_scanned);

1757

*total_scanned += nr_scanned;

1757

*total_scanned += nr_scanned;

1758

if (!res_counter_soft_limit_excess(&root_memcg->res))

1758

if (!res_counter_soft_limit_excess(&root_memcg->res))

1759

break;

1759

break;

1760

}

1760

}

1761

mem_cgroup_iter_break(root_memcg, victim);

1761

mem_cgroup_iter_break(root_memcg, victim);

1762

return total;

1762

return total;

1763

}

1763

}

1764

1765

/*

1765

/*

1766

* Check OOM-Killer is already running under our hierarchy.

1766

* Check OOM-Killer is already running under our hierarchy.

1767

* If someone is running, return false.

1767

* If someone is running, return false.

1768

* Has to be called with memcg_oom_lock

1768

* Has to be called with memcg_oom_lock

1769

*/

1769

*/

1770

static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)

1770

static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)

1771

{

1771

{

1772

struct mem_cgroup *iter, *failed = NULL;

1772

struct mem_cgroup *iter, *failed = NULL;

1773

1774

for_each_mem_cgroup_tree(iter, memcg) {

1774

for_each_mem_cgroup_tree(iter, memcg) {

1775

if (iter->oom_lock) {

1775

if (iter->oom_lock) {

1776

/*

1776

/*

1777

* this subtree of our hierarchy is already locked

1777

* this subtree of our hierarchy is already locked

1778

* so we cannot give a lock.

1778

* so we cannot give a lock.

1779

*/

1779

*/

1780

failed = iter;

1780

failed = iter;

1781

mem_cgroup_iter_break(memcg, iter);

1781

mem_cgroup_iter_break(memcg, iter);

1782

break;

1782

break;

1783

} else

1783

} else

1784

iter->oom_lock = true;

1784

iter->oom_lock = true;

1785

}

1785

}

1786

1787

if (!failed)

1787

if (!failed)

1788

return true;

1788

return true;

1789

1790

/*

1790

/*

1791

* OK, we failed to lock the whole subtree so we have to clean up

1791

* OK, we failed to lock the whole subtree so we have to clean up

1792

* what we set up to the failing subtree

1792

* what we set up to the failing subtree

1793

*/

1793

*/

1794

for_each_mem_cgroup_tree(iter, memcg) {

1794

for_each_mem_cgroup_tree(iter, memcg) {

1795

if (iter == failed) {

1795

if (iter == failed) {

1796

mem_cgroup_iter_break(memcg, iter);

1796

mem_cgroup_iter_break(memcg, iter);

1797

break;

1797

break;

1798

}

1798

}

1799

iter->oom_lock = false;

1799

iter->oom_lock = false;

1800

}

1800

}

1801

return false;

1801

return false;

1802

}

1802

}

1803

1804

/*

1804

/*

1805

* Has to be called with memcg_oom_lock

1805

* Has to be called with memcg_oom_lock

1806

*/

1806

*/

1807

static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)

1807

static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)

1808

{

1808

{

1809

struct mem_cgroup *iter;

1809

struct mem_cgroup *iter;

1810

1811

for_each_mem_cgroup_tree(iter, memcg)

1811

for_each_mem_cgroup_tree(iter, memcg)

1812

iter->oom_lock = false;

1812

iter->oom_lock = false;

1813

return 0;

1813

return 0;

1814

}

1814

}

1815

1816

static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)

1816

static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)

1817

{

1817

{

1818

struct mem_cgroup *iter;

1818

struct mem_cgroup *iter;

1819

1820

for_each_mem_cgroup_tree(iter, memcg)

1820

for_each_mem_cgroup_tree(iter, memcg)

1821

atomic_inc(&iter->under_oom);

1821

atomic_inc(&iter->under_oom);

1822

}

1822

}

1823

1824

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)

1824

static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)

1825

{

1825

{

1826

struct mem_cgroup *iter;

1826

struct mem_cgroup *iter;

1827

1828

/*

1828

/*

1829

* When a new child is created while the hierarchy is under oom,

1829

* When a new child is created while the hierarchy is under oom,

1830

* mem_cgroup_oom_lock() may not be called. We have to use

1830

* mem_cgroup_oom_lock() may not be called. We have to use

1831

* atomic_add_unless() here.

1831

* atomic_add_unless() here.

1832

*/

1832

*/

1833

for_each_mem_cgroup_tree(iter, memcg)

1833

for_each_mem_cgroup_tree(iter, memcg)

1834

atomic_add_unless(&iter->under_oom, -1, 0);

1834

atomic_add_unless(&iter->under_oom, -1, 0);

1835

}

1835

}

1836

1837

static DEFINE_SPINLOCK(memcg_oom_lock);

1837

static DEFINE_SPINLOCK(memcg_oom_lock);

1838

static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

1838

static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

1839

1840

struct oom_wait_info {

1840

struct oom_wait_info {

1841

struct mem_cgroup *memcg;

1841

struct mem_cgroup *memcg;

1842

wait_queue_t wait;

1842

wait_queue_t wait;

1843

};

1843

};

1844

1845

/*
 * Wake callback for entries on memcg_oom_waitq.
 *
 * @arg is the memcg passed to __wake_up().  The waiter is woken only if
 * its memcg and the waking memcg are the same or one is in the subtree of
 * the other; otherwise 0 is returned and the waiter keeps sleeping.
 */
static int memcg_oom_wake_function(wait_queue_t *wait,
				   unsigned mode, int sync, void *arg)
{
	struct mem_cgroup *wake_memcg = arg;
	struct oom_wait_info *info;
	struct mem_cgroup *waiter_memcg;

	info = container_of(wait, struct oom_wait_info, wait);
	waiter_memcg = info->memcg;

	/*
	 * Both info->memcg and wake_memcg are stable under us, so
	 * mem_cgroup_same_or_subtree() is called here without any RCU
	 * protection of our own.
	 */
	if (mem_cgroup_same_or_subtree(waiter_memcg, wake_memcg) ||
	    mem_cgroup_same_or_subtree(wake_memcg, waiter_memcg))
		return autoremove_wake_function(wait, mode, sync, arg);

	return 0;
}

1864

1865

static void memcg_wakeup_oom(struct mem_cgroup *memcg)

1865

static void memcg_wakeup_oom(struct mem_cgroup *memcg)

1866

{

1866

{

1867

/* for filtering, pass "memcg" as argument. */

1867

/* for filtering, pass "memcg" as argument. */

1868

__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);

1868

__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);

1869

}

1869

}

1870

1871

static void memcg_oom_recover(struct mem_cgroup *memcg)

1871

static void memcg_oom_recover(struct mem_cgroup *memcg)

1872

{

1872

{

1873

if (memcg && atomic_read(&memcg->under_oom))

1873

if (memcg && atomic_read(&memcg->under_oom))

1874

memcg_wakeup_oom(memcg);

1874

memcg_wakeup_oom(memcg);

1875

}

1875

}

1876

1877

/*

1877

/*

1878

* try to call OOM killer. returns false if we should exit memory-reclaim loop.

1878

* try to call OOM killer. returns false if we should exit memory-reclaim loop.

1879

*/

1879

*/

1880

static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,

1880

static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,

1881

int order)

1881

int order)

1882

{

1882

{

1883

struct oom_wait_info owait;

1883

struct oom_wait_info owait;

1884

bool locked, need_to_kill;

1884

bool locked, need_to_kill;

1885

1886

owait.memcg = memcg;

1886

owait.memcg = memcg;

1887

owait.wait.flags = 0;

1887

owait.wait.flags = 0;

1888

owait.wait.func = memcg_oom_wake_function;

1888

owait.wait.func = memcg_oom_wake_function;

1889

owait.wait.private = current;

1889

owait.wait.private = current;

1890

INIT_LIST_HEAD(&owait.wait.task_list);

1890

INIT_LIST_HEAD(&owait.wait.task_list);

1891

need_to_kill = true;

1891

need_to_kill = true;

1892

mem_cgroup_mark_under_oom(memcg);

1892

mem_cgroup_mark_under_oom(memcg);

1893

1894

/* At first, try to OOM lock hierarchy under memcg.*/

1894

/* At first, try to OOM lock hierarchy under memcg.*/

1895

spin_lock(&memcg_oom_lock);

1895

spin_lock(&memcg_oom_lock);

1896

locked = mem_cgroup_oom_lock(memcg);

1896

locked = mem_cgroup_oom_lock(memcg);

1897

/*

1897

/*

1898

* Even if signal_pending(), we can't quit charge() loop without

1898

* Even if signal_pending(), we can't quit charge() loop without

1899

* accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL

1899

* accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL

1900

* under OOM is always welcomed, use TASK_KILLABLE here.

1900

* under OOM is always welcomed, use TASK_KILLABLE here.

1901

*/

1901

*/

1902

prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);

1902

prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);

1903

if (!locked || memcg->oom_kill_disable)

1903

if (!locked || memcg->oom_kill_disable)

1904

need_to_kill = false;

1904

need_to_kill = false;

1905

if (locked)

1905

if (locked)

1906

mem_cgroup_oom_notify(memcg);

1906

mem_cgroup_oom_notify(memcg);

1907

spin_unlock(&memcg_oom_lock);

1907

spin_unlock(&memcg_oom_lock);

1908

1909

if (need_to_kill) {

1909

if (need_to_kill) {

1910

finish_wait(&memcg_oom_waitq, &owait.wait);

1910

finish_wait(&memcg_oom_waitq, &owait.wait);

1911

mem_cgroup_out_of_memory(memcg, mask, order);

1911

mem_cgroup_out_of_memory(memcg, mask, order);

1912

} else {

1912

} else {

1913

schedule();

1913

schedule();

1914

finish_wait(&memcg_oom_waitq, &owait.wait);

1914

finish_wait(&memcg_oom_waitq, &owait.wait);

1915

}

1915

}

1916

spin_lock(&memcg_oom_lock);

1916

spin_lock(&memcg_oom_lock);

1917

if (locked)

1917

if (locked)

1918

mem_cgroup_oom_unlock(memcg);

1918

mem_cgroup_oom_unlock(memcg);

1919

memcg_wakeup_oom(memcg);

1919

memcg_wakeup_oom(memcg);

1920

spin_unlock(&memcg_oom_lock);

1920

spin_unlock(&memcg_oom_lock);

1921

1922

mem_cgroup_unmark_under_oom(memcg);

1922

mem_cgroup_unmark_under_oom(memcg);

1923

1924

if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))

1924

if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))

1925

return false;

1925

return false;

1926

/* Give chance to dying process */

1926

/* Give chance to dying process */

1927

schedule_timeout_uninterruptible(1);

1927

schedule_timeout_uninterruptible(1);

1928

return true;

1928

return true;

1929

}

1929

}

1930

1931

/*

1931

/*

1932

* Currently used to update mapped file statistics, but the routine can be

1932

* Currently used to update mapped file statistics, but the routine can be

1933

* generalized to update other statistics as well.

1933

* generalized to update other statistics as well.

1934

*

1934

*

1935

* Notes: Race condition

1935

* Notes: Race condition

1936

*

1936

*

1937

* We usually use page_cgroup_lock() for accessing page_cgroup member but

1937

* We usually use page_cgroup_lock() for accessing page_cgroup member but

1938

* it tends to be costly. But considering some conditions, we doesn't need

1938

* it tends to be costly. But considering some conditions, we doesn't need

1939

* to do so _always_.

1939

* to do so _always_.

1940

*

1940

*

1941

* Considering "charge", lock_page_cgroup() is not required because all

1941

* Considering "charge", lock_page_cgroup() is not required because all

1942

* file-stat operations happen after a page is attached to radix-tree. There

1942

* file-stat operations happen after a page is attached to radix-tree. There

1943

* are no race with "charge".

1943

* are no race with "charge".

1944

*

1944

*

1945

* Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup

1945

* Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup

1946

* at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even

1946

* at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even

1947

* if there are race with "uncharge". Statistics itself is properly handled

1947

* if there are race with "uncharge". Statistics itself is properly handled

1948

* by flags.

1948

* by flags.

1949

*

1949

*

1950

* Considering "move", this is an only case we see a race. To make the race

1950

* Considering "move", this is an only case we see a race. To make the race

1951

* small, we check mm->moving_account and detect there are possibility of race

1951

* small, we check mm->moving_account and detect there are possibility of race

1952

* If there is, we take a lock.

1952

* If there is, we take a lock.

1953

*/

1953

*/

1954

1955

void __mem_cgroup_begin_update_page_stat(struct page *page,

1955

void __mem_cgroup_begin_update_page_stat(struct page *page,

1956

bool *locked, unsigned long *flags)

1956

bool *locked, unsigned long *flags)

1957

{

1957

{

1958

struct mem_cgroup *memcg;

1958

struct mem_cgroup *memcg;

1959

struct page_cgroup *pc;

1959

struct page_cgroup *pc;

1960

1961

pc = lookup_page_cgroup(page);

1961

pc = lookup_page_cgroup(page);

1962

again:

1962

again:

1963

memcg = pc->mem_cgroup;

1963

memcg = pc->mem_cgroup;

1964

if (unlikely(!memcg || !PageCgroupUsed(pc)))

1964

if (unlikely(!memcg || !PageCgroupUsed(pc)))

1965

return;

1965

return;

1966

/*

1966

/*

1967

* If this memory cgroup is not under account moving, we don't

1967

* If this memory cgroup is not under account moving, we don't

1968

* need to take move_lock_mem_cgroup(). Because we already hold

1968

* need to take move_lock_mem_cgroup(). Because we already hold

1969

* rcu_read_lock(), any calls to move_account will be delayed until

1969

* rcu_read_lock(), any calls to move_account will be delayed until

1970

* rcu_read_unlock() if mem_cgroup_stolen() == true.

1970

* rcu_read_unlock() if mem_cgroup_stolen() == true.

1971

*/

1971

*/

1972

if (!mem_cgroup_stolen(memcg))

1972

if (!mem_cgroup_stolen(memcg))

1973

return;

1973

return;

1974

1975

move_lock_mem_cgroup(memcg, flags);

1975

move_lock_mem_cgroup(memcg, flags);

1976

if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {

1976

if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {

1977

move_unlock_mem_cgroup(memcg, flags);

1977

move_unlock_mem_cgroup(memcg, flags);

1978

goto again;

1978

goto again;

1979

}

1979

}

1980

*locked = true;

1980

*locked = true;

1981

}

1981

}

1982

1983

void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)

1983

void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)

1984

{

1984

{

1985

struct page_cgroup *pc = lookup_page_cgroup(page);

1985

struct page_cgroup *pc = lookup_page_cgroup(page);

1986

1987

/*

1987

/*

1988

* It's guaranteed that pc->mem_cgroup never changes while

1988

* It's guaranteed that pc->mem_cgroup never changes while

1989

* lock is held because a routine modifies pc->mem_cgroup

1989

* lock is held because a routine modifies pc->mem_cgroup

1990

* should take move_lock_mem_cgroup().

1990

* should take move_lock_mem_cgroup().

1991

*/

1991

*/

1992

move_unlock_mem_cgroup(pc->mem_cgroup, flags);

1992

move_unlock_mem_cgroup(pc->mem_cgroup, flags);

1993

}

1993

}

1994

1995

void mem_cgroup_update_page_stat(struct page *page,

1995

void mem_cgroup_update_page_stat(struct page *page,

1996

enum mem_cgroup_page_stat_item idx, int val)

1996

enum mem_cgroup_page_stat_item idx, int val)

1997

{

1997

{

1998

struct mem_cgroup *memcg;

1998

struct mem_cgroup *memcg;

1999

struct page_cgroup *pc = lookup_page_cgroup(page);

1999

struct page_cgroup *pc = lookup_page_cgroup(page);

2000

unsigned long uninitialized_var(flags);

2000

unsigned long uninitialized_var(flags);

2001

2002

if (mem_cgroup_disabled())

2002

if (mem_cgroup_disabled())

2003

return;

2003

return;

2004

2005

memcg = pc->mem_cgroup;

2005

memcg = pc->mem_cgroup;

2006

if (unlikely(!memcg || !PageCgroupUsed(pc)))

2006

if (unlikely(!memcg || !PageCgroupUsed(pc)))

2007

return;

2007

return;

2008

2009

switch (idx) {

2009

switch (idx) {

2010

case MEMCG_NR_FILE_MAPPED:

2010

case MEMCG_NR_FILE_MAPPED:

2011

idx = MEM_CGROUP_STAT_FILE_MAPPED;

2011

idx = MEM_CGROUP_STAT_FILE_MAPPED;

2012

break;

2012

break;

2013

default:

2013

default:

2014

BUG();

2014

BUG();

2015

}

2015

}

2016

2017

this_cpu_add(memcg->stat->count[idx], val);

2017

this_cpu_add(memcg->stat->count[idx], val);

2018

}

2018

}

2019

2020

/*

2020

/*

2021

* size of first charge trial. "32" comes from vmscan.c's magic value.

2021

* size of first charge trial. "32" comes from vmscan.c's magic value.

2022

* TODO: maybe necessary to use big numbers in big irons.

2022

* TODO: maybe necessary to use big numbers in big irons.

2023

*/

2023

*/

2024

#define CHARGE_BATCH 32U

2024

#define CHARGE_BATCH 32U

2025

struct memcg_stock_pcp {

2025

struct memcg_stock_pcp {

2026

struct mem_cgroup *cached; /* this never be root cgroup */

2026

struct mem_cgroup *cached; /* this never be root cgroup */

2027

unsigned int nr_pages;

2027

unsigned int nr_pages;

2028

struct work_struct work;

2028

struct work_struct work;

2029

unsigned long flags;

2029

unsigned long flags;

2030

#define FLUSHING_CACHED_CHARGE 0

2030

#define FLUSHING_CACHED_CHARGE 0

2031

};

2031

};

2032

static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);

2032

static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);

2033

static DEFINE_MUTEX(percpu_charge_mutex);

2033

static DEFINE_MUTEX(percpu_charge_mutex);

2034

2035

/*

2035

/*

2036

* Try to consume stocked charge on this cpu. If success, one page is consumed

2036

* Try to consume stocked charge on this cpu. If success, one page is consumed

2037

* from local stock and true is returned. If the stock is 0 or charges from a

2037

* from local stock and true is returned. If the stock is 0 or charges from a

2038

* cgroup which is not current target, returns false. This stock will be

2038

* cgroup which is not current target, returns false. This stock will be

2039

* refilled.

2039

* refilled.

2040

*/

2040

*/

2041

static bool consume_stock(struct mem_cgroup *memcg)

2041

static bool consume_stock(struct mem_cgroup *memcg)

2042

{

2042

{

2043

struct memcg_stock_pcp *stock;

2043

struct memcg_stock_pcp *stock;

2044

bool ret = true;

2044

bool ret = true;

2045

2046

stock = &get_cpu_var(memcg_stock);

2046

stock = &get_cpu_var(memcg_stock);

2047

if (memcg == stock->cached && stock->nr_pages)

2047

if (memcg == stock->cached && stock->nr_pages)

2048

stock->nr_pages--;

2048

stock->nr_pages--;

2049

else /* need to call res_counter_charge */

2049

else /* need to call res_counter_charge */

2050

ret = false;

2050

ret = false;

2051

put_cpu_var(memcg_stock);

2051

put_cpu_var(memcg_stock);

2052

return ret;

2052

return ret;

2053

}

2053

}

2054

2055

/*

2055

/*

2056

* Returns stocks cached in percpu to res_counter and reset cached information.

2056

* Returns stocks cached in percpu to res_counter and reset cached information.

2057

*/

2057

*/

2058

static void drain_stock(struct memcg_stock_pcp *stock)

2058

static void drain_stock(struct memcg_stock_pcp *stock)

2059

{

2059

{

2060

struct mem_cgroup *old = stock->cached;

2060

struct mem_cgroup *old = stock->cached;

2061

2062

if (stock->nr_pages) {

2062

if (stock->nr_pages) {

2063

unsigned long bytes = stock->nr_pages * PAGE_SIZE;

2063

unsigned long bytes = stock->nr_pages * PAGE_SIZE;

2064

2065

res_counter_uncharge(&old->res, bytes);

2065

res_counter_uncharge(&old->res, bytes);

2066

if (do_swap_account)

2066

if (do_swap_account)

2067

res_counter_uncharge(&old->memsw, bytes);

2067

res_counter_uncharge(&old->memsw, bytes);

2068

stock->nr_pages = 0;

2068

stock->nr_pages = 0;

2069

}

2069

}

2070

stock->cached = NULL;

2070

stock->cached = NULL;

2071

}

2071

}

2072

2073

/*

2073

/*

2074

* This must be called under preempt disabled or must be called by

2074

* This must be called under preempt disabled or must be called by

2075

* a thread which is pinned to local cpu.

2075

* a thread which is pinned to local cpu.

2076

*/

2076

*/

2077

static void drain_local_stock(struct work_struct *dummy)

2077

static void drain_local_stock(struct work_struct *dummy)

2078

{

2078

{

2079

struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);

2079

struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);

2080

drain_stock(stock);

2080

drain_stock(stock);

2081

clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);

2081

clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);

2082

}

2082

}

2083

2084

/*

2084

/*

2085

* Cache charges(val) which is from res_counter, to local per_cpu area.

2085

* Cache charges(val) which is from res_counter, to local per_cpu area.

2086

* This will be consumed by consume_stock() function, later.

2086

* This will be consumed by consume_stock() function, later.

2087

*/

2087

*/

2088

static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)

2088

static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)

2089

{

2089

{

2090

struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);

2090

struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);

2091

2092

if (stock->cached != memcg) { /* reset if necessary */

2092

if (stock->cached != memcg) { /* reset if necessary */

2093

drain_stock(stock);

2093

drain_stock(stock);

2094

stock->cached = memcg;

2094

stock->cached = memcg;

2095

}

2095

}

2096

stock->nr_pages += nr_pages;

2096

stock->nr_pages += nr_pages;

2097

put_cpu_var(memcg_stock);

2097

put_cpu_var(memcg_stock);

2098

}

2098

}

2099

2100

/*

2100

/*

2101

* Drains all per-CPU charge caches for given root_memcg resp. subtree

2101

* Drains all per-CPU charge caches for given root_memcg resp. subtree

2102

* of the hierarchy under it. sync flag says whether we should block

2102

* of the hierarchy under it. sync flag says whether we should block

2103

* until the work is done.

2103

* until the work is done.

2104

*/

2104

*/

2105

static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)

2105

static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)

2106

{

2106

{

2107

int cpu, curcpu;

2107

int cpu, curcpu;

2108

2109

/* Notify other cpus that system-wide "drain" is running */

2109

/* Notify other cpus that system-wide "drain" is running */

2110

get_online_cpus();

2110

get_online_cpus();

2111

curcpu = get_cpu();

2111

curcpu = get_cpu();

2112

for_each_online_cpu(cpu) {

2112

for_each_online_cpu(cpu) {

2113

struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);

2113

struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);

2114

struct mem_cgroup *memcg;

2114

struct mem_cgroup *memcg;

2115

2116

memcg = stock->cached;

2116

memcg = stock->cached;

2117

if (!memcg || !stock->nr_pages)

2117

if (!memcg || !stock->nr_pages)

2118

continue;

2118

continue;

2119

if (!mem_cgroup_same_or_subtree(root_memcg, memcg))

2119

if (!mem_cgroup_same_or_subtree(root_memcg, memcg))

2120

continue;

2120

continue;

2121

if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {

2121

if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {

2122

if (cpu == curcpu)

2122

if (cpu == curcpu)

2123

drain_local_stock(&stock->work);

2123

drain_local_stock(&stock->work);

2124

else

2124

else

2125

schedule_work_on(cpu, &stock->work);

2125

schedule_work_on(cpu, &stock->work);

2126

}

2126

}

2127

}

2127

}

2128

put_cpu();

2128

put_cpu();

2129

2130

if (!sync)

2130

if (!sync)

2131

goto out;

2131

goto out;

2132

2133

for_each_online_cpu(cpu) {

2133

for_each_online_cpu(cpu) {

2134

struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);

2134

struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);

2135

if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))

2135

if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))

2136

flush_work(&stock->work);

2136

flush_work(&stock->work);

2137

}

2137

}

2138

out:

2138

out:

2139

put_online_cpus();

2139

put_online_cpus();

2140

}

2140

}

2141

2142

/*

2142

/*

2143

* Tries to drain stocked charges in other cpus. This function is asynchronous

2143

* Tries to drain stocked charges in other cpus. This function is asynchronous

2144

* and just put a work per cpu for draining localy on each cpu. Caller can

2144

* and just put a work per cpu for draining localy on each cpu. Caller can

2145

* expects some charges will be back to res_counter later but cannot wait for

2145

* expects some charges will be back to res_counter later but cannot wait for

2146

* it.

2146

* it.

2147

*/

2147

*/

2148

static void drain_all_stock_async(struct mem_cgroup *root_memcg)

2148

static void drain_all_stock_async(struct mem_cgroup *root_memcg)

2149

{

2149

{

2150

/*

2150

/*

2151

* If someone calls draining, avoid adding more kworker runs.

2151

* If someone calls draining, avoid adding more kworker runs.

2152

*/

2152

*/

2153

if (!mutex_trylock(&percpu_charge_mutex))

2153

if (!mutex_trylock(&percpu_charge_mutex))

2154

return;

2154

return;

2155

drain_all_stock(root_memcg, false);

2155

drain_all_stock(root_memcg, false);

2156

mutex_unlock(&percpu_charge_mutex);

2156

mutex_unlock(&percpu_charge_mutex);

2157

}

2157

}

2158

2159

/* This is a synchronous drain interface. */

2159

/* This is a synchronous drain interface. */

2160

static void drain_all_stock_sync(struct mem_cgroup *root_memcg)

2160

static void drain_all_stock_sync(struct mem_cgroup *root_memcg)

2161

{

2161

{

2162

/* called when force_empty is called */

2162

/* called when force_empty is called */

2163

mutex_lock(&percpu_charge_mutex);

2163

mutex_lock(&percpu_charge_mutex);

2164

drain_all_stock(root_memcg, true);

2164

drain_all_stock(root_memcg, true);

2165

mutex_unlock(&percpu_charge_mutex);

2165

mutex_unlock(&percpu_charge_mutex);

2166

}

2166

}

2167

2168

/*

2168

/*

2169

* This function drains percpu counter value from DEAD cpu and

2169

* This function drains percpu counter value from DEAD cpu and

2170

* move it to local cpu. Note that this function can be preempted.

2170

* move it to local cpu. Note that this function can be preempted.

2171

*/

2171

*/

2172

static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)

2172

static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)

2173

{

2173

{

2174

int i;

2174

int i;

2175

2176

spin_lock(&memcg->pcp_counter_lock);

2176

spin_lock(&memcg->pcp_counter_lock);

2177

for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {

2177

for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {

2178

long x = per_cpu(memcg->stat->count[i], cpu);

2178

long x = per_cpu(memcg->stat->count[i], cpu);

2179

2180

per_cpu(memcg->stat->count[i], cpu) = 0;

2180

per_cpu(memcg->stat->count[i], cpu) = 0;

2181

memcg->nocpu_base.count[i] += x;

2181

memcg->nocpu_base.count[i] += x;

2182

}

2182

}

2183

for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {

2183

for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {

2184

unsigned long x = per_cpu(memcg->stat->events[i], cpu);

2184

unsigned long x = per_cpu(memcg->stat->events[i], cpu);

2185

2186

per_cpu(memcg->stat->events[i], cpu) = 0;

2186

per_cpu(memcg->stat->events[i], cpu) = 0;

2187

memcg->nocpu_base.events[i] += x;

2187

memcg->nocpu_base.events[i] += x;

2188

}

2188

}

2189

spin_unlock(&memcg->pcp_counter_lock);

2189

spin_unlock(&memcg->pcp_counter_lock);

2190

}

2190

}

2191

2192

/*
 * CPU hotplug notifier callback.
 *
 * Only CPU_DEAD and CPU_DEAD_FROZEN are acted upon: the per-cpu counters
 * of every mem_cgroup are folded away from the dead cpu and the dead cpu's
 * charge stock is drained. Every other action is ignored.
 *
 * @hcpu carries the cpu number. Always returns NOTIFY_OK.
 */
static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
					unsigned long action,
					void *hcpu)
{
	int cpu = (unsigned long)hcpu;
	struct mem_cgroup *iter;

	switch (action) {
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		break;
	default:
		/* CPU_ONLINE and everything else: nothing to do. */
		return NOTIFY_OK;
	}

	for_each_mem_cgroup(iter)
		mem_cgroup_drain_pcp_counter(iter, cpu);

	drain_stock(&per_cpu(memcg_stock, cpu));
	return NOTIFY_OK;
}

2213

2214

2215

/*
 * Results of mem_cgroup_do_charge(). See __mem_cgroup_try_charge() for how
 * each of them is handled.
 */
enum {
	CHARGE_OK,		/* the charge succeeded */
	CHARGE_RETRY,		/* retry is needed, but retrying is not bad */
	CHARGE_NOMEM,		/* nothing more can be done: -ENOMEM */
	CHARGE_WOULDBLOCK,	/* __GFP_WAIT was not set and resources are short */
	CHARGE_OOM_DIE,		/* current was killed because of OOM */
};

2223

2224

/*
 * Make one attempt at charging @nr_pages pages to @memcg, reclaiming from
 * the cgroup that is over its limit if the charge does not fit.
 *
 * @gfp_mask:  allocation context; without __GFP_WAIT no reclaim is done.
 * @nr_pages:  a huge page (HPAGE_PMD_NR), a batch of regular pages
 *             (CHARGE_BATCH), or a single regular page (1).
 * @oom_check: whether the OOM handler may be invoked when reclaim fails.
 *
 * Returns one of the CHARGE_* codes.
 */
static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
				unsigned int nr_pages, bool oom_check)
{
	unsigned long csize = nr_pages * PAGE_SIZE;
	struct mem_cgroup *mem_over_limit;
	struct res_counter *fail_res;
	unsigned long reclaim_flags = 0;
	int ret;

	ret = res_counter_charge(&memcg->res, csize, &fail_res);
	if (unlikely(ret)) {
		/* The memory counter of some ancestor is over its limit. */
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
	} else {
		if (!do_swap_account)
			return CHARGE_OK;

		ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
		if (likely(!ret))
			return CHARGE_OK;

		/* memsw failed: undo the memory charge taken just above. */
		res_counter_uncharge(&memcg->res, csize);
		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
		reclaim_flags |= MEM_CGROUP_RECLAIM_NOSWAP;
	}

	/*
	 * Never reclaim on behalf of optional batching: let the caller
	 * retry with a single page instead.
	 */
	if (nr_pages == CHARGE_BATCH)
		return CHARGE_RETRY;

	if (!(gfp_mask & __GFP_WAIT))
		return CHARGE_WOULDBLOCK;

	ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, reclaim_flags);
	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
		return CHARGE_RETRY;

	/*
	 * Even though the limit is exceeded at this point, reclaim may
	 * have been able to free some pages. Retry the charge before
	 * killing the task.
	 *
	 * Only for regular pages, though: huge pages are rather unlikely
	 * to succeed so close to the limit, and we fall back to regular
	 * pages anyway in case of failure.
	 */
	if (nr_pages == 1 && ret)
		return CHARGE_RETRY;

	/*
	 * At task move, charge accounts can be doubly counted. So it is
	 * better to wait until the end of task_move if one is going on.
	 */
	if (mem_cgroup_wait_acct_move(mem_over_limit))
		return CHARGE_RETRY;

	/* If we don't need to call the oom-killer at all, return immediately */
	if (!oom_check)
		return CHARGE_NOMEM;

	/* check OOM */
	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
		return CHARGE_OOM_DIE;

	return CHARGE_RETRY;
}

2291

2292

/*

2292

/*

2293

* __mem_cgroup_try_charge() does

2293

* __mem_cgroup_try_charge() does

2294

* 1. detect memcg to be charged against from passed *mm and *ptr,

2294

* 1. detect memcg to be charged against from passed *mm and *ptr,

2295

* 2. update res_counter

2295

* 2. update res_counter

2296

* 3. call memory reclaim if necessary.

2296

* 3. call memory reclaim if necessary.

2297

*

2297

*

2298

* In some special case, if the task is fatal, fatal_signal_pending() or

2298

* In some special case, if the task is fatal, fatal_signal_pending() or

2299

* has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup

2299

* has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup

2300

* to *ptr. There are two reasons for this. 1: fatal threads should quit as soon

2300

* to *ptr. There are two reasons for this. 1: fatal threads should quit as soon

2301

* as possible without any hazards. 2: all pages should have a valid

2301

* as possible without any hazards. 2: all pages should have a valid

2302

* pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg

2302

* pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg

2303

* pointer, that is treated as a charge to root_mem_cgroup.

2303

* pointer, that is treated as a charge to root_mem_cgroup.

2304

*

2304

*

2305

* So __mem_cgroup_try_charge() will return

2305

* So __mem_cgroup_try_charge() will return

2306

* 0 ... on success, filling *ptr with a valid memcg pointer.

2306

* 0 ... on success, filling *ptr with a valid memcg pointer.

2307

* -ENOMEM ... charge failure because of resource limits.

2307

* -ENOMEM ... charge failure because of resource limits.

2308

* -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup.

2308

* -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup.

2309

*

2309

*

2310

* Unlike the exported interface, an "oom" parameter is added. if oom==true,

2310

* Unlike the exported interface, an "oom" parameter is added. if oom==true,

2311

* the oom-killer can be invoked.

2311

* the oom-killer can be invoked.

2312

*/

2312

*/

2313

static int __mem_cgroup_try_charge(struct mm_struct *mm,

2313

static int __mem_cgroup_try_charge(struct mm_struct *mm,

2314

gfp_t gfp_mask,

2314

gfp_t gfp_mask,

2315

unsigned int nr_pages,

2315

unsigned int nr_pages,

2316

struct mem_cgroup **ptr,

2316

struct mem_cgroup **ptr,

2317

bool oom)

2317

bool oom)

2318

{

2318

{

2319

unsigned int batch = max(CHARGE_BATCH, nr_pages);

2319

unsigned int batch = max(CHARGE_BATCH, nr_pages);

2320

int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;

2320

int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;

2321

struct mem_cgroup *memcg = NULL;

2321

struct mem_cgroup *memcg = NULL;

2322

int ret;

2322

int ret;

2323

2324

/*

2324

/*

2325

* Unlike gloval-vm's OOM-kill, we're not in memory shortage

2325

* Unlike gloval-vm's OOM-kill, we're not in memory shortage

2326

* in system level. So, allow to go ahead dying process in addition to

2326

* in system level. So, allow to go ahead dying process in addition to

2327

* MEMDIE process.

2327

* MEMDIE process.

2328

*/

2328

*/

2329

if (unlikely(test_thread_flag(TIF_MEMDIE)

2329

if (unlikely(test_thread_flag(TIF_MEMDIE)

2330

|| fatal_signal_pending(current)))

2330

|| fatal_signal_pending(current)))

2331

goto bypass;

2331

goto bypass;

2332

2333

/*

2333

/*

2334

* We always charge the cgroup the mm_struct belongs to.

2334

* We always charge the cgroup the mm_struct belongs to.

2335

* The mm_struct's mem_cgroup changes on task migration if the

2335

* The mm_struct's mem_cgroup changes on task migration if the

2336

* thread group leader migrates. It's possible that mm is not

2336

* thread group leader migrates. It's possible that mm is not

2337

* set, if so charge the init_mm (happens for pagecache usage).

2337

* set, if so charge the init_mm (happens for pagecache usage).

2338

*/

2338

*/

2339

if (!*ptr && !mm)

2339

if (!*ptr && !mm)

2340

*ptr = root_mem_cgroup;

2340

*ptr = root_mem_cgroup;

2341

again:

2341

again:

2342

if (*ptr) { /* css should be a valid one */

2342

if (*ptr) { /* css should be a valid one */

2343

memcg = *ptr;

2343

memcg = *ptr;

2344

VM_BUG_ON(css_is_removed(&memcg->css));

2344

VM_BUG_ON(css_is_removed(&memcg->css));

2345

if (mem_cgroup_is_root(memcg))

2345

if (mem_cgroup_is_root(memcg))

2346

goto done;

2346

goto done;

2347

if (nr_pages == 1 && consume_stock(memcg))

2347

if (nr_pages == 1 && consume_stock(memcg))

2348

goto done;

2348

goto done;

2349

css_get(&memcg->css);

2349

css_get(&memcg->css);

2350

} else {

2350

} else {

2351

struct task_struct *p;

2351

struct task_struct *p;

2352

2353

rcu_read_lock();

2353

rcu_read_lock();

2354

p = rcu_dereference(mm->owner);

2354

p = rcu_dereference(mm->owner);

2355

/*

2355

/*

2356

* Because we don't have task_lock(), "p" can exit.

2356

* Because we don't have task_lock(), "p" can exit.

2357

* In that case, "memcg" can point to root or p can be NULL with

2357

* In that case, "memcg" can point to root or p can be NULL with

2358

* race with swapoff. Then, we have small risk of mis-accouning.

2358

* race with swapoff. Then, we have small risk of mis-accouning.

2359

* But such kind of mis-account by race always happens because

2359

* But such kind of mis-account by race always happens because

2360

* we don't have cgroup_mutex(). It's overkill and we allo that

2360

* we don't have cgroup_mutex(). It's overkill and we allo that

2361

* small race, here.

2361

* small race, here.

2362

* (*) swapoff at el will charge against mm-struct not against

2362

* (*) swapoff at el will charge against mm-struct not against

2363

* task-struct. So, mm->owner can be NULL.

2363

* task-struct. So, mm->owner can be NULL.

2364

*/

2364

*/

2365

memcg = mem_cgroup_from_task(p);

2365

memcg = mem_cgroup_from_task(p);

2366

if (!memcg)

2366

if (!memcg)

2367

memcg = root_mem_cgroup;

2367

memcg = root_mem_cgroup;

2368

if (mem_cgroup_is_root(memcg)) {

2368

if (mem_cgroup_is_root(memcg)) {

2369

rcu_read_unlock();

2369

rcu_read_unlock();

2370

goto done;

2370

goto done;

2371

}

2371

}

2372

if (nr_pages == 1 && consume_stock(memcg)) {

2372

if (nr_pages == 1 && consume_stock(memcg)) {

2373

/*

2373

/*

2374

* It seems dagerous to access memcg without css_get().

2374

* It seems dagerous to access memcg without css_get().

2375

* But considering how consume_stok works, it's not

2375

* But considering how consume_stok works, it's not

2376

* necessary. If consume_stock success, some charges

2376

* necessary. If consume_stock success, some charges

2377

* from this memcg are cached on this cpu. So, we

2377

* from this memcg are cached on this cpu. So, we

2378

* don't need to call css_get()/css_tryget() before

2378

* don't need to call css_get()/css_tryget() before

2379

* calling consume_stock().

2379

* calling consume_stock().

2380

*/

2380

*/

2381

rcu_read_unlock();

2381

rcu_read_unlock();

2382

goto done;

2382

goto done;

2383

}

2383

}

2384

/* after here, we may be blocked. we need to get refcnt */

2384

/* after here, we may be blocked. we need to get refcnt */

2385

if (!css_tryget(&memcg->css)) {

2385

if (!css_tryget(&memcg->css)) {

2386

rcu_read_unlock();

2386

rcu_read_unlock();

2387

goto again;

2387

goto again;

2388

}

2388

}

2389

rcu_read_unlock();

2389

rcu_read_unlock();

2390

}

2390

}

2391

2392

do {

2392

do {

2393

bool oom_check;

2393

bool oom_check;

2394

2395

/* If killed, bypass charge */

2395

/* If killed, bypass charge */

2396

if (fatal_signal_pending(current)) {

2396

if (fatal_signal_pending(current)) {

2397

css_put(&memcg->css);

2397

css_put(&memcg->css);

2398

goto bypass;

2398

goto bypass;

2399

}

2399

}

2400

2401

oom_check = false;

2401

oom_check = false;

2402

if (oom && !nr_oom_retries) {

2402

if (oom && !nr_oom_retries) {

2403

oom_check = true;

2403

oom_check = true;

2404

nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;

2404

nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;

2405

}

2405

}

2406

2407

ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);

2407

ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);

2408

switch (ret) {

2408

switch (ret) {

2409

case CHARGE_OK:

2409

case CHARGE_OK:

2410

break;

2410

break;

2411

case CHARGE_RETRY: /* not in OOM situation but retry */

2411

case CHARGE_RETRY: /* not in OOM situation but retry */

2412

batch = nr_pages;

2412

batch = nr_pages;

2413

css_put(&memcg->css);

2413

css_put(&memcg->css);

2414

memcg = NULL;

2414

memcg = NULL;

2415

goto again;

2415

goto again;

2416

case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */

2416

case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */

2417

css_put(&memcg->css);

2417

css_put(&memcg->css);

2418

goto nomem;

2418

goto nomem;

2419

case CHARGE_NOMEM: /* OOM routine works */

2419

case CHARGE_NOMEM: /* OOM routine works */

2420

if (!oom) {

2420

if (!oom) {

2421

css_put(&memcg->css);

2421

css_put(&memcg->css);

2422

goto nomem;

2422

goto nomem;

2423

}

2423

}

2424

/* If oom, we never return -ENOMEM */

2424

/* If oom, we never return -ENOMEM */

2425

nr_oom_retries--;

2425

nr_oom_retries--;

2426

break;

2426

break;

2427

case CHARGE_OOM_DIE: /* Killed by OOM Killer */

2427

case CHARGE_OOM_DIE: /* Killed by OOM Killer */

2428

css_put(&memcg->css);

2428

css_put(&memcg->css);

2429

goto bypass;

2429

goto bypass;

2430

}

2430

}

2431

} while (ret != CHARGE_OK);

2431

} while (ret != CHARGE_OK);

2432

2433

if (batch > nr_pages)

2433

if (batch > nr_pages)

2434

refill_stock(memcg, batch - nr_pages);

2434

refill_stock(memcg, batch - nr_pages);

2435

css_put(&memcg->css);

2435

css_put(&memcg->css);

2436

done:

2436

done:

2437

*ptr = memcg;

2437

*ptr = memcg;

2438

return 0;

2438

return 0;

2439

nomem:

2439

nomem:

2440

*ptr = NULL;

2440

*ptr = NULL;

2441

return -ENOMEM;

2441

return -ENOMEM;

2442

bypass:

2442

bypass:

2443

*ptr = root_mem_cgroup;

2443

*ptr = root_mem_cgroup;

2444

return -EINTR;

2444

return -EINTR;

2445

}

2445

}

2446

2447

/*

2447

/*

2448

* Somemtimes we have to undo a charge we got by try_charge().

2448

* Somemtimes we have to undo a charge we got by try_charge().

2449

* This function is for that and do uncharge, put css's refcnt.

2449

* This function is for that and do uncharge, put css's refcnt.

2450

* gotten by try_charge().

2450

* gotten by try_charge().

2451

*/

2451

*/

2452

static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,

2452

static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,

2453

unsigned int nr_pages)

2453

unsigned int nr_pages)

2454

{

2454

{

2455

if (!mem_cgroup_is_root(memcg)) {

2455

if (!mem_cgroup_is_root(memcg)) {

2456

unsigned long bytes = nr_pages * PAGE_SIZE;

2456

unsigned long bytes = nr_pages * PAGE_SIZE;

2457

2458

res_counter_uncharge(&memcg->res, bytes);

2458

res_counter_uncharge(&memcg->res, bytes);

2459

if (do_swap_account)

2459

if (do_swap_account)

2460

res_counter_uncharge(&memcg->memsw, bytes);

2460

res_counter_uncharge(&memcg->memsw, bytes);

2461

}

2461

}

2462

}

2462

}

2463

2464

/*

2464

/*

2465

* Cancel chrages in this cgroup....doesn't propagate to parent cgroup.

2465

* Cancel chrages in this cgroup....doesn't propagate to parent cgroup.

2466

* This is useful when moving usage to parent cgroup.

2466

* This is useful when moving usage to parent cgroup.

2467

*/

2467

*/

2468

static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,

2468

static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,

2469

unsigned int nr_pages)

2469

unsigned int nr_pages)

2470

{

2470

{

2471

unsigned long bytes = nr_pages * PAGE_SIZE;

2471

unsigned long bytes = nr_pages * PAGE_SIZE;

2472

2473

if (mem_cgroup_is_root(memcg))

2473

if (mem_cgroup_is_root(memcg))

2474

return;

2474

return;

2475

2476

res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);

2476

res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);

2477

if (do_swap_account)

2477

if (do_swap_account)

2478

res_counter_uncharge_until(&memcg->memsw,

2478

res_counter_uncharge_until(&memcg->memsw,

2479

memcg->memsw.parent, bytes);

2479

memcg->memsw.parent, bytes);

2480

}

2480

}

2481

2482

/*

2482

/*

2483

* A helper function to get mem_cgroup from ID. must be called under

2483

* A helper function to get mem_cgroup from ID. must be called under

2484

* rcu_read_lock(). The caller must check css_is_removed() or some if

2484

* rcu_read_lock(). The caller must check css_is_removed() or some if

2485

* it's concern. (dropping refcnt from swap can be called against removed

2485

* it's concern. (dropping refcnt from swap can be called against removed

2486

* memcg.)

2486

* memcg.)

2487

*/

2487

*/

2488

static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)

2488

static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)

2489

{

2489

{

2490

struct cgroup_subsys_state *css;

2490

struct cgroup_subsys_state *css;

2491

2492

/* ID 0 is unused ID */

2492

/* ID 0 is unused ID */

2493

if (!id)

2493

if (!id)

2494

return NULL;

2494

return NULL;

2495

css = css_lookup(&mem_cgroup_subsys, id);

2495

css = css_lookup(&mem_cgroup_subsys, id);

2496

if (!css)

2496

if (!css)

2497

return NULL;

2497

return NULL;

2498

return container_of(css, struct mem_cgroup, css);

2498

return container_of(css, struct mem_cgroup, css);

2499

}

2499

}

2500

2501

struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)

2501

struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)

2502

{

2502

{

2503

struct mem_cgroup *memcg = NULL;

2503

struct mem_cgroup *memcg = NULL;

2504

struct page_cgroup *pc;

2504

struct page_cgroup *pc;

2505

unsigned short id;

2505

unsigned short id;

2506

swp_entry_t ent;

2506

swp_entry_t ent;

2507

2508

VM_BUG_ON(!PageLocked(page));

2508

VM_BUG_ON(!PageLocked(page));

2509

2510

pc = lookup_page_cgroup(page);

2510

pc = lookup_page_cgroup(page);

2511

lock_page_cgroup(pc);

2511

lock_page_cgroup(pc);

2512

if (PageCgroupUsed(pc)) {

2512

if (PageCgroupUsed(pc)) {

2513

memcg = pc->mem_cgroup;

2513

memcg = pc->mem_cgroup;

2514

if (memcg && !css_tryget(&memcg->css))

2514

if (memcg && !css_tryget(&memcg->css))

2515

memcg = NULL;

2515

memcg = NULL;

2516

} else if (PageSwapCache(page)) {

2516

} else if (PageSwapCache(page)) {

2517

ent.val = page_private(page);

2517

ent.val = page_private(page);

2518

id = lookup_swap_cgroup_id(ent);

2518

id = lookup_swap_cgroup_id(ent);

2519

rcu_read_lock();

2519

rcu_read_lock();

2520

memcg = mem_cgroup_lookup(id);

2520

memcg = mem_cgroup_lookup(id);

2521

if (memcg && !css_tryget(&memcg->css))

2521

if (memcg && !css_tryget(&memcg->css))

2522

memcg = NULL;

2522

memcg = NULL;

2523

rcu_read_unlock();

2523

rcu_read_unlock();

2524

}

2524

}

2525

unlock_page_cgroup(pc);

2525

unlock_page_cgroup(pc);

2526

return memcg;

2526

return memcg;

2527

}

2527

}

2528

2529

static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,

2529

static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,

2530

struct page *page,

2530

struct page *page,

2531

unsigned int nr_pages,

2531

unsigned int nr_pages,

2532

enum charge_type ctype,

2532

enum charge_type ctype,

2533

bool lrucare)

2533

bool lrucare)

2534

{

2534

{

2535

struct page_cgroup *pc = lookup_page_cgroup(page);

2535

struct page_cgroup *pc = lookup_page_cgroup(page);

2536

struct zone *uninitialized_var(zone);

2536

struct zone *uninitialized_var(zone);

2537

struct lruvec *lruvec;

2537

struct lruvec *lruvec;

2538

bool was_on_lru = false;

2538

bool was_on_lru = false;

2539

bool anon;

2539

bool anon;

2540

2541

lock_page_cgroup(pc);

2541

lock_page_cgroup(pc);

2542

if (unlikely(PageCgroupUsed(pc))) {

2542

if (unlikely(PageCgroupUsed(pc))) {

2543

unlock_page_cgroup(pc);

2543

unlock_page_cgroup(pc);

2544

__mem_cgroup_cancel_charge(memcg, nr_pages);

2544

__mem_cgroup_cancel_charge(memcg, nr_pages);

2545

return;

2545

return;

2546

}

2546

}

2547

/*

2547

/*

2548

* we don't need page_cgroup_lock about tail pages, becase they are not

2548

* we don't need page_cgroup_lock about tail pages, becase they are not

2549

* accessed by any other context at this point.

2549

* accessed by any other context at this point.

2550

*/

2550

*/

2551

2552

/*

2552

/*

2553

* In some cases, SwapCache and FUSE(splice_buf->radixtree), the page

2553

* In some cases, SwapCache and FUSE(splice_buf->radixtree), the page

2554

* may already be on some other mem_cgroup's LRU. Take care of it.

2554

* may already be on some other mem_cgroup's LRU. Take care of it.

2555

*/

2555

*/

2556

if (lrucare) {

2556

if (lrucare) {

2557

zone = page_zone(page);

2557

zone = page_zone(page);

2558

spin_lock_irq(&zone->lru_lock);

2558

spin_lock_irq(&zone->lru_lock);

2559

if (PageLRU(page)) {

2559

if (PageLRU(page)) {

2560

lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);

2560

lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);

2561

ClearPageLRU(page);

2561

ClearPageLRU(page);

2562

del_page_from_lru_list(page, lruvec, page_lru(page));

2562

del_page_from_lru_list(page, lruvec, page_lru(page));

2563

was_on_lru = true;

2563

was_on_lru = true;

2564

}

2564

}

2565

}

2565

}

2566

2567

pc->mem_cgroup = memcg;

2567

pc->mem_cgroup = memcg;

2568

/*

2568

/*

2569

* We access a page_cgroup asynchronously without lock_page_cgroup().

2569

* We access a page_cgroup asynchronously without lock_page_cgroup().

2570

* Especially when a page_cgroup is taken from a page, pc->mem_cgroup

2570

* Especially when a page_cgroup is taken from a page, pc->mem_cgroup

2571

* is accessed after testing USED bit. To make pc->mem_cgroup visible

2571

* is accessed after testing USED bit. To make pc->mem_cgroup visible

2572

* before USED bit, we need memory barrier here.

2572

* before USED bit, we need memory barrier here.

2573

* See mem_cgroup_add_lru_list(), etc.

2573

* See mem_cgroup_add_lru_list(), etc.

2574

*/

2574

*/

2575

smp_wmb();

2575

smp_wmb();

2576

SetPageCgroupUsed(pc);

2576

SetPageCgroupUsed(pc);

2577

2578

if (lrucare) {

2578

if (lrucare) {

2579

if (was_on_lru) {

2579

if (was_on_lru) {

2580

lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);

2580

lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);

2581

VM_BUG_ON(PageLRU(page));

2581

VM_BUG_ON(PageLRU(page));

2582

SetPageLRU(page);

2582

SetPageLRU(page);

2583

add_page_to_lru_list(page, lruvec, page_lru(page));

2583

add_page_to_lru_list(page, lruvec, page_lru(page));

2584

}

2584

}

2585

spin_unlock_irq(&zone->lru_lock);

2585

spin_unlock_irq(&zone->lru_lock);

2586

}

2586

}

2587

2588

if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)

2588

if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)

2589

anon = true;

2589

anon = true;

2590

else

2590

else

2591

anon = false;

2591

anon = false;

2592

2593

mem_cgroup_charge_statistics(memcg, anon, nr_pages);

2593

mem_cgroup_charge_statistics(memcg, anon, nr_pages);

2594

unlock_page_cgroup(pc);

2594

unlock_page_cgroup(pc);

2595

2596

/*

2596

/*

2597

* "charge_statistics" updated event counter. Then, check it.

2597

* "charge_statistics" updated event counter. Then, check it.

2598

* Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.

2598

* Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.

2599

* if they exceeds softlimit.

2599

* if they exceeds softlimit.

2600

*/

2600

*/

2601

memcg_check_events(memcg, page);

2601

memcg_check_events(memcg, page);

2602

}

2602

}

2603

2604

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

/* page_cgroup flag bits that are masked out when head flags are copied */
#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
/*
 * Because tail pages are not marked as "used", set it. We're under
 * zone->lru_lock, 'splitting on pmd' and compound_lock.
 * charge/uncharge will never happen and move_account() is done under
 * compound_lock(), so we don't have to take care of races.
 *
 * @head: head page of the huge page being split.  Each of the
 * HPAGE_PMD_NR - 1 tail page_cgroups receives the head's mem_cgroup and
 * the head's flags minus PCGF_NOCOPY_AT_SPLIT.
 */
void mem_cgroup_split_huge_fixup(struct page *head)
{
	struct page_cgroup *head_pc = lookup_page_cgroup(head);
	struct page_cgroup *pc;
	int i;

	if (mem_cgroup_disabled())
		return;
	for (i = 1; i < HPAGE_PMD_NR; i++) {
		pc = head_pc + i;
		pc->mem_cgroup = head_pc->mem_cgroup;
		smp_wmb();/* see __commit_charge() */
		pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
	}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

2629

2630

/**

2630

/**

2631

* mem_cgroup_move_account - move account of the page

2631

* mem_cgroup_move_account - move account of the page

2632

* @page: the page

2632

* @page: the page

2633

* @nr_pages: number of regular pages (>1 for huge pages)

2633

* @nr_pages: number of regular pages (>1 for huge pages)

2634

* @pc: page_cgroup of the page.

2634

* @pc: page_cgroup of the page.

2635

* @from: mem_cgroup which the page is moved from.

2635

* @from: mem_cgroup which the page is moved from.

2636

* @to: mem_cgroup which the page is moved to. @from != @to.

2636

* @to: mem_cgroup which the page is moved to. @from != @to.

2637

*

2637

*

2638

* The caller must confirm following.

2638

* The caller must confirm following.

2639

* - page is not on LRU (isolate_page() is useful.)

2639

* - page is not on LRU (isolate_page() is useful.)

2640

* - compound_lock is held when nr_pages > 1

2640

* - compound_lock is held when nr_pages > 1

2641

*

2641

*

2642

* This function doesn't do "charge" to new cgroup and doesn't do "uncharge"

2642

* This function doesn't do "charge" to new cgroup and doesn't do "uncharge"

2643

* from old cgroup.

2643

* from old cgroup.

2644

*/

2644

*/

2645

static int mem_cgroup_move_account(struct page *page,

2645

static int mem_cgroup_move_account(struct page *page,

2646

unsigned int nr_pages,

2646

unsigned int nr_pages,

2647

struct page_cgroup *pc,

2647

struct page_cgroup *pc,

2648

struct mem_cgroup *from,

2648

struct mem_cgroup *from,

2649

struct mem_cgroup *to)

2649

struct mem_cgroup *to)

2650

{

2650

{

2651

unsigned long flags;

2651

unsigned long flags;

2652

int ret;

2652

int ret;

2653

bool anon = PageAnon(page);

2653

bool anon = PageAnon(page);

2654

2655

VM_BUG_ON(from == to);

2655

VM_BUG_ON(from == to);

2656

VM_BUG_ON(PageLRU(page));

2656

VM_BUG_ON(PageLRU(page));

2657

/*

2657

/*

2658

* The page is isolated from LRU. So, collapse function

2658

* The page is isolated from LRU. So, collapse function

2659

* will not handle this page. But page splitting can happen.

2659

* will not handle this page. But page splitting can happen.

2660

* Do this check under compound_page_lock(). The caller should

2660

* Do this check under compound_page_lock(). The caller should

2661

* hold it.

2661

* hold it.

2662

*/

2662

*/

2663

ret = -EBUSY;

2663

ret = -EBUSY;

2664

if (nr_pages > 1 && !PageTransHuge(page))

2664

if (nr_pages > 1 && !PageTransHuge(page))

2665

goto out;

2665

goto out;

2666

2667

lock_page_cgroup(pc);

2667

lock_page_cgroup(pc);

2668

2669

ret = -EINVAL;

2669

ret = -EINVAL;

2670

if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)

2670

if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)

2671

goto unlock;

2671

goto unlock;

2672

2673

move_lock_mem_cgroup(from, &flags);

2673

move_lock_mem_cgroup(from, &flags);

2674

2675

if (!anon && page_mapped(page)) {

2675

if (!anon && page_mapped(page)) {

2676

/* Update mapped_file data for mem_cgroup */

2676

/* Update mapped_file data for mem_cgroup */

2677

preempt_disable();

2677

preempt_disable();

2678

__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);

2678

__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);

2679

__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);

2679

__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);

2680

preempt_enable();

2680

preempt_enable();

2681

}

2681

}

2682

mem_cgroup_charge_statistics(from, anon, -nr_pages);

2682

mem_cgroup_charge_statistics(from, anon, -nr_pages);

2683

2684

/* caller should have done css_get */

2684

/* caller should have done css_get */

2685

pc->mem_cgroup = to;

2685

pc->mem_cgroup = to;

2686

mem_cgroup_charge_statistics(to, anon, nr_pages);

2686

mem_cgroup_charge_statistics(to, anon, nr_pages);

2687

/*

2687

/*

2688

* We charges against "to" which may not have any tasks. Then, "to"

2688

* We charges against "to" which may not have any tasks. Then, "to"

2689

* can be under rmdir(). But in current implementation, caller of

2689

* can be under rmdir(). But in current implementation, caller of

2690

* this function is just force_empty() and move charge, so it's

2690

* this function is just force_empty() and move charge, so it's

2691

* guaranteed that "to" is never removed. So, we don't check rmdir

2691

* guaranteed that "to" is never removed. So, we don't check rmdir

2692

* status here.

2692

* status here.

2693

*/

2693

*/

2694

move_unlock_mem_cgroup(from, &flags);

2694

move_unlock_mem_cgroup(from, &flags);

2695

ret = 0;

2695

ret = 0;

2696

unlock:

2696

unlock:

2697

unlock_page_cgroup(pc);

2697

unlock_page_cgroup(pc);

2698

/*

2698

/*

2699

* check events

2699

* check events

2700

*/

2700

*/

2701

memcg_check_events(to, page);

2701

memcg_check_events(to, page);

2702

memcg_check_events(from, page);

2702

memcg_check_events(from, page);

2703

out:

2703

out:

2704

return ret;

2704

return ret;

2705

}

2705

}

2706

2707

/*

2707

/*

2708

* move charges to its parent.

2708

* move charges to its parent.

2709

*/

2709

*/

2710

2711

static int mem_cgroup_move_parent(struct page *page,

2711

static int mem_cgroup_move_parent(struct page *page,

2712

struct page_cgroup *pc,

2712

struct page_cgroup *pc,

2713

struct mem_cgroup *child)

2713

struct mem_cgroup *child)

2714

{

2714

{

2715

struct mem_cgroup *parent;

2715

struct mem_cgroup *parent;

2716

unsigned int nr_pages;

2716

unsigned int nr_pages;

2717

unsigned long uninitialized_var(flags);

2717

unsigned long uninitialized_var(flags);

2718

int ret;

2718

int ret;

2719

2720

/* Is ROOT ? */

2720

/* Is ROOT ? */

2721

if (mem_cgroup_is_root(child))

2721

if (mem_cgroup_is_root(child))

2722

return -EINVAL;

2722

return -EINVAL;

2723

2724

ret = -EBUSY;

2724

ret = -EBUSY;

2725

if (!get_page_unless_zero(page))

2725

if (!get_page_unless_zero(page))

2726

goto out;

2726

goto out;

2727

if (isolate_lru_page(page))

2727

if (isolate_lru_page(page))

2728

goto put;

2728

goto put;

2729

2730

nr_pages = hpage_nr_pages(page);

2730

nr_pages = hpage_nr_pages(page);

2731

2732

parent = parent_mem_cgroup(child);

2732

parent = parent_mem_cgroup(child);

2733

/*

2733

/*

2734

* If no parent, move charges to root cgroup.

2734

* If no parent, move charges to root cgroup.

2735

*/

2735

*/

2736

if (!parent)

2736

if (!parent)

2737

parent = root_mem_cgroup;

2737

parent = root_mem_cgroup;

2738

2739

if (nr_pages > 1)

2739

if (nr_pages > 1)

2740

flags = compound_lock_irqsave(page);

2740

flags = compound_lock_irqsave(page);

2741

2742

ret = mem_cgroup_move_account(page, nr_pages,

2742

ret = mem_cgroup_move_account(page, nr_pages,

2743

pc, child, parent);

2743

pc, child, parent);

2744

if (!ret)

2744

if (!ret)

2745

__mem_cgroup_cancel_local_charge(child, nr_pages);

2745

__mem_cgroup_cancel_local_charge(child, nr_pages);

2746

2747

if (nr_pages > 1)

2747

if (nr_pages > 1)

2748

compound_unlock_irqrestore(page, flags);

2748

compound_unlock_irqrestore(page, flags);

2749

putback_lru_page(page);

2749

putback_lru_page(page);

2750

put:

2750

put:

2751

put_page(page);

2751

put_page(page);

2752

out:

2752

out:

2753

return ret;

2753

return ret;

2754

}

2754

}

2755

2756

/*

2756

/*

2757

* Charge the memory controller for page usage.

2757

* Charge the memory controller for page usage.

2758

* Return

2758

* Return

2759

* 0 if the charge was successful

2759

* 0 if the charge was successful

2760

* < 0 if the cgroup is over its limit

2760

* < 0 if the cgroup is over its limit

2761

*/

2761

*/

2762

static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,

2762

static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,

2763

gfp_t gfp_mask, enum charge_type ctype)

2763

gfp_t gfp_mask, enum charge_type ctype)

2764

{

2764

{

2765

struct mem_cgroup *memcg = NULL;

2765

struct mem_cgroup *memcg = NULL;

2766

unsigned int nr_pages = 1;

2766

unsigned int nr_pages = 1;

2767

bool oom = true;

2767

bool oom = true;

2768

int ret;

2768

int ret;

2769

2770

if (PageTransHuge(page)) {

2770

if (PageTransHuge(page)) {

2771

nr_pages <<= compound_order(page);

2771

nr_pages <<= compound_order(page);

2772

VM_BUG_ON(!PageTransHuge(page));

2772

VM_BUG_ON(!PageTransHuge(page));

2773

/*

2773

/*

2774

* Never OOM-kill a process for a huge page. The

2774

* Never OOM-kill a process for a huge page. The

2775

* fault handler will fall back to regular pages.

2775

* fault handler will fall back to regular pages.

2776

*/

2776

*/

2777

oom = false;

2777

oom = false;

2778

}

2778

}

2779

2780

ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);

2780

ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);

2781

if (ret == -ENOMEM)

2781

if (ret == -ENOMEM)

2782

return ret;

2782

return ret;

2783

__mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);

2783

__mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);

2784

return 0;

2784

return 0;

2785

}

2785

}

2786

2787

/*
 * mem_cgroup_newpage_charge - charge a page as anonymous memory
 * @page: page to charge; must not be mapped yet, and if it has a mapping
 *        it must be an anon page (both asserted with VM_BUG_ON)
 * @mm: mm to charge; must not be NULL
 * @gfp_mask: allocation mask for the charge
 *
 * Returns 0 when the memory controller is disabled, otherwise the result
 * of mem_cgroup_charge_common() with MEM_CGROUP_CHARGE_TYPE_ANON.
 */
int mem_cgroup_newpage_charge(struct page *page,
			      struct mm_struct *mm, gfp_t gfp_mask)
{
	if (mem_cgroup_disabled())
		return 0;
	VM_BUG_ON(page_mapped(page));
	VM_BUG_ON(page->mapping && !PageAnon(page));
	VM_BUG_ON(!mm);
	return mem_cgroup_charge_common(page, mm, gfp_mask,
					MEM_CGROUP_CHARGE_TYPE_ANON);
}

2798

2799

/* Forward declaration: used by mem_cgroup_cache_charge(), defined below. */
static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
					enum charge_type ctype);

2802

2803

int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,

2803

int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,

2804

gfp_t gfp_mask)

2804

gfp_t gfp_mask)

2805

{

2805

{

2806

struct mem_cgroup *memcg = NULL;

2806

struct mem_cgroup *memcg = NULL;

2807

enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;

2807

enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;

2808

int ret;

2808

int ret;

2809

2810

if (mem_cgroup_disabled())

2810

if (mem_cgroup_disabled())

2811

return 0;

2811

return 0;

2812

if (PageCompound(page))

2812

if (PageCompound(page))

2813

return 0;

2813

return 0;

2814

2815

if (unlikely(!mm))

2815

if (unlikely(!mm))

2816

mm = &init_mm;

2816

mm = &init_mm;

2817

if (!page_is_file_cache(page))

2817

if (!page_is_file_cache(page))

2818

type = MEM_CGROUP_CHARGE_TYPE_SHMEM;

2818

type = MEM_CGROUP_CHARGE_TYPE_SHMEM;

2819

2820

if (!PageSwapCache(page))

2820

if (!PageSwapCache(page))

2821

ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);

2821

ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);

2822

else { /* page is swapcache/shmem */

2822

else { /* page is swapcache/shmem */

2823

ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);

2823

ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);

2824

if (!ret)

2824

if (!ret)

2825

__mem_cgroup_commit_charge_swapin(page, memcg, type);

2825

__mem_cgroup_commit_charge_swapin(page, memcg, type);

2826

}

2826

}

2827

return ret;

2827

return ret;

2828

}

2828

}

2829

2830

/*

2830

/*

2831

* While swap-in, try_charge -> commit or cancel, the page is locked.

2831

* While swap-in, try_charge -> commit or cancel, the page is locked.

2832

* And when try_charge() successfully returns, one refcnt to memcg without

2832

* And when try_charge() successfully returns, one refcnt to memcg without

2833

* struct page_cgroup is acquired. This refcnt will be consumed by

2833

* struct page_cgroup is acquired. This refcnt will be consumed by

2834

* "commit()" or removed by "cancel()"

2834

* "commit()" or removed by "cancel()"

2835

*/

2835

*/

2836

int mem_cgroup_try_charge_swapin(struct mm_struct *mm,

2836

int mem_cgroup_try_charge_swapin(struct mm_struct *mm,

2837

struct page *page,

2837

struct page *page,

2838

gfp_t mask, struct mem_cgroup **memcgp)

2838

gfp_t mask, struct mem_cgroup **memcgp)

2839

{

2839

{

2840

struct mem_cgroup *memcg;

2840

struct mem_cgroup *memcg;

2841

int ret;

2841

int ret;

2842

2843

*memcgp = NULL;

2843

*memcgp = NULL;

2844

2845

if (mem_cgroup_disabled())

2845

if (mem_cgroup_disabled())

2846

return 0;

2846

return 0;

2847

2848

if (!do_swap_account)

2848

if (!do_swap_account)

2849

goto charge_cur_mm;

2849

goto charge_cur_mm;

2850

/*

2850

/*

2851

* A racing thread's fault, or swapoff, may have already updated

2851

* A racing thread's fault, or swapoff, may have already updated

2852

* the pte, and even removed page from swap cache: in those cases

2852

* the pte, and even removed page from swap cache: in those cases

2853

* do_swap_page()'s pte_same() test will fail; but there's also a

2853

* do_swap_page()'s pte_same() test will fail; but there's also a

2854

* KSM case which does need to charge the page.

2854

* KSM case which does need to charge the page.

2855

*/

2855

*/

2856

if (!PageSwapCache(page))

2856

if (!PageSwapCache(page))

2857

goto charge_cur_mm;

2857

goto charge_cur_mm;

2858

memcg = try_get_mem_cgroup_from_page(page);

2858

memcg = try_get_mem_cgroup_from_page(page);

2859

if (!memcg)

2859

if (!memcg)

2860

goto charge_cur_mm;

2860

goto charge_cur_mm;

2861

*memcgp = memcg;

2861

*memcgp = memcg;

2862

ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);

2862

ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);

2863

css_put(&memcg->css);

2863

css_put(&memcg->css);

2864

if (ret == -EINTR)

2864

if (ret == -EINTR)

2865

ret = 0;

2865

ret = 0;

2866

return ret;

2866

return ret;

2867

charge_cur_mm:

2867

charge_cur_mm:

2868

if (unlikely(!mm))

2868

if (unlikely(!mm))

2869

mm = &init_mm;

2869

mm = &init_mm;

2870

ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);

2870

ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);

2871

if (ret == -EINTR)

2871

if (ret == -EINTR)

2872

ret = 0;

2872

ret = 0;

2873

return ret;

2873

return ret;

2874

}

2874

}

2875

2876

static void

2876

static void

2877

__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,

2877

__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,

2878

enum charge_type ctype)

2878

enum charge_type ctype)

2879

{

2879

{

2880

if (mem_cgroup_disabled())

2880

if (mem_cgroup_disabled())

2881

return;

2881

return;

2882

if (!memcg)

2882

if (!memcg)

2883

return;

2883

return;

2884

cgroup_exclude_rmdir(&memcg->css);

2884

cgroup_exclude_rmdir(&memcg->css);

2885

2886

__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);

2886

__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);

2887

/*

2887

/*

2888

* Now swap is on-memory. This means this page may be

2888

* Now swap is on-memory. This means this page may be

2889

* counted both as mem and swap....double count.

2889

* counted both as mem and swap....double count.

2890

* Fix it by uncharging from memsw. Basically, this SwapCache is stable

2890

* Fix it by uncharging from memsw. Basically, this SwapCache is stable

2891

* under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()

2891

* under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()

2892

* may call delete_from_swap_cache() before reach here.

2892

* may call delete_from_swap_cache() before reach here.

2893

*/

2893

*/

2894

if (do_swap_account && PageSwapCache(page)) {

2894

if (do_swap_account && PageSwapCache(page)) {

2895

swp_entry_t ent = {.val = page_private(page)};

2895

swp_entry_t ent = {.val = page_private(page)};

2896

mem_cgroup_uncharge_swap(ent);

2896

mem_cgroup_uncharge_swap(ent);

2897

}

2897

}

2898

/*

2898

/*

2899

* At swapin, we may charge account against cgroup which has no tasks.

2899

* At swapin, we may charge account against cgroup which has no tasks.

2900

* So, rmdir()->pre_destroy() can be called while we do this charge.

2900

* So, rmdir()->pre_destroy() can be called while we do this charge.

2901

* In that case, we need to call pre_destroy() again. check it here.

2901

* In that case, we need to call pre_destroy() again. check it here.

2902

*/

2902

*/

2903

cgroup_release_and_wakeup_rmdir(&memcg->css);

2903

cgroup_release_and_wakeup_rmdir(&memcg->css);

2904

}

2904

}

2905

2906

void mem_cgroup_commit_charge_swapin(struct page *page,

2906

void mem_cgroup_commit_charge_swapin(struct page *page,

2907

struct mem_cgroup *memcg)

2907

struct mem_cgroup *memcg)

2908

{

2908

{

2909

__mem_cgroup_commit_charge_swapin(page, memcg,

2909

__mem_cgroup_commit_charge_swapin(page, memcg,

2910

MEM_CGROUP_CHARGE_TYPE_ANON);

2910

MEM_CGROUP_CHARGE_TYPE_ANON);

2911

}

2911

}

2912

2913

/*
 * mem_cgroup_cancel_charge_swapin - back out an uncommitted swap-in charge
 * @memcg: memcg obtained from mem_cgroup_try_charge_swapin(); may be NULL
 *
 * Cancels a one-page charge.  Does nothing when the controller is
 * disabled or @memcg is NULL.
 */
void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
{
	if (mem_cgroup_disabled())
		return;
	if (!memcg)
		return;
	__mem_cgroup_cancel_charge(memcg, 1);
}

2921

2922

static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,

2922

static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,

2923

unsigned int nr_pages,

2923

unsigned int nr_pages,

2924

const enum charge_type ctype)

2924

const enum charge_type ctype)

2925

{

2925

{

2926

struct memcg_batch_info *batch = NULL;

2926

struct memcg_batch_info *batch = NULL;

2927

bool uncharge_memsw = true;

2927

bool uncharge_memsw = true;

2928

2929

/* If swapout, usage of swap doesn't decrease */

2929

/* If swapout, usage of swap doesn't decrease */

2930

if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)

2930

if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)

2931

uncharge_memsw = false;

2931

uncharge_memsw = false;

2932

2933

batch = &current->memcg_batch;

2933

batch = &current->memcg_batch;

2934

/*

2934

/*

2935

* In usual, we do css_get() when we remember memcg pointer.

2935

* In usual, we do css_get() when we remember memcg pointer.

2936

* But in this case, we keep res->usage until end of a series of

2936

* But in this case, we keep res->usage until end of a series of

2937

* uncharges. Then, it's ok to ignore memcg's refcnt.

2937

* uncharges. Then, it's ok to ignore memcg's refcnt.

2938

*/

2938

*/

2939

if (!batch->memcg)

2939

if (!batch->memcg)

2940

batch->memcg = memcg;

2940

batch->memcg = memcg;

2941

/*

2941

/*

2942

* do_batch > 0 when unmapping pages or inode invalidate/truncate.

2942

* do_batch > 0 when unmapping pages or inode invalidate/truncate.

2943

* In those cases, all pages freed continuously can be expected to be in

2943

* In those cases, all pages freed continuously can be expected to be in

2944

* the same cgroup and we have chance to coalesce uncharges.

2944

* the same cgroup and we have chance to coalesce uncharges.

2945

* But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)

2945

* But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)

2946

* because we want to do uncharge as soon as possible.

2946

* because we want to do uncharge as soon as possible.

2947

*/

2947

*/

2948

2949

if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))

2949

if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))

2950

goto direct_uncharge;

2950

goto direct_uncharge;

2951

2952

if (nr_pages > 1)

2952

if (nr_pages > 1)

2953

goto direct_uncharge;

2953

goto direct_uncharge;

2954

2955

/*

2955

/*

2956

* In typical case, batch->memcg == mem. This means we can

2956

* In typical case, batch->memcg == mem. This means we can

2957

* merge a series of uncharges to an uncharge of res_counter.

2957

* merge a series of uncharges to an uncharge of res_counter.

2958

* If not, we uncharge res_counter ony by one.

2958

* If not, we uncharge res_counter ony by one.

2959

*/

2959

*/

2960

if (batch->memcg != memcg)

2960

if (batch->memcg != memcg)

2961

goto direct_uncharge;

2961

goto direct_uncharge;

2962

/* remember freed charge and uncharge it later */

2962

/* remember freed charge and uncharge it later */

2963

batch->nr_pages++;

2963

batch->nr_pages++;

2964

if (uncharge_memsw)

2964

if (uncharge_memsw)

2965

batch->memsw_nr_pages++;

2965

batch->memsw_nr_pages++;

2966

return;

2966

return;

2967

direct_uncharge:

2967

direct_uncharge:

2968

res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);

2968

res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);

2969

if (uncharge_memsw)

2969

if (uncharge_memsw)

2970

res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);

2970

res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);

2971

if (unlikely(batch->memcg != memcg))

2971

if (unlikely(batch->memcg != memcg))

2972

memcg_oom_recover(memcg);

2972

memcg_oom_recover(memcg);

2973

}

2973

}

2974

2975

/*

2975

/*

2976

* uncharge if !page_mapped(page)

2976

* uncharge if !page_mapped(page)

2977

*/

2977

*/

2978

static struct mem_cgroup *

2978

static struct mem_cgroup *

2979

__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,

2979

__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,

2980

bool end_migration)

2980

bool end_migration)

2981

{

2981

{

2982

struct mem_cgroup *memcg = NULL;

2982

struct mem_cgroup *memcg = NULL;

2983

unsigned int nr_pages = 1;

2983

unsigned int nr_pages = 1;

2984

struct page_cgroup *pc;

2984

struct page_cgroup *pc;

2985

bool anon;

2985

bool anon;

2986

2987

if (mem_cgroup_disabled())

2987

if (mem_cgroup_disabled())

2988

return NULL;

2988

return NULL;

2989

2990

VM_BUG_ON(PageSwapCache(page));

2990

VM_BUG_ON(PageSwapCache(page));

2991

2992

if (PageTransHuge(page)) {

2992

if (PageTransHuge(page)) {

2993

nr_pages <<= compound_order(page);

2993

nr_pages <<= compound_order(page);

2994

VM_BUG_ON(!PageTransHuge(page));

2994

VM_BUG_ON(!PageTransHuge(page));

2995

}

2995

}

2996

/*

2996

/*

2997

* Check if our page_cgroup is valid

2997

* Check if our page_cgroup is valid

2998

*/

2998

*/

2999

pc = lookup_page_cgroup(page);

2999

pc = lookup_page_cgroup(page);

3000

if (unlikely(!PageCgroupUsed(pc)))

3000

if (unlikely(!PageCgroupUsed(pc)))

3001

return NULL;

3001

return NULL;

3002

3003

lock_page_cgroup(pc);

3003

lock_page_cgroup(pc);

3004

3005

memcg = pc->mem_cgroup;

3005

memcg = pc->mem_cgroup;

3006

3007

if (!PageCgroupUsed(pc))

3007

if (!PageCgroupUsed(pc))

3008

goto unlock_out;

3008

goto unlock_out;

3009

3010

anon = PageAnon(page);

3010

anon = PageAnon(page);

3011

3012

switch (ctype) {

3012

switch (ctype) {

3013

case MEM_CGROUP_CHARGE_TYPE_ANON:

3013

case MEM_CGROUP_CHARGE_TYPE_ANON:

3014

/*

3014

/*

3015

* Generally PageAnon tells if it's the anon statistics to be

3015

* Generally PageAnon tells if it's the anon statistics to be

3016

* updated; but sometimes e.g. mem_cgroup_uncharge_page() is

3016

* updated; but sometimes e.g. mem_cgroup_uncharge_page() is

3017

* used before page reached the stage of being marked PageAnon.

3017

* used before page reached the stage of being marked PageAnon.

3018

*/

3018

*/

3019

anon = true;

3019

anon = true;

3020

/* fallthrough */

3020

/* fallthrough */

3021

case MEM_CGROUP_CHARGE_TYPE_DROP:

3021

case MEM_CGROUP_CHARGE_TYPE_DROP:

3022

/* See mem_cgroup_prepare_migration() */

3022

/* See mem_cgroup_prepare_migration() */

3023

if (page_mapped(page))

3023

if (page_mapped(page))

3024

goto unlock_out;

3024

goto unlock_out;

3025

/*

3025

/*

3026

* Pages under migration may not be uncharged. But

3026

* Pages under migration may not be uncharged. But

3027

* end_migration() /must/ be the one uncharging the

3027

* end_migration() /must/ be the one uncharging the

3028

* unused post-migration page and so it has to call

3028

* unused post-migration page and so it has to call

3029

* here with the migration bit still set. See the

3029

* here with the migration bit still set. See the

3030

* res_counter handling below.

3030

* res_counter handling below.

3031

*/

3031

*/

3032

if (!end_migration && PageCgroupMigration(pc))

3032

if (!end_migration && PageCgroupMigration(pc))

3033

goto unlock_out;

3033

goto unlock_out;

3034

break;

3034

break;

3035

case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:

3035

case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:

3036

if (!PageAnon(page)) { /* Shared memory */

3036

if (!PageAnon(page)) { /* Shared memory */

3037

if (page->mapping && !page_is_file_cache(page))

3037

if (page->mapping && !page_is_file_cache(page))

3038

goto unlock_out;

3038

goto unlock_out;

3039

} else if (page_mapped(page)) /* Anon */

3039

} else if (page_mapped(page)) /* Anon */

3040

goto unlock_out;

3040

goto unlock_out;

3041

break;

3041

break;

3042

default:

3042

default:

3043

break;

3043

break;

3044

}

3044

}

3045

3046

mem_cgroup_charge_statistics(memcg, anon, -nr_pages);

3046

mem_cgroup_charge_statistics(memcg, anon, -nr_pages);

3047

3048

ClearPageCgroupUsed(pc);

3048

ClearPageCgroupUsed(pc);

3049

/*

3049

/*

3050

* pc->mem_cgroup is not cleared here. It will be accessed when it's

3050

* pc->mem_cgroup is not cleared here. It will be accessed when it's

3051

* freed from LRU. This is safe because uncharged page is expected not

3051

* freed from LRU. This is safe because uncharged page is expected not

3052

* to be reused (freed soon). Exception is SwapCache, it's handled by

3052

* to be reused (freed soon). Exception is SwapCache, it's handled by

3053

* special functions.

3053

* special functions.

3054

*/

3054

*/

3055

3056

unlock_page_cgroup(pc);

3056

unlock_page_cgroup(pc);

3057

/*

3057

/*

3058

* even after unlock, we have memcg->res.usage here and this memcg

3058

* even after unlock, we have memcg->res.usage here and this memcg

3059

* will never be freed.

3059

* will never be freed.

3060

*/

3060

*/

3061

memcg_check_events(memcg, page);

3061

memcg_check_events(memcg, page);

3062

if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {

3062

if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {

3063

mem_cgroup_swap_statistics(memcg, true);

3063

mem_cgroup_swap_statistics(memcg, true);

3064

mem_cgroup_get(memcg);

3064

mem_cgroup_get(memcg);

3065

}

3065

}

3066

/*

3066

/*

3067

* Migration does not charge the res_counter for the

3067

* Migration does not charge the res_counter for the

3068

* replacement page, so leave it alone when phasing out the

3068

* replacement page, so leave it alone when phasing out the

3069

* page that is unused after the migration.

3069

* page that is unused after the migration.

3070

*/

3070

*/

3071

if (!end_migration && !mem_cgroup_is_root(memcg))

3071

if (!end_migration && !mem_cgroup_is_root(memcg))

3072

mem_cgroup_do_uncharge(memcg, nr_pages, ctype);

3072

mem_cgroup_do_uncharge(memcg, nr_pages, ctype);

3073

3074

return memcg;

3074

return memcg;

3075

3076

unlock_out:

3076

unlock_out:

3077

unlock_page_cgroup(pc);

3077

unlock_page_cgroup(pc);

3078

return NULL;

3078

return NULL;

3079

}

3079

}

3080

3081

void mem_cgroup_uncharge_page(struct page *page)

3081

void mem_cgroup_uncharge_page(struct page *page)

3082

{

3082

{

3083

/* early check. */

3083

/* early check. */

3084

if (page_mapped(page))

3084

if (page_mapped(page))

3085

return;

3085

return;

3086

VM_BUG_ON(page->mapping && !PageAnon(page));

3086

VM_BUG_ON(page->mapping && !PageAnon(page));

3087

if (PageSwapCache(page))

3087

if (PageSwapCache(page))

3088

return;

3088

return;

3089

__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);

3089

__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);

3090

}

3090

}

3091

3092

void mem_cgroup_uncharge_cache_page(struct page *page)

3092

void mem_cgroup_uncharge_cache_page(struct page *page)

3093

{

3093

{

3094

VM_BUG_ON(page_mapped(page));

3094

VM_BUG_ON(page_mapped(page));

3095

VM_BUG_ON(page->mapping);

3095

VM_BUG_ON(page->mapping);

3096

if (PageSwapCache(page))

3097

return;

3098

__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);

3096

__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);

3099

}

3097

}

3100

3098

3101

/*

3099

/*

3102

* Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.

3100

* Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.

3103

* In those cases, pages are freed continuously and we can expect pages

3101

* In those cases, pages are freed continuously and we can expect pages

3104

* are in the same memcg. All these calls itself limits the number of

3102

* are in the same memcg. All these calls itself limits the number of

3105

* pages freed at once, then uncharge_start/end() is called properly.

3103

* pages freed at once, then uncharge_start/end() is called properly.

3106

* This may be called multiple (2) times in a context.

3104

* This may be called multiple (2) times in a context.

3107

*/

3105

*/

3108

3106

3109

void mem_cgroup_uncharge_start(void)

3107

void mem_cgroup_uncharge_start(void)

3110

{

3108

{

3111

current->memcg_batch.do_batch++;

3109

current->memcg_batch.do_batch++;

3112

/* We can do nest. */

3110

/* We can do nest. */

3113

if (current->memcg_batch.do_batch == 1) {

3111

if (current->memcg_batch.do_batch == 1) {

3114

current->memcg_batch.memcg = NULL;

3112

current->memcg_batch.memcg = NULL;

3115

current->memcg_batch.nr_pages = 0;

3113

current->memcg_batch.nr_pages = 0;

3116

current->memcg_batch.memsw_nr_pages = 0;

3114

current->memcg_batch.memsw_nr_pages = 0;

3117

}

3115

}

3118

}

3116

}

3119

3117

3120

void mem_cgroup_uncharge_end(void)

3118

void mem_cgroup_uncharge_end(void)

3121

{

3119

{

3122

struct memcg_batch_info *batch = &current->memcg_batch;

3120

struct memcg_batch_info *batch = &current->memcg_batch;

3123

3121

3124

if (!batch->do_batch)

3122

if (!batch->do_batch)

3125

return;

3123

return;

3126

3124

3127

batch->do_batch--;

3125

batch->do_batch--;

3128

if (batch->do_batch) /* If stacked, do nothing. */

3126

if (batch->do_batch) /* If stacked, do nothing. */

3129

return;

3127

return;

3130

3128

3131

if (!batch->memcg)

3129

if (!batch->memcg)

3132

return;

3130

return;

3133

/*

3131

/*

3134

* This "batch->memcg" is valid without any css_get/put etc...

3132

* This "batch->memcg" is valid without any css_get/put etc...

3135

* because we hide charges behind us.

3133

* because we hide charges behind us.

3136

*/

3134

*/

3137

if (batch->nr_pages)

3135

if (batch->nr_pages)

3138

res_counter_uncharge(&batch->memcg->res,

3136

res_counter_uncharge(&batch->memcg->res,

3139

batch->nr_pages * PAGE_SIZE);

3137

batch->nr_pages * PAGE_SIZE);

3140

if (batch->memsw_nr_pages)

3138

if (batch->memsw_nr_pages)

3141

res_counter_uncharge(&batch->memcg->memsw,

3139

res_counter_uncharge(&batch->memcg->memsw,

3142

batch->memsw_nr_pages * PAGE_SIZE);

3140

batch->memsw_nr_pages * PAGE_SIZE);

3143

memcg_oom_recover(batch->memcg);

3141

memcg_oom_recover(batch->memcg);

3144

/* forget this pointer (for sanity check) */

3142

/* forget this pointer (for sanity check) */

3145

batch->memcg = NULL;

3143

batch->memcg = NULL;

3146

}

3144

}

3147

3145

3148

#ifdef CONFIG_SWAP

3146

#ifdef CONFIG_SWAP

3149

/*

3147

/*

3150

* called after __delete_from_swap_cache() and drop "page" account.

3148

* called after __delete_from_swap_cache() and drop "page" account.

3151

* memcg information is recorded to swap_cgroup of "ent"

3149

* memcg information is recorded to swap_cgroup of "ent"

3152

*/

3150

*/

3153

void

3151

void

3154

mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)

3152

mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)

3155

{

3153

{

3156

struct mem_cgroup *memcg;

3154

struct mem_cgroup *memcg;

3157

int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;

3155

int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;

3158

3156

3159

if (!swapout) /* this was a swap cache but the swap is unused ! */

3157

if (!swapout) /* this was a swap cache but the swap is unused ! */

3160

ctype = MEM_CGROUP_CHARGE_TYPE_DROP;

3158

ctype = MEM_CGROUP_CHARGE_TYPE_DROP;

3161

3159

3162

if (PageSwapCache(page))

3163

return;

3164

memcg = __mem_cgroup_uncharge_common(page, ctype, false);

3160

memcg = __mem_cgroup_uncharge_common(page, ctype, false);

3165

3161

3166

/*

3162

/*

3167

* record memcg information, if swapout && memcg != NULL,

3163

* record memcg information, if swapout && memcg != NULL,

3168

* mem_cgroup_get() was called in uncharge().

3164

* mem_cgroup_get() was called in uncharge().

3169

*/

3165

*/

3170

if (do_swap_account && swapout && memcg)

3166

if (do_swap_account && swapout && memcg)

3171

swap_cgroup_record(ent, css_id(&memcg->css));

3167

swap_cgroup_record(ent, css_id(&memcg->css));

3172

}

3168

}

3173

#endif

3169

#endif

3174

3170

3175

#ifdef CONFIG_MEMCG_SWAP

3171

#ifdef CONFIG_MEMCG_SWAP

3176

/*

3172

/*

3177

* called from swap_entry_free(). remove record in swap_cgroup and

3173

* called from swap_entry_free(). remove record in swap_cgroup and

3178

* uncharge "memsw" account.

3174

* uncharge "memsw" account.

3179

*/

3175

*/

3180

void mem_cgroup_uncharge_swap(swp_entry_t ent)

3176

void mem_cgroup_uncharge_swap(swp_entry_t ent)

3181

{

3177

{

3182

struct mem_cgroup *memcg;

3178

struct mem_cgroup *memcg;

3183

unsigned short id;

3179

unsigned short id;

3184

3180

3185

if (!do_swap_account)

3181

if (!do_swap_account)

3186

return;

3182

return;

3187

3183

3188

id = swap_cgroup_record(ent, 0);

3184

id = swap_cgroup_record(ent, 0);

3189

rcu_read_lock();

3185

rcu_read_lock();

3190

memcg = mem_cgroup_lookup(id);

3186

memcg = mem_cgroup_lookup(id);

3191

if (memcg) {

3187

if (memcg) {

3192

/*

3188

/*

3193

* We uncharge this because swap is freed.

3189

* We uncharge this because swap is freed.

3194

* This memcg can be an obsolete one. We avoid calling css_tryget

3190

* This memcg can be an obsolete one. We avoid calling css_tryget

3195

*/

3191

*/

3196

if (!mem_cgroup_is_root(memcg))

3192

if (!mem_cgroup_is_root(memcg))

3197

res_counter_uncharge(&memcg->memsw, PAGE_SIZE);

3193

res_counter_uncharge(&memcg->memsw, PAGE_SIZE);

3198

mem_cgroup_swap_statistics(memcg, false);

3194

mem_cgroup_swap_statistics(memcg, false);

3199

mem_cgroup_put(memcg);

3195

mem_cgroup_put(memcg);

3200

}

3196

}

3201

rcu_read_unlock();

3197

rcu_read_unlock();

3202

}

3198

}

3203

3199

3204

/**

3200

/**

3205

* mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.

3201

* mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.

3206

* @entry: swap entry to be moved

3202

* @entry: swap entry to be moved

3207

* @from: mem_cgroup which the entry is moved from

3203

* @from: mem_cgroup which the entry is moved from

3208

* @to: mem_cgroup which the entry is moved to

3204

* @to: mem_cgroup which the entry is moved to

3209

*

3205

*

3210

* It succeeds only when the swap_cgroup's record for this entry is the same

3206

* It succeeds only when the swap_cgroup's record for this entry is the same

3211

* as the mem_cgroup's id of @from.

3207

* as the mem_cgroup's id of @from.

3212

*

3208

*

3213

* Returns 0 on success, -EINVAL on failure.

3209

* Returns 0 on success, -EINVAL on failure.

3214

*

3210

*

3215

* The caller must have charged to @to, IOW, called res_counter_charge() about

3211

* The caller must have charged to @to, IOW, called res_counter_charge() about

3216

* both res and memsw, and called css_get().

3212

* both res and memsw, and called css_get().

3217

*/

3213

*/

3218

static int mem_cgroup_move_swap_account(swp_entry_t entry,

3214

static int mem_cgroup_move_swap_account(swp_entry_t entry,

3219

struct mem_cgroup *from, struct mem_cgroup *to)

3215

struct mem_cgroup *from, struct mem_cgroup *to)

3220

{

3216

{

3221

unsigned short old_id, new_id;

3217

unsigned short old_id, new_id;

3222

3218

3223

old_id = css_id(&from->css);

3219

old_id = css_id(&from->css);

3224

new_id = css_id(&to->css);

3220

new_id = css_id(&to->css);

3225

3221

3226

if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {

3222

if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {

3227

mem_cgroup_swap_statistics(from, false);

3223

mem_cgroup_swap_statistics(from, false);

3228

mem_cgroup_swap_statistics(to, true);

3224

mem_cgroup_swap_statistics(to, true);

3229

/*

3225

/*

3230

* This function is only called from task migration context now.

3226

* This function is only called from task migration context now.

3231

* It postpones res_counter and refcount handling till the end

3227

* It postpones res_counter and refcount handling till the end

3232

* of task migration(mem_cgroup_clear_mc()) for performance

3228

* of task migration(mem_cgroup_clear_mc()) for performance

3233

* improvement. But we cannot postpone mem_cgroup_get(to)

3229

* improvement. But we cannot postpone mem_cgroup_get(to)

3234

* because if the process that has been moved to @to does

3230

* because if the process that has been moved to @to does

3235

* swap-in, the refcount of @to might be decreased to 0.

3231

* swap-in, the refcount of @to might be decreased to 0.

3236

*/

3232

*/

3237

mem_cgroup_get(to);

3233

mem_cgroup_get(to);

3238

return 0;

3234

return 0;

3239

}

3235

}

3240

return -EINVAL;

3236

return -EINVAL;

3241

}

3237

}

3242

#else

3238

#else

3243

static inline int mem_cgroup_move_swap_account(swp_entry_t entry,

3239

static inline int mem_cgroup_move_swap_account(swp_entry_t entry,

3244

struct mem_cgroup *from, struct mem_cgroup *to)

3240

struct mem_cgroup *from, struct mem_cgroup *to)

3245

{

3241

{

3246

return -EINVAL;

3242

return -EINVAL;

3247

}

3243

}

3248

#endif

3244

#endif

3249

3245

3250

/*

3246

/*

3251

* Before starting migration, account PAGE_SIZE to mem_cgroup that the old

3247

* Before starting migration, account PAGE_SIZE to mem_cgroup that the old

3252

* page belongs to.

3248

* page belongs to.

3253

*/

3249

*/

3254

void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,

3250

void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,

3255

struct mem_cgroup **memcgp)

3251

struct mem_cgroup **memcgp)

3256

{

3252

{

3257

struct mem_cgroup *memcg = NULL;

3253

struct mem_cgroup *memcg = NULL;

3258

struct page_cgroup *pc;

3254

struct page_cgroup *pc;

3259

enum charge_type ctype;

3255

enum charge_type ctype;

3260

3256

3261

*memcgp = NULL;

3257

*memcgp = NULL;

3262

3258

3263

VM_BUG_ON(PageTransHuge(page));

3259

VM_BUG_ON(PageTransHuge(page));

3264

if (mem_cgroup_disabled())

3260

if (mem_cgroup_disabled())

3265

return;

3261

return;

3266

3262

3267

pc = lookup_page_cgroup(page);

3263

pc = lookup_page_cgroup(page);

3268

lock_page_cgroup(pc);

3264

lock_page_cgroup(pc);

3269

if (PageCgroupUsed(pc)) {

3265

if (PageCgroupUsed(pc)) {

3270

memcg = pc->mem_cgroup;

3266

memcg = pc->mem_cgroup;

3271

css_get(&memcg->css);

3267

css_get(&memcg->css);

3272

/*

3268

/*

3273

* At migrating an anonymous page, its mapcount goes down

3269

* At migrating an anonymous page, its mapcount goes down

3274

* to 0 and uncharge() will be called. But, even if it's fully

3270

* to 0 and uncharge() will be called. But, even if it's fully

3275

* unmapped, migration may fail and this page has to be

3271

* unmapped, migration may fail and this page has to be

3276

* charged again. We set MIGRATION flag here and delay uncharge

3272

* charged again. We set MIGRATION flag here and delay uncharge

3277

* until end_migration() is called

3273

* until end_migration() is called

3278

*

3274

*

3279

* Corner Case Thinking

3275

* Corner Case Thinking

3280

* A)

3276

* A)

3281

* When the old page was mapped as Anon and it's unmap-and-freed

3277

* When the old page was mapped as Anon and it's unmap-and-freed

3282

* while migration was ongoing.

3278

* while migration was ongoing.

3283

* If unmap finds the old page, uncharge() of it will be delayed

3279

* If unmap finds the old page, uncharge() of it will be delayed

3284

* until end_migration(). If unmap finds a new page, it's

3280

* until end_migration(). If unmap finds a new page, it's

3285

* uncharged when it makes mapcount go 1->0. If unmap code

3281

* uncharged when it makes mapcount go 1->0. If unmap code

3286

* finds swap_migration_entry, the new page will not be mapped

3282

* finds swap_migration_entry, the new page will not be mapped

3287

* and end_migration() will find it(mapcount==0).

3283

* and end_migration() will find it(mapcount==0).

3288

*

3284

*

3289

* B)

3285

* B)

3290

* When the old page was mapped but migration fails, the kernel

3286

* When the old page was mapped but migration fails, the kernel

3291

* remaps it. A charge for it is kept by MIGRATION flag even

3287

* remaps it. A charge for it is kept by MIGRATION flag even

3292

* if mapcount goes down to 0. We can do remap successfully

3288

* if mapcount goes down to 0. We can do remap successfully

3293

* without charging it again.

3289

* without charging it again.

3294

*

3290

*

3295

* C)

3291

* C)

3296

* The "old" page is under lock_page() until the end of

3292

* The "old" page is under lock_page() until the end of

3297

* migration, so, the old page itself will not be swapped-out.

3293

* migration, so, the old page itself will not be swapped-out.

3298

* If the new page is swapped out before end_migration(), our

3294

* If the new page is swapped out before end_migration(), our

3299

* hook to usual swap-out path will catch the event.

3295

* hook to usual swap-out path will catch the event.

3300

*/

3296

*/

3301

if (PageAnon(page))

3297

if (PageAnon(page))

3302

SetPageCgroupMigration(pc);

3298

SetPageCgroupMigration(pc);

3303

}

3299

}

3304

unlock_page_cgroup(pc);

3300

unlock_page_cgroup(pc);

3305

/*

3301

/*

3306

* If the page is not charged at this point,

3302

* If the page is not charged at this point,

3307

* we return here.

3303

* we return here.

3308

*/

3304

*/

3309

if (!memcg)

3305

if (!memcg)

3310

return;

3306

return;

3311

3307

3312

*memcgp = memcg;

3308

*memcgp = memcg;

3313

/*

3309

/*

3314

* We charge new page before it's used/mapped. So, even if unlock_page()

3310

* We charge new page before it's used/mapped. So, even if unlock_page()

3315

* is called before end_migration, we can catch all events on this new

3311

* is called before end_migration, we can catch all events on this new

3316

* page. In the case new page is migrated but not remapped, new page's

3312

* page. In the case new page is migrated but not remapped, new page's

3317

* mapcount will be finally 0 and we call uncharge in end_migration().

3313

* mapcount will be finally 0 and we call uncharge in end_migration().

3318

*/

3314

*/

3319

if (PageAnon(page))

3315

if (PageAnon(page))

3320

ctype = MEM_CGROUP_CHARGE_TYPE_ANON;

3316

ctype = MEM_CGROUP_CHARGE_TYPE_ANON;

3321

else if (page_is_file_cache(page))

3317

else if (page_is_file_cache(page))

3322

ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;

3318

ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;

3323

else

3319

else

3324

ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;

3320

ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;

3325

/*

3321

/*

3326

* The page is committed to the memcg, but it's not actually

3322

* The page is committed to the memcg, but it's not actually

3327

* charged to the res_counter since we plan on replacing the

3323

* charged to the res_counter since we plan on replacing the

3328

* old one and only one page is going to be left afterwards.

3324

* old one and only one page is going to be left afterwards.

3329

*/

3325

*/

3330

__mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);

3326

__mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);

3331

}

3327

}

3332

3328

3333

/* remove redundant charge if migration failed*/

3329

/* remove redundant charge if migration failed*/

3334

void mem_cgroup_end_migration(struct mem_cgroup *memcg,

3330

void mem_cgroup_end_migration(struct mem_cgroup *memcg,

3335

struct page *oldpage, struct page *newpage, bool migration_ok)

3331

struct page *oldpage, struct page *newpage, bool migration_ok)

3336

{

3332

{

3337

struct page *used, *unused;

3333

struct page *used, *unused;

3338

struct page_cgroup *pc;

3334

struct page_cgroup *pc;

3339

bool anon;

3335

bool anon;

3340

3336

3341

if (!memcg)

3337

if (!memcg)

3342

return;

3338

return;

3343

/* blocks rmdir() */

3339

/* blocks rmdir() */

3344

cgroup_exclude_rmdir(&memcg->css);

3340

cgroup_exclude_rmdir(&memcg->css);

3345

if (!migration_ok) {

3341

if (!migration_ok) {

3346

used = oldpage;

3342

used = oldpage;

3347

unused = newpage;

3343

unused = newpage;

3348

} else {

3344

} else {

3349

used = newpage;

3345

used = newpage;

3350

unused = oldpage;

3346

unused = oldpage;

3351

}

3347

}

3352

anon = PageAnon(used);

3348

anon = PageAnon(used);

3353

if (!PageSwapCache(unused))

3349

__mem_cgroup_uncharge_common(unused,

3354

__mem_cgroup_uncharge_common(unused,

3350

anon ? MEM_CGROUP_CHARGE_TYPE_ANON

3355

anon ? MEM_CGROUP_CHARGE_TYPE_ANON

3351

: MEM_CGROUP_CHARGE_TYPE_CACHE,

3356

: MEM_CGROUP_CHARGE_TYPE_CACHE,

3352

true);

3357

true);

3358

css_put(&memcg->css);

3353

css_put(&memcg->css);

3359

/*

3354

/*

3360

* We disallowed uncharge of pages under migration because mapcount

3355

* We disallowed uncharge of pages under migration because mapcount

3361

* of the page goes down to zero, temporarily.

3356

* of the page goes down to zero, temporarily.

3362

* Clear the flag and check the page should be charged.

3357

* Clear the flag and check the page should be charged.

3363

*/

3358

*/

3364

pc = lookup_page_cgroup(oldpage);

3359

pc = lookup_page_cgroup(oldpage);

3365

lock_page_cgroup(pc);

3360

lock_page_cgroup(pc);

3366

ClearPageCgroupMigration(pc);

3361

ClearPageCgroupMigration(pc);

3367

unlock_page_cgroup(pc);

3362

unlock_page_cgroup(pc);

3368

3363

3369

/*

3364

/*

3370

* If a page is a file cache, radix-tree replacement is very atomic

3365

* If a page is a file cache, radix-tree replacement is very atomic

3371

* and we can skip this check. When it was an Anon page, its mapcount

3366

* and we can skip this check. When it was an Anon page, its mapcount

3372

* goes down to 0. But because we added the MIGRATION flag, it's not

3367

* goes down to 0. But because we added the MIGRATION flag, it's not

3373

* uncharged yet. There are several cases but page->mapcount check

3368

* uncharged yet. There are several cases but page->mapcount check

3374

* and USED bit check in mem_cgroup_uncharge_page() will do enough

3369

* and USED bit check in mem_cgroup_uncharge_page() will do enough

3375

* check. (see prepare_charge() also)

3370

* check. (see prepare_charge() also)

3376

*/

3371

*/

3377

if (anon)

3372

if (anon)

3378

mem_cgroup_uncharge_page(used);

3373

mem_cgroup_uncharge_page(used);

3379

/*

3374

/*

3380

* At migration, we may charge account against cgroup which has no

3375

* At migration, we may charge account against cgroup which has no

3381

* tasks.

3376

* tasks.

3382

* So, rmdir()->pre_destroy() can be called while we do this charge.

3377

* So, rmdir()->pre_destroy() can be called while we do this charge.

3383

* In that case, we need to call pre_destroy() again. check it here.

3378

* In that case, we need to call pre_destroy() again. check it here.

3384

*/

3379

*/

3385

cgroup_release_and_wakeup_rmdir(&memcg->css);

3380

cgroup_release_and_wakeup_rmdir(&memcg->css);

3386

}

3381

}

3387

3382

3388

/*

3383

/*

3389

* At replace page cache, newpage is not under any memcg but it's on

3384

* At replace page cache, newpage is not under any memcg but it's on

3390

* LRU. So, this function doesn't touch res_counter but handles LRU

3385

* LRU. So, this function doesn't touch res_counter but handles LRU

3391

* in correct way. Both pages are locked so we cannot race with uncharge.

3386

* in correct way. Both pages are locked so we cannot race with uncharge.

3392

*/

3387

*/

3393

void mem_cgroup_replace_page_cache(struct page *oldpage,

3388

void mem_cgroup_replace_page_cache(struct page *oldpage,

3394

struct page *newpage)

3389

struct page *newpage)

3395

{

3390

{

3396

struct mem_cgroup *memcg = NULL;

3391

struct mem_cgroup *memcg = NULL;

3397

struct page_cgroup *pc;

3392

struct page_cgroup *pc;

3398

enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;

3393

enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;

3399

3394

3400

if (mem_cgroup_disabled())

3395

if (mem_cgroup_disabled())

3401

return;

3396

return;

3402

3397

3403

pc = lookup_page_cgroup(oldpage);

3398

pc = lookup_page_cgroup(oldpage);

3404

/* fix accounting on old pages */

3399

/* fix accounting on old pages */

3405

lock_page_cgroup(pc);

3400

lock_page_cgroup(pc);

3406

if (PageCgroupUsed(pc)) {

3401

if (PageCgroupUsed(pc)) {

3407

memcg = pc->mem_cgroup;

3402

memcg = pc->mem_cgroup;

3408

mem_cgroup_charge_statistics(memcg, false, -1);

3403

mem_cgroup_charge_statistics(memcg, false, -1);

3409

ClearPageCgroupUsed(pc);

3404

ClearPageCgroupUsed(pc);

3410

}

3405

}

3411

unlock_page_cgroup(pc);

3406

unlock_page_cgroup(pc);

3412

3407

3413

/*

3408

/*

3414

* When called from shmem_replace_page(), in some cases the

3409

* When called from shmem_replace_page(), in some cases the

3415

* oldpage has already been charged, and in some cases not.

3410

* oldpage has already been charged, and in some cases not.

3416

*/

3411

*/

3417

if (!memcg)

3412

if (!memcg)

3418

return;

3413

return;

3419

3414

3420

if (PageSwapBacked(oldpage))

3415

if (PageSwapBacked(oldpage))

3421

type = MEM_CGROUP_CHARGE_TYPE_SHMEM;

3416

type = MEM_CGROUP_CHARGE_TYPE_SHMEM;

3422

3417

3423

/*

3418

/*

3424

* Even if newpage->mapping was NULL before starting replacement,

3419

* Even if newpage->mapping was NULL before starting replacement,

3425

* the newpage may be on LRU(or pagevec for LRU) already. We lock

3420

* the newpage may be on LRU(or pagevec for LRU) already. We lock

3426

* LRU while we overwrite pc->mem_cgroup.

3421

* LRU while we overwrite pc->mem_cgroup.

3427

*/

3422

*/

3428

__mem_cgroup_commit_charge(memcg, newpage, 1, type, true);

3423

__mem_cgroup_commit_charge(memcg, newpage, 1, type, true);

3429

}

3424

}

3430

3425

3431

#ifdef CONFIG_DEBUG_VM

3426

#ifdef CONFIG_DEBUG_VM

3432

static struct page_cgroup *lookup_page_cgroup_used(struct page *page)

3427

static struct page_cgroup *lookup_page_cgroup_used(struct page *page)

3433

{

3428

{

3434

struct page_cgroup *pc;

3429

struct page_cgroup *pc;

3435

3430

3436

pc = lookup_page_cgroup(page);

3431

pc = lookup_page_cgroup(page);

3437

/*

3432

/*

3438

* Can be NULL while feeding pages into the page allocator for

3433

* Can be NULL while feeding pages into the page allocator for

3439

* the first time, i.e. during boot or memory hotplug;

3434

* the first time, i.e. during boot or memory hotplug;

3440

* or when mem_cgroup_disabled().

3435

* or when mem_cgroup_disabled().

3441

*/

3436

*/

3442

if (likely(pc) && PageCgroupUsed(pc))

3437

if (likely(pc) && PageCgroupUsed(pc))

3443

return pc;

3438

return pc;

3444

return NULL;

3439

return NULL;

3445

}

3440

}

3446

3441

3447

bool mem_cgroup_bad_page_check(struct page *page)

3442

bool mem_cgroup_bad_page_check(struct page *page)

3448

{

3443

{

3449

if (mem_cgroup_disabled())

3444

if (mem_cgroup_disabled())

3450

return false;

3445

return false;

3451

3446

3452

return lookup_page_cgroup_used(page) != NULL;

3447

return lookup_page_cgroup_used(page) != NULL;

3453

}

3448

}

3454

3449

3455

void mem_cgroup_print_bad_page(struct page *page)

3450

void mem_cgroup_print_bad_page(struct page *page)

3456

{

3451

{

3457

struct page_cgroup *pc;

3452

struct page_cgroup *pc;

3458

3453

3459

pc = lookup_page_cgroup_used(page);

3454

pc = lookup_page_cgroup_used(page);

3460

if (pc) {

3455

if (pc) {

3461

printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",

3456

printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",

3462

pc, pc->flags, pc->mem_cgroup);

3457

pc, pc->flags, pc->mem_cgroup);

3463

}

3458

}

3464

}

3459

}

3465

#endif

3460

#endif

3466

3461

3467

static DEFINE_MUTEX(set_limit_mutex);

3462

static DEFINE_MUTEX(set_limit_mutex);

3468

3463

3469

static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,

3464

static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,

3470

unsigned long long val)

3465

unsigned long long val)

3471

{

3466

{

3472

int retry_count;

3467

int retry_count;

3473

u64 memswlimit, memlimit;

3468

u64 memswlimit, memlimit;

3474

int ret = 0;

3469

int ret = 0;

3475

int children = mem_cgroup_count_children(memcg);

3470

int children = mem_cgroup_count_children(memcg);

3476

u64 curusage, oldusage;

3471

u64 curusage, oldusage;

3477

int enlarge;

3472

int enlarge;

3478

3473

3479

/*

3474

/*

3480

* For keeping hierarchical_reclaim simple, how long we should retry

3475

* For keeping hierarchical_reclaim simple, how long we should retry

3481

* depends on callers. We set our retry-count to be a function

3476

* depends on callers. We set our retry-count to be a function

3482

* of # of children which we should visit in this loop.

3477

* of # of children which we should visit in this loop.

3483

*/

3478

*/

3484

retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;

3479

retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;

3485

3480

3486

oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);

3481

oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);

3487

3482

3488

enlarge = 0;

3483

enlarge = 0;

3489

while (retry_count) {

3484

while (retry_count) {

3490

if (signal_pending(current)) {

3485

if (signal_pending(current)) {

3491

ret = -EINTR;

3486

ret = -EINTR;

3492

break;

3487

break;

3493

}

3488

}

3494

/*

3489

/*

3495

* Rather than hide all in some function, I do this in

3490

* Rather than hide all in some function, I do this in

3496

* open coded manner. You see what this really does.

3491

* open coded manner. You see what this really does.

3497

* We have to guarantee memcg->res.limit <= memcg->memsw.limit.

3492

* We have to guarantee memcg->res.limit <= memcg->memsw.limit.

3498

*/

3493

*/

3499

mutex_lock(&set_limit_mutex);

3494

mutex_lock(&set_limit_mutex);

3500

memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

3495

memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

3501

if (memswlimit < val) {

3496

if (memswlimit < val) {

3502

ret = -EINVAL;

3497

ret = -EINVAL;

3503

mutex_unlock(&set_limit_mutex);

3498

mutex_unlock(&set_limit_mutex);

3504

break;

3499

break;

3505

}

3500

}

3506

3501

3507

memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);

3502

memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);

3508

if (memlimit < val)

3503

if (memlimit < val)

3509

enlarge = 1;

3504

enlarge = 1;

3510

3505

3511

ret = res_counter_set_limit(&memcg->res, val);

3506

ret = res_counter_set_limit(&memcg->res, val);

3512

if (!ret) {

3507

if (!ret) {

3513

if (memswlimit == val)

3508

if (memswlimit == val)

3514

memcg->memsw_is_minimum = true;

3509

memcg->memsw_is_minimum = true;

3515

else

3510

else

3516

memcg->memsw_is_minimum = false;

3511

memcg->memsw_is_minimum = false;

3517

}

3512

}

3518

mutex_unlock(&set_limit_mutex);

3513

mutex_unlock(&set_limit_mutex);

3519

3514

3520

if (!ret)

3515

if (!ret)

3521

break;

3516

break;

3522

3517

3523

mem_cgroup_reclaim(memcg, GFP_KERNEL,

3518

mem_cgroup_reclaim(memcg, GFP_KERNEL,

3524

MEM_CGROUP_RECLAIM_SHRINK);

3519

MEM_CGROUP_RECLAIM_SHRINK);

3525

curusage = res_counter_read_u64(&memcg->res, RES_USAGE);

3520

curusage = res_counter_read_u64(&memcg->res, RES_USAGE);

3526

/* Usage is reduced ? */

3521

/* Usage is reduced ? */

3527

if (curusage >= oldusage)

3522

if (curusage >= oldusage)

3528

retry_count--;

3523

retry_count--;

3529

else

3524

else

3530

oldusage = curusage;

3525

oldusage = curusage;

3531

}

3526

}

3532

if (!ret && enlarge)

3527

if (!ret && enlarge)

3533

memcg_oom_recover(memcg);

3528

memcg_oom_recover(memcg);

3534

3529

3535

return ret;

3530

return ret;

3536

}

3531

}

3537

3532

3538

static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,

3533

static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,

3539

unsigned long long val)

3534

unsigned long long val)

3540

{

3535

{

3541

int retry_count;

3536

int retry_count;

3542

u64 memlimit, memswlimit, oldusage, curusage;

3537

u64 memlimit, memswlimit, oldusage, curusage;

3543

int children = mem_cgroup_count_children(memcg);

3538

int children = mem_cgroup_count_children(memcg);

3544

int ret = -EBUSY;

3539

int ret = -EBUSY;

3545

int enlarge = 0;

3540

int enlarge = 0;

3546

3541

3547

/* see mem_cgroup_resize_res_limit */

3542

/* see mem_cgroup_resize_res_limit */

3548

retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;

3543

retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;

3549

oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);

3544

oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);

3550

while (retry_count) {

3545

while (retry_count) {

3551

if (signal_pending(current)) {

3546

if (signal_pending(current)) {

3552

ret = -EINTR;

3547

ret = -EINTR;

3553

break;

3548

break;

3554

}

3549

}

3555

/*

3550

/*

3556

* Rather than hide all in some function, I do this in

3551

* Rather than hide all in some function, I do this in

3557

* open coded manner. You see what this really does.

3552

* open coded manner. You see what this really does.

3558

* We have to guarantee memcg->res.limit <= memcg->memsw.limit.

3553

* We have to guarantee memcg->res.limit <= memcg->memsw.limit.

3559

*/

3554

*/

3560

mutex_lock(&set_limit_mutex);

3555

mutex_lock(&set_limit_mutex);

3561

memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);

3556

memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);

3562

if (memlimit > val) {

3557

if (memlimit > val) {

3563

ret = -EINVAL;

3558

ret = -EINVAL;

3564

mutex_unlock(&set_limit_mutex);

3559

mutex_unlock(&set_limit_mutex);

3565

break;

3560

break;

3566

}

3561

}

3567

memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

3562

memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

3568

if (memswlimit < val)

3563

if (memswlimit < val)

3569

enlarge = 1;

3564

enlarge = 1;

3570

ret = res_counter_set_limit(&memcg->memsw, val);

3565

ret = res_counter_set_limit(&memcg->memsw, val);

3571

if (!ret) {

3566

if (!ret) {

3572

if (memlimit == val)

3567

if (memlimit == val)

3573

memcg->memsw_is_minimum = true;

3568

memcg->memsw_is_minimum = true;

3574

else

3569

else

3575

memcg->memsw_is_minimum = false;

3570

memcg->memsw_is_minimum = false;

3576

}

3571

}

3577

mutex_unlock(&set_limit_mutex);

3572

mutex_unlock(&set_limit_mutex);

3578

3573

3579

if (!ret)

3574

if (!ret)

3580

break;

3575

break;

3581

3576

3582

mem_cgroup_reclaim(memcg, GFP_KERNEL,

3577

mem_cgroup_reclaim(memcg, GFP_KERNEL,

3583

MEM_CGROUP_RECLAIM_NOSWAP |

3578

MEM_CGROUP_RECLAIM_NOSWAP |

3584

MEM_CGROUP_RECLAIM_SHRINK);

3579

MEM_CGROUP_RECLAIM_SHRINK);

3585

curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);

3580

curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);

3586

/* Usage is reduced ? */

3581

/* Usage is reduced ? */

3587

if (curusage >= oldusage)

3582

if (curusage >= oldusage)

3588

retry_count--;

3583

retry_count--;

3589

else

3584

else

3590

oldusage = curusage;

3585

oldusage = curusage;

3591

}

3586

}

3592

if (!ret && enlarge)

3587

if (!ret && enlarge)

3593

memcg_oom_recover(memcg);

3588

memcg_oom_recover(memcg);

3594

return ret;

3589

return ret;

3595

}

3590

}

3596

3591

3597

unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,

3592

unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,

3598

gfp_t gfp_mask,

3593

gfp_t gfp_mask,

3599

unsigned long *total_scanned)

3594

unsigned long *total_scanned)

3600

{

3595

{

3601

unsigned long nr_reclaimed = 0;

3596

unsigned long nr_reclaimed = 0;

3602

struct mem_cgroup_per_zone *mz, *next_mz = NULL;

3597

struct mem_cgroup_per_zone *mz, *next_mz = NULL;

3603

unsigned long reclaimed;

3598

unsigned long reclaimed;

3604

int loop = 0;

3599

int loop = 0;

3605

struct mem_cgroup_tree_per_zone *mctz;

3600

struct mem_cgroup_tree_per_zone *mctz;

3606

unsigned long long excess;

3601

unsigned long long excess;

3607

unsigned long nr_scanned;

3602

unsigned long nr_scanned;

3608

3603

3609

if (order > 0)

3604

if (order > 0)

3610

return 0;

3605

return 0;

3611

3606

3612

mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));

3607

mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));

3613

/*

3608

/*

3614

* This loop can run a while, specially if mem_cgroup's continuously

3609

* This loop can run a while, specially if mem_cgroup's continuously

3615

* keep exceeding their soft limit and putting the system under

3610

* keep exceeding their soft limit and putting the system under

3616

* pressure

3611

* pressure

3617

*/

3612

*/

3618

do {

3613

do {

3619

if (next_mz)

3614

if (next_mz)

3620

mz = next_mz;

3615

mz = next_mz;

3621

else

3616

else

3622

mz = mem_cgroup_largest_soft_limit_node(mctz);

3617

mz = mem_cgroup_largest_soft_limit_node(mctz);

3623

if (!mz)

3618

if (!mz)

3624

break;

3619

break;

3625

3620

3626

nr_scanned = 0;

3621

nr_scanned = 0;

3627

reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,

3622

reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,

3628

gfp_mask, &nr_scanned);

3623

gfp_mask, &nr_scanned);

3629

nr_reclaimed += reclaimed;

3624

nr_reclaimed += reclaimed;

3630

*total_scanned += nr_scanned;

3625

*total_scanned += nr_scanned;

3631

spin_lock(&mctz->lock);

3626

spin_lock(&mctz->lock);

3632

3627

3633

/*

3628

/*

3634

* If we failed to reclaim anything from this memory cgroup

3629

* If we failed to reclaim anything from this memory cgroup

3635

* it is time to move on to the next cgroup

3630

* it is time to move on to the next cgroup

3636

*/

3631

*/

3637

next_mz = NULL;

3632

next_mz = NULL;

3638

if (!reclaimed) {

3633

if (!reclaimed) {

3639

do {

3634

do {

3640

/*

3635

/*

3641

* Loop until we find yet another one.

3636

* Loop until we find yet another one.

3642

*

3637

*

3643

* By the time we get the soft_limit lock

3638

* By the time we get the soft_limit lock

3644

* again, someone might have aded the

3639

* again, someone might have aded the

3645

* group back on the RB tree. Iterate to

3640

* group back on the RB tree. Iterate to

3646

* make sure we get a different mem.

3641

* make sure we get a different mem.

3647

* mem_cgroup_largest_soft_limit_node returns

3642

* mem_cgroup_largest_soft_limit_node returns

3648

* NULL if no other cgroup is present on

3643

* NULL if no other cgroup is present on

3649

* the tree

3644

* the tree

3650

*/

3645

*/

3651

next_mz =

3646

next_mz =

3652

__mem_cgroup_largest_soft_limit_node(mctz);

3647

__mem_cgroup_largest_soft_limit_node(mctz);

3653

if (next_mz == mz)

3648

if (next_mz == mz)

3654

css_put(&next_mz->memcg->css);

3649

css_put(&next_mz->memcg->css);

3655

else /* next_mz == NULL or other memcg */

3650

else /* next_mz == NULL or other memcg */

3656

break;

3651

break;

3657

} while (1);

3652

} while (1);

3658

}

3653

}

3659

__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);

3654

__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);

3660

excess = res_counter_soft_limit_excess(&mz->memcg->res);

3655

excess = res_counter_soft_limit_excess(&mz->memcg->res);

3661

/*

3656

/*

3662

* One school of thought says that we should not add

3657

* One school of thought says that we should not add

3663

* back the node to the tree if reclaim returns 0.

3658

* back the node to the tree if reclaim returns 0.

3664

* But our reclaim could return 0, simply because due

3659

* But our reclaim could return 0, simply because due

3665

* to priority we are exposing a smaller subset of

3660

* to priority we are exposing a smaller subset of

3666

* memory to reclaim from. Consider this as a longer

3661

* memory to reclaim from. Consider this as a longer

3667

* term TODO.

3662

* term TODO.

3668

*/

3663

*/

3669

/* If excess == 0, no tree ops */

3664

/* If excess == 0, no tree ops */

3670

__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);

3665

__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);

3671

spin_unlock(&mctz->lock);

3666

spin_unlock(&mctz->lock);

3672

css_put(&mz->memcg->css);

3667

css_put(&mz->memcg->css);

3673

loop++;

3668

loop++;

3674

/*

3669

/*

3675

* Could not reclaim anything and there are no more

3670

* Could not reclaim anything and there are no more

3676

* mem cgroups to try or we seem to be looping without

3671

* mem cgroups to try or we seem to be looping without

3677

* reclaiming anything.

3672

* reclaiming anything.

3678

*/

3673

*/

3679

if (!nr_reclaimed &&

3674

if (!nr_reclaimed &&

3680

(next_mz == NULL ||

3675

(next_mz == NULL ||

3681

loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))

3676

loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))

3682

break;

3677

break;

3683

} while (!nr_reclaimed);

3678

} while (!nr_reclaimed);

3684

if (next_mz)

3679

if (next_mz)

3685

css_put(&next_mz->memcg->css);

3680

css_put(&next_mz->memcg->css);

3686

return nr_reclaimed;

3681

return nr_reclaimed;

3687

}

3682

}

3688

3683

3689

/*

3684

/*

3690

* Traverse a specified page_cgroup list and try to drop them all. This doesn't

3685

* Traverse a specified page_cgroup list and try to drop them all. This doesn't

3691

* reclaim the pages page themselves - it just removes the page_cgroups.

3686

* reclaim the pages page themselves - it just removes the page_cgroups.

3692

* Returns true if some page_cgroups were not freed, indicating that the caller

3687

* Returns true if some page_cgroups were not freed, indicating that the caller

3693

* must retry this operation.

3688

* must retry this operation.

3694

*/

3689

*/

3695

static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,

3690

static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,

3696

int node, int zid, enum lru_list lru)

3691

int node, int zid, enum lru_list lru)

3697

{

3692

{

3698

struct mem_cgroup_per_zone *mz;

3693

struct mem_cgroup_per_zone *mz;

3699

unsigned long flags, loop;

3694

unsigned long flags, loop;

3700

struct list_head *list;

3695

struct list_head *list;

3701

struct page *busy;

3696

struct page *busy;

3702

struct zone *zone;

3697

struct zone *zone;

3703

3698

3704

zone = &NODE_DATA(node)->node_zones[zid];

3699

zone = &NODE_DATA(node)->node_zones[zid];

3705

mz = mem_cgroup_zoneinfo(memcg, node, zid);

3700

mz = mem_cgroup_zoneinfo(memcg, node, zid);

3706

list = &mz->lruvec.lists[lru];

3701

list = &mz->lruvec.lists[lru];

3707

3702

3708

loop = mz->lru_size[lru];

3703

loop = mz->lru_size[lru];

3709

/* give some margin against EBUSY etc...*/

3704

/* give some margin against EBUSY etc...*/

3710

loop += 256;

3705

loop += 256;

3711

busy = NULL;

3706

busy = NULL;

3712

while (loop--) {

3707

while (loop--) {

3713

struct page_cgroup *pc;

3708

struct page_cgroup *pc;

3714

struct page *page;

3709

struct page *page;

3715

3710

3716

spin_lock_irqsave(&zone->lru_lock, flags);

3711

spin_lock_irqsave(&zone->lru_lock, flags);

3717

if (list_empty(list)) {

3712

if (list_empty(list)) {

3718

spin_unlock_irqrestore(&zone->lru_lock, flags);

3713

spin_unlock_irqrestore(&zone->lru_lock, flags);

3719

break;

3714

break;

3720

}

3715

}

3721

page = list_entry(list->prev, struct page, lru);

3716

page = list_entry(list->prev, struct page, lru);

3722

if (busy == page) {

3717

if (busy == page) {

3723

list_move(&page->lru, list);

3718

list_move(&page->lru, list);

3724

busy = NULL;

3719

busy = NULL;

3725

spin_unlock_irqrestore(&zone->lru_lock, flags);

3720

spin_unlock_irqrestore(&zone->lru_lock, flags);

3726

continue;

3721

continue;

3727

}

3722

}

3728

spin_unlock_irqrestore(&zone->lru_lock, flags);

3723

spin_unlock_irqrestore(&zone->lru_lock, flags);

3729

3724

3730

pc = lookup_page_cgroup(page);

3725

pc = lookup_page_cgroup(page);

3731

3726

3732

if (mem_cgroup_move_parent(page, pc, memcg)) {

3727

if (mem_cgroup_move_parent(page, pc, memcg)) {

3733

/* found lock contention or "pc" is obsolete. */

3728

/* found lock contention or "pc" is obsolete. */

3734

busy = page;

3729

busy = page;

3735

cond_resched();

3730

cond_resched();

3736

} else

3731

} else

3737

busy = NULL;

3732

busy = NULL;

3738

}

3733

}

3739

return !list_empty(list);

3734

return !list_empty(list);

3740

}

3735

}

3741

3736

3742

/*

3737

/*

3743

* make mem_cgroup's charge to be 0 if there is no task.

3738

* make mem_cgroup's charge to be 0 if there is no task.

3744

* This enables deleting this mem_cgroup.

3739

* This enables deleting this mem_cgroup.

3745

*/

3740

*/

3746

static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)

3741

static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)

3747

{

3742

{

3748

int ret;

3743

int ret;

3749

int node, zid, shrink;

3744

int node, zid, shrink;

3750

int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

3745

int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

3751

struct cgroup *cgrp = memcg->css.cgroup;

3746

struct cgroup *cgrp = memcg->css.cgroup;

3752

3747

3753

css_get(&memcg->css);

3748

css_get(&memcg->css);

3754

3749

3755

shrink = 0;

3750

shrink = 0;

3756

/* should free all ? */

3751

/* should free all ? */

3757

if (free_all)

3752

if (free_all)

3758

goto try_to_free;

3753

goto try_to_free;

3759

move_account:

3754

move_account:

3760

do {

3755

do {

3761

ret = -EBUSY;

3756

ret = -EBUSY;

3762

if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))

3757

if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))

3763

goto out;

3758

goto out;

3764

/* This is for making all *used* pages to be on LRU. */

3759

/* This is for making all *used* pages to be on LRU. */

3765

lru_add_drain_all();

3760

lru_add_drain_all();

3766

drain_all_stock_sync(memcg);

3761

drain_all_stock_sync(memcg);

3767

ret = 0;

3762

ret = 0;

3768

mem_cgroup_start_move(memcg);

3763

mem_cgroup_start_move(memcg);

3769

for_each_node_state(node, N_HIGH_MEMORY) {

3764

for_each_node_state(node, N_HIGH_MEMORY) {

3770

for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {

3765

for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {

3771

enum lru_list lru;

3766

enum lru_list lru;

3772

for_each_lru(lru) {

3767

for_each_lru(lru) {

3773

ret = mem_cgroup_force_empty_list(memcg,

3768

ret = mem_cgroup_force_empty_list(memcg,

3774

node, zid, lru);

3769

node, zid, lru);

3775

if (ret)

3770

if (ret)

3776

break;

3771

break;

3777

}

3772

}

3778

}

3773

}

3779

if (ret)

3774

if (ret)

3780

break;

3775

break;

3781

}

3776

}

3782

mem_cgroup_end_move(memcg);

3777

mem_cgroup_end_move(memcg);

3783

memcg_oom_recover(memcg);

3778

memcg_oom_recover(memcg);

3784

cond_resched();

3779

cond_resched();

3785

/* "ret" should also be checked to ensure all lists are empty. */

3780

/* "ret" should also be checked to ensure all lists are empty. */

3786

} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);

3781

} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);

3787

out:

3782

out:

3788

css_put(&memcg->css);

3783

css_put(&memcg->css);

3789

return ret;

3784

return ret;

3790

3785

3791

try_to_free:

3786

try_to_free:

3792

/* returns EBUSY if there is a task or if we come here twice. */

3787

/* returns EBUSY if there is a task or if we come here twice. */

3793

if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {

3788

if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {

3794

ret = -EBUSY;

3789

ret = -EBUSY;

3795

goto out;

3790

goto out;

3796

}

3791

}

3797

/* we call try-to-free pages for make this cgroup empty */

3792

/* we call try-to-free pages for make this cgroup empty */

3798

lru_add_drain_all();

3793

lru_add_drain_all();

3799

/* try to free all pages in this cgroup */

3794

/* try to free all pages in this cgroup */

3800

shrink = 1;

3795

shrink = 1;

3801

while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {

3796

while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {

3802

int progress;

3797

int progress;

3803

3798

3804

if (signal_pending(current)) {

3799

if (signal_pending(current)) {

3805

ret = -EINTR;

3800

ret = -EINTR;

3806

goto out;

3801

goto out;

3807

}

3802

}

3808

progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,

3803

progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,

3809

false);

3804

false);

3810

if (!progress) {

3805

if (!progress) {

3811

nr_retries--;

3806

nr_retries--;

3812

/* maybe some writeback is necessary */

3807

/* maybe some writeback is necessary */

3813

congestion_wait(BLK_RW_ASYNC, HZ/10);

3808

congestion_wait(BLK_RW_ASYNC, HZ/10);

3814

}

3809

}

3815

3810

3816

}

3811

}

3817

lru_add_drain();

3812

lru_add_drain();

3818

/* try move_account...there may be some *locked* pages. */

3813

/* try move_account...there may be some *locked* pages. */

3819

goto move_account;

3814

goto move_account;

3820

}

3815

}

3821

3816

3822

static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)

3817

static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)

3823

{

3818

{

3824

return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);

3819

return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);

3825

}

3820

}

3826

3821

3827

3822

3828

static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)

3823

static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)

3829

{

3824

{

3830

return mem_cgroup_from_cont(cont)->use_hierarchy;

3825

return mem_cgroup_from_cont(cont)->use_hierarchy;

3831

}

3826

}

3832

3827

3833

static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,

3828

static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,

3834

u64 val)

3829

u64 val)

3835

{

3830

{

3836

int retval = 0;

3831

int retval = 0;

3837

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

3832

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

3838

struct cgroup *parent = cont->parent;

3833

struct cgroup *parent = cont->parent;

3839

struct mem_cgroup *parent_memcg = NULL;

3834

struct mem_cgroup *parent_memcg = NULL;

3840

3835

3841

if (parent)

3836

if (parent)

3842

parent_memcg = mem_cgroup_from_cont(parent);

3837

parent_memcg = mem_cgroup_from_cont(parent);

3843

3838

3844

cgroup_lock();

3839

cgroup_lock();

3845

3840

3846

if (memcg->use_hierarchy == val)

3841

if (memcg->use_hierarchy == val)

3847

goto out;

3842

goto out;

3848

3843

3849

/*

3844

/*

3850

* If parent's use_hierarchy is set, we can't make any modifications

3845

* If parent's use_hierarchy is set, we can't make any modifications

3851

* in the child subtrees. If it is unset, then the change can

3846

* in the child subtrees. If it is unset, then the change can

3852

* occur, provided the current cgroup has no children.

3847

* occur, provided the current cgroup has no children.

3853

*

3848

*

3854

* For the root cgroup, parent_mem is NULL, we allow value to be

3849

* For the root cgroup, parent_mem is NULL, we allow value to be

3855

* set if there are no children.

3850

* set if there are no children.

3856

*/

3851

*/

3857

if ((!parent_memcg || !parent_memcg->use_hierarchy) &&

3852

if ((!parent_memcg || !parent_memcg->use_hierarchy) &&

3858

(val == 1 || val == 0)) {

3853

(val == 1 || val == 0)) {

3859

if (list_empty(&cont->children))

3854

if (list_empty(&cont->children))

3860

memcg->use_hierarchy = val;

3855

memcg->use_hierarchy = val;

3861

else

3856

else

3862

retval = -EBUSY;

3857

retval = -EBUSY;

3863

} else

3858

} else

3864

retval = -EINVAL;

3859

retval = -EINVAL;

3865

3860

3866

out:

3861

out:

3867

cgroup_unlock();

3862

cgroup_unlock();

3868

3863

3869

return retval;

3864

return retval;

3870

}

3865

}

3871

3866

3872

3867

3873

static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,

3868

static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,

3874

enum mem_cgroup_stat_index idx)

3869

enum mem_cgroup_stat_index idx)

3875

{

3870

{

3876

struct mem_cgroup *iter;

3871

struct mem_cgroup *iter;

3877

long val = 0;

3872

long val = 0;

3878

3873

3879

/* Per-cpu values can be negative, use a signed accumulator */

3874

/* Per-cpu values can be negative, use a signed accumulator */

3880

for_each_mem_cgroup_tree(iter, memcg)

3875

for_each_mem_cgroup_tree(iter, memcg)

3881

val += mem_cgroup_read_stat(iter, idx);

3876

val += mem_cgroup_read_stat(iter, idx);

3882

3877

3883

if (val < 0) /* race ? */

3878

if (val < 0) /* race ? */

3884

val = 0;

3879

val = 0;

3885

return val;

3880

return val;

3886

}

3881

}

3887

3882

3888

/*
 * Return the usage of @memcg; with @swap set the memory+swap usage is
 * returned instead of the plain memory usage.
 *
 * For a non-root group this is the RES_USAGE value of the matching
 * res_counter.  For the root group it is computed from the recursive
 * cache and RSS statistics (plus the swap statistic when @swap is set),
 * shifted left by PAGE_SHIFT.
 */
static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
	u64 pages;

	if (!mem_cgroup_is_root(memcg))
		return res_counter_read_u64(swap ? &memcg->memsw : &memcg->res,
					    RES_USAGE);

	pages = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
	pages += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
	if (swap)
		pages += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);

	return pages << PAGE_SHIFT;
}

3907

3902

3908

static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,

3903

static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,

3909

struct file *file, char __user *buf,

3904

struct file *file, char __user *buf,

3910

size_t nbytes, loff_t *ppos)

3905

size_t nbytes, loff_t *ppos)

3911

{

3906

{

3912

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

3907

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

3913

char str[64];

3908

char str[64];

3914

u64 val;

3909

u64 val;

3915

int type, name, len;

3910

int type, name, len;

3916

3911

3917

type = MEMFILE_TYPE(cft->private);

3912

type = MEMFILE_TYPE(cft->private);

3918

name = MEMFILE_ATTR(cft->private);

3913

name = MEMFILE_ATTR(cft->private);

3919

3914

3920

if (!do_swap_account && type == _MEMSWAP)

3915

if (!do_swap_account && type == _MEMSWAP)

3921

return -EOPNOTSUPP;

3916

return -EOPNOTSUPP;

3922

3917

3923

switch (type) {

3918

switch (type) {

3924

case _MEM:

3919

case _MEM:

3925

if (name == RES_USAGE)

3920

if (name == RES_USAGE)

3926

val = mem_cgroup_usage(memcg, false);

3921

val = mem_cgroup_usage(memcg, false);

3927

else

3922

else

3928

val = res_counter_read_u64(&memcg->res, name);

3923

val = res_counter_read_u64(&memcg->res, name);

3929

break;

3924

break;

3930

case _MEMSWAP:

3925

case _MEMSWAP:

3931

if (name == RES_USAGE)

3926

if (name == RES_USAGE)

3932

val = mem_cgroup_usage(memcg, true);

3927

val = mem_cgroup_usage(memcg, true);

3933

else

3928

else

3934

val = res_counter_read_u64(&memcg->memsw, name);

3929

val = res_counter_read_u64(&memcg->memsw, name);

3935

break;

3930

break;

3936

default:

3931

default:

3937

BUG();

3932

BUG();

3938

}

3933

}

3939

3934

3940

len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);

3935

len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);

3941

return simple_read_from_buffer(buf, nbytes, ppos, str, len);

3936

return simple_read_from_buffer(buf, nbytes, ppos, str, len);

3942

}

3937

}

3943

/*

3938

/*

3944

* The user of this function is...

3939

* The user of this function is...

3945

* RES_LIMIT.

3940

* RES_LIMIT.

3946

*/

3941

*/

3947

static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,

3942

static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,

3948

const char *buffer)

3943

const char *buffer)

3949

{

3944

{

3950

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

3945

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

3951

int type, name;

3946

int type, name;

3952

unsigned long long val;

3947

unsigned long long val;

3953

int ret;

3948

int ret;

3954

3949

3955

type = MEMFILE_TYPE(cft->private);

3950

type = MEMFILE_TYPE(cft->private);

3956

name = MEMFILE_ATTR(cft->private);

3951

name = MEMFILE_ATTR(cft->private);

3957

3952

3958

if (!do_swap_account && type == _MEMSWAP)

3953

if (!do_swap_account && type == _MEMSWAP)

3959

return -EOPNOTSUPP;

3954

return -EOPNOTSUPP;

3960

3955

3961

switch (name) {

3956

switch (name) {

3962

case RES_LIMIT:

3957

case RES_LIMIT:

3963

if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */

3958

if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */

3964

ret = -EINVAL;

3959

ret = -EINVAL;

3965

break;

3960

break;

3966

}

3961

}

3967

/* This function does all necessary parse...reuse it */

3962

/* This function does all necessary parse...reuse it */

3968

ret = res_counter_memparse_write_strategy(buffer, &val);

3963

ret = res_counter_memparse_write_strategy(buffer, &val);

3969

if (ret)

3964

if (ret)

3970

break;

3965

break;

3971

if (type == _MEM)

3966

if (type == _MEM)

3972

ret = mem_cgroup_resize_limit(memcg, val);

3967

ret = mem_cgroup_resize_limit(memcg, val);

3973

else

3968

else

3974

ret = mem_cgroup_resize_memsw_limit(memcg, val);

3969

ret = mem_cgroup_resize_memsw_limit(memcg, val);

3975

break;

3970

break;

3976

case RES_SOFT_LIMIT:

3971

case RES_SOFT_LIMIT:

3977

ret = res_counter_memparse_write_strategy(buffer, &val);

3972

ret = res_counter_memparse_write_strategy(buffer, &val);

3978

if (ret)

3973

if (ret)

3979

break;

3974

break;

3980

/*

3975

/*

3981

* For memsw, soft limits are hard to implement in terms

3976

* For memsw, soft limits are hard to implement in terms

3982

* of semantics, for now, we support soft limits for

3977

* of semantics, for now, we support soft limits for

3983

* control without swap

3978

* control without swap

3984

*/

3979

*/

3985

if (type == _MEM)

3980

if (type == _MEM)

3986

ret = res_counter_set_soft_limit(&memcg->res, val);

3981

ret = res_counter_set_soft_limit(&memcg->res, val);

3987

else

3982

else

3988

ret = -EINVAL;

3983

ret = -EINVAL;

3989

break;

3984

break;

3990

default:

3985

default:

3991

ret = -EINVAL; /* should be BUG() ? */

3986

ret = -EINVAL; /* should be BUG() ? */

3992

break;

3987

break;

3993

}

3988

}

3994

return ret;

3989

return ret;

3995

}

3990

}

3996

3991

3997

static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,

3992

static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,

3998

unsigned long long *mem_limit, unsigned long long *memsw_limit)

3993

unsigned long long *mem_limit, unsigned long long *memsw_limit)

3999

{

3994

{

4000

struct cgroup *cgroup;

3995

struct cgroup *cgroup;

4001

unsigned long long min_limit, min_memsw_limit, tmp;

3996

unsigned long long min_limit, min_memsw_limit, tmp;

4002

3997

4003

min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);

3998

min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);

4004

min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

3999

min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

4005

cgroup = memcg->css.cgroup;

4000

cgroup = memcg->css.cgroup;

4006

if (!memcg->use_hierarchy)

4001

if (!memcg->use_hierarchy)

4007

goto out;

4002

goto out;

4008

4003

4009

while (cgroup->parent) {

4004

while (cgroup->parent) {

4010

cgroup = cgroup->parent;

4005

cgroup = cgroup->parent;

4011

memcg = mem_cgroup_from_cont(cgroup);

4006

memcg = mem_cgroup_from_cont(cgroup);

4012

if (!memcg->use_hierarchy)

4007

if (!memcg->use_hierarchy)

4013

break;

4008

break;

4014

tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);

4009

tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);

4015

min_limit = min(min_limit, tmp);

4010

min_limit = min(min_limit, tmp);

4016

tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

4011

tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

4017

min_memsw_limit = min(min_memsw_limit, tmp);

4012

min_memsw_limit = min(min_memsw_limit, tmp);

4018

}

4013

}

4019

out:

4014

out:

4020

*mem_limit = min_limit;

4015

*mem_limit = min_limit;

4021

*memsw_limit = min_memsw_limit;

4016

*memsw_limit = min_memsw_limit;

4022

}

4017

}

4023

4018

4024

static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)

4019

static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)

4025

{

4020

{

4026

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

4021

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

4027

int type, name;

4022

int type, name;

4028

4023

4029

type = MEMFILE_TYPE(event);

4024

type = MEMFILE_TYPE(event);

4030

name = MEMFILE_ATTR(event);

4025

name = MEMFILE_ATTR(event);

4031

4026

4032

if (!do_swap_account && type == _MEMSWAP)

4027

if (!do_swap_account && type == _MEMSWAP)

4033

return -EOPNOTSUPP;

4028

return -EOPNOTSUPP;

4034

4029

4035

switch (name) {

4030

switch (name) {

4036

case RES_MAX_USAGE:

4031

case RES_MAX_USAGE:

4037

if (type == _MEM)

4032

if (type == _MEM)

4038

res_counter_reset_max(&memcg->res);

4033

res_counter_reset_max(&memcg->res);

4039

else

4034

else

4040

res_counter_reset_max(&memcg->memsw);

4035

res_counter_reset_max(&memcg->memsw);

4041

break;

4036

break;

4042

case RES_FAILCNT:

4037

case RES_FAILCNT:

4043

if (type == _MEM)

4038

if (type == _MEM)

4044

res_counter_reset_failcnt(&memcg->res);

4039

res_counter_reset_failcnt(&memcg->res);

4045

else

4040

else

4046

res_counter_reset_failcnt(&memcg->memsw);

4041

res_counter_reset_failcnt(&memcg->memsw);

4047

break;

4042

break;

4048

}

4043

}

4049

4044

4050

return 0;

4045

return 0;

4051

}

4046

}

4052

4047

4053

static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,

4048

static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,

4054

struct cftype *cft)

4049

struct cftype *cft)

4055

{

4050

{

4056

return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;

4051

return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;

4057

}

4052

}

4058

4053

4059

#ifdef CONFIG_MMU

4054

#ifdef CONFIG_MMU

4060

static int mem_cgroup_move_charge_write(struct cgroup *cgrp,

4055

static int mem_cgroup_move_charge_write(struct cgroup *cgrp,

4061

struct cftype *cft, u64 val)

4056

struct cftype *cft, u64 val)

4062

{

4057

{

4063

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4058

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4064

4059

4065

if (val >= (1 << NR_MOVE_TYPE))

4060

if (val >= (1 << NR_MOVE_TYPE))

4066

return -EINVAL;

4061

return -EINVAL;

4067

/*

4062

/*

4068

* We check this value several times in both in can_attach() and

4063

* We check this value several times in both in can_attach() and

4069

* attach(), so we need cgroup lock to prevent this value from being

4064

* attach(), so we need cgroup lock to prevent this value from being

4070

* inconsistent.

4065

* inconsistent.

4071

*/

4066

*/

4072

cgroup_lock();

4067

cgroup_lock();

4073

memcg->move_charge_at_immigrate = val;

4068

memcg->move_charge_at_immigrate = val;

4074

cgroup_unlock();

4069

cgroup_unlock();

4075

4070

4076

return 0;

4071

return 0;

4077

}

4072

}

4078

#else

4073

#else

4079

static int mem_cgroup_move_charge_write(struct cgroup *cgrp,

4074

static int mem_cgroup_move_charge_write(struct cgroup *cgrp,

4080

struct cftype *cft, u64 val)

4075

struct cftype *cft, u64 val)

4081

{

4076

{

4082

return -ENOSYS;

4077

return -ENOSYS;

4083

}

4078

}

4084

#endif

4079

#endif

4085

4080

4086

#ifdef CONFIG_NUMA
/*
 * Print " N<nid>=<pages>" for every node in the N_HIGH_MEMORY state, using
 * the LRU lists selected by @lru_mask, and then terminate the line.
 */
static void memcg_numa_stat_nodes(struct seq_file *m, struct mem_cgroup *memcg,
				  unsigned int lru_mask)
{
	unsigned long node_nr;
	int nid;

	for_each_node_state(nid, N_HIGH_MEMORY) {
		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
		seq_printf(m, " N%d=%lu", nid, node_nr);
	}
	seq_putc(m, '\n');
}

/*
 * Show per-node LRU page counts for the memory cgroup behind @cont.
 *
 * Four lines are written to @m -- "total", "file", "anon" and
 * "unevictable" -- each starting with the cgroup-wide count and followed by
 * one " N<nid>=<count>" field per node.  Always returns 0.
 */
static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
				struct seq_file *m)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
	unsigned long nr;

	nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
	seq_printf(m, "total=%lu", nr);
	memcg_numa_stat_nodes(m, memcg, LRU_ALL);

	nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
	seq_printf(m, "file=%lu", nr);
	memcg_numa_stat_nodes(m, memcg, LRU_ALL_FILE);

	nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
	seq_printf(m, "anon=%lu", nr);
	memcg_numa_stat_nodes(m, memcg, LRU_ALL_ANON);

	nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
	seq_printf(m, "unevictable=%lu", nr);
	memcg_numa_stat_nodes(m, memcg, BIT(LRU_UNEVICTABLE));

	return 0;
}
#endif /* CONFIG_NUMA */

4132

4127

4133

/*
 * Names printed for each LRU list by the stat file.  The table is indexed by
 * LRU list number (0 .. NR_LRU_LISTS - 1), so the order of the entries
 * matters.
 */
static const char * const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

4140

4135

4141

static inline void mem_cgroup_lru_names_not_uptodate(void)

4136

static inline void mem_cgroup_lru_names_not_uptodate(void)

4142

{

4137

{

4143

BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);

4138

BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);

4144

}

4139

}

4145

4140

4146

static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,

4141

static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,

4147

struct seq_file *m)

4142

struct seq_file *m)

4148

{

4143

{

4149

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

4144

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

4150

struct mem_cgroup *mi;

4145

struct mem_cgroup *mi;

4151

unsigned int i;

4146

unsigned int i;

4152

4147

4153

for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {

4148

for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {

4154

if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)

4149

if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)

4155

continue;

4150

continue;

4156

seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],

4151

seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],

4157

mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);

4152

mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);

4158

}

4153

}

4159

4154

4160

for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)

4155

for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)

4161

seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],

4156

seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],

4162

mem_cgroup_read_events(memcg, i));

4157

mem_cgroup_read_events(memcg, i));

4163

4158

4164

for (i = 0; i < NR_LRU_LISTS; i++)

4159

for (i = 0; i < NR_LRU_LISTS; i++)

4165

seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],

4160

seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],

4166

mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);

4161

mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);

4167

4162

4168

/* Hierarchical information */

4163

/* Hierarchical information */

4169

{

4164

{

4170

unsigned long long limit, memsw_limit;

4165

unsigned long long limit, memsw_limit;

4171

memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);

4166

memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);

4172

seq_printf(m, "hierarchical_memory_limit %llu\n", limit);

4167

seq_printf(m, "hierarchical_memory_limit %llu\n", limit);

4173

if (do_swap_account)

4168

if (do_swap_account)

4174

seq_printf(m, "hierarchical_memsw_limit %llu\n",

4169

seq_printf(m, "hierarchical_memsw_limit %llu\n",

4175

memsw_limit);

4170

memsw_limit);

4176

}

4171

}

4177

4172

4178

for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {

4173

for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {

4179

long long val = 0;

4174

long long val = 0;

4180

4175

4181

if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)

4176

if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)

4182

continue;

4177

continue;

4183

for_each_mem_cgroup_tree(mi, memcg)

4178

for_each_mem_cgroup_tree(mi, memcg)

4184

val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;

4179

val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;

4185

seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);

4180

seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);

4186

}

4181

}

4187

4182

4188

for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {

4183

for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {

4189

unsigned long long val = 0;

4184

unsigned long long val = 0;

4190

4185

4191

for_each_mem_cgroup_tree(mi, memcg)

4186

for_each_mem_cgroup_tree(mi, memcg)

4192

val += mem_cgroup_read_events(mi, i);

4187

val += mem_cgroup_read_events(mi, i);

4193

seq_printf(m, "total_%s %llu\n",

4188

seq_printf(m, "total_%s %llu\n",

4194

mem_cgroup_events_names[i], val);

4189

mem_cgroup_events_names[i], val);

4195

}

4190

}

4196

4191

4197

for (i = 0; i < NR_LRU_LISTS; i++) {

4192

for (i = 0; i < NR_LRU_LISTS; i++) {

4198

unsigned long long val = 0;

4193

unsigned long long val = 0;

4199

4194

4200

for_each_mem_cgroup_tree(mi, memcg)

4195

for_each_mem_cgroup_tree(mi, memcg)

4201

val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;

4196

val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;

4202

seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);

4197

seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);

4203

}

4198

}

4204

4199

4205

#ifdef CONFIG_DEBUG_VM

4200

#ifdef CONFIG_DEBUG_VM

4206

{

4201

{

4207

int nid, zid;

4202

int nid, zid;

4208

struct mem_cgroup_per_zone *mz;

4203

struct mem_cgroup_per_zone *mz;

4209

struct zone_reclaim_stat *rstat;

4204

struct zone_reclaim_stat *rstat;

4210

unsigned long recent_rotated[2] = {0, 0};

4205

unsigned long recent_rotated[2] = {0, 0};

4211

unsigned long recent_scanned[2] = {0, 0};

4206

unsigned long recent_scanned[2] = {0, 0};

4212

4207

4213

for_each_online_node(nid)

4208

for_each_online_node(nid)

4214

for (zid = 0; zid < MAX_NR_ZONES; zid++) {

4209

for (zid = 0; zid < MAX_NR_ZONES; zid++) {

4215

mz = mem_cgroup_zoneinfo(memcg, nid, zid);

4210

mz = mem_cgroup_zoneinfo(memcg, nid, zid);

4216

rstat = &mz->lruvec.reclaim_stat;

4211

rstat = &mz->lruvec.reclaim_stat;

4217

4212

4218

recent_rotated[0] += rstat->recent_rotated[0];

4213

recent_rotated[0] += rstat->recent_rotated[0];

4219

recent_rotated[1] += rstat->recent_rotated[1];

4214

recent_rotated[1] += rstat->recent_rotated[1];

4220

recent_scanned[0] += rstat->recent_scanned[0];

4215

recent_scanned[0] += rstat->recent_scanned[0];

4221

recent_scanned[1] += rstat->recent_scanned[1];

4216

recent_scanned[1] += rstat->recent_scanned[1];

4222

}

4217

}

4223

seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);

4218

seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);

4224

seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);

4219

seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);

4225

seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);

4220

seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);

4226

seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);

4221

seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);

4227

}

4222

}

4228

#endif

4223

#endif

4229

4224

4230

return 0;

4225

return 0;

4231

}

4226

}

4232

4227

4233

/*
 * Read handler: return whatever mem_cgroup_swappiness() reports for the
 * memory cgroup behind @cgrp.  @cft is not used.
 */
static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
{
	return mem_cgroup_swappiness(mem_cgroup_from_cont(cgrp));
}

4239

4234

4240

static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,

4235

static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,

4241

u64 val)

4236

u64 val)

4242

{

4237

{

4243

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4238

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4244

struct mem_cgroup *parent;

4239

struct mem_cgroup *parent;

4245

4240

4246

if (val > 100)

4241

if (val > 100)

4247

return -EINVAL;

4242

return -EINVAL;

4248

4243

4249

if (cgrp->parent == NULL)

4244

if (cgrp->parent == NULL)

4250

return -EINVAL;

4245

return -EINVAL;

4251

4246

4252

parent = mem_cgroup_from_cont(cgrp->parent);

4247

parent = mem_cgroup_from_cont(cgrp->parent);

4253

4248

4254

cgroup_lock();

4249

cgroup_lock();

4255

4250

4256

/* If under hierarchy, only empty-root can set this value */

4251

/* If under hierarchy, only empty-root can set this value */

4257

if ((parent->use_hierarchy) ||

4252

if ((parent->use_hierarchy) ||

4258

(memcg->use_hierarchy && !list_empty(&cgrp->children))) {

4253

(memcg->use_hierarchy && !list_empty(&cgrp->children))) {

4259

cgroup_unlock();

4254

cgroup_unlock();

4260

return -EINVAL;

4255

return -EINVAL;

4261

}

4256

}

4262

4257

4263

memcg->swappiness = val;

4258

memcg->swappiness = val;

4264

4259

4265

cgroup_unlock();

4260

cgroup_unlock();

4266

4261

4267

return 0;

4262

return 0;

4268

}

4263

}

4269

4264

4270

/*
 * Signal the eventfd of every threshold that the current usage of @memcg has
 * crossed since the previous call, then record the new position.
 *
 * @swap selects the memsw threshold array (and memsw usage) instead of the
 * plain memory one.  The primary array is looked up under rcu_read_lock();
 * nothing happens when no array is installed.
 */
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
	struct mem_cgroup_threshold_ary *t;
	u64 usage;
	int i;

	rcu_read_lock();

	if (swap)
		t = rcu_dereference(memcg->memsw_thresholds.primary);
	else
		t = rcu_dereference(memcg->thresholds.primary);

	if (t) {
		usage = mem_cgroup_usage(memcg, swap);

		/*
		 * current_threshold is expected to index the threshold just
		 * below or equal to usage.  When that no longer holds, one or
		 * more thresholds were crossed since the last call.
		 */
		i = t->current_threshold;

		/*
		 * Walk down from current_threshold, signalling every entry
		 * that now lies above usage.  If nothing was crossed in this
		 * direction only a single element is inspected.
		 */
		while (i >= 0 && unlikely(t->entries[i].threshold > usage)) {
			eventfd_signal(t->entries[i].eventfd, 1);
			i--;
		}

		/* Step to the first entry not examined by the walk above. */
		i++;

		/*
		 * Walk up, signalling every entry that usage has reached.
		 * Again only one element is inspected when nothing was
		 * crossed.
		 */
		while (i < t->size &&
		       unlikely(t->entries[i].threshold <= usage)) {
			eventfd_signal(t->entries[i].eventfd, 1);
			i++;
		}

		/* Remember the last entry at or below usage. */
		t->current_threshold = i - 1;
	}

	rcu_read_unlock();
}

4320

4315

4321

static void mem_cgroup_threshold(struct mem_cgroup *memcg)

4316

static void mem_cgroup_threshold(struct mem_cgroup *memcg)

4322

{

4317

{

4323

while (memcg) {

4318

while (memcg) {

4324

__mem_cgroup_threshold(memcg, false);

4319

__mem_cgroup_threshold(memcg, false);

4325

if (do_swap_account)

4320

if (do_swap_account)

4326

__mem_cgroup_threshold(memcg, true);

4321

__mem_cgroup_threshold(memcg, true);

4327

4322

4328

memcg = parent_mem_cgroup(memcg);

4323

memcg = parent_mem_cgroup(memcg);

4329

}

4324

}

4330

}

4325

}

4331

4326

4332

static int compare_thresholds(const void *a, const void *b)

4327

static int compare_thresholds(const void *a, const void *b)

4333

{

4328

{

4334

const struct mem_cgroup_threshold *_a = a;

4329

const struct mem_cgroup_threshold *_a = a;

4335

const struct mem_cgroup_threshold *_b = b;

4330

const struct mem_cgroup_threshold *_b = b;

4336

4331

4337

return _a->threshold - _b->threshold;

4332

return _a->threshold - _b->threshold;

4338

}

4333

}

4339

4334

4340

static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)

4335

static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)

4341

{

4336

{

4342

struct mem_cgroup_eventfd_list *ev;

4337

struct mem_cgroup_eventfd_list *ev;

4343

4338

4344

list_for_each_entry(ev, &memcg->oom_notify, list)

4339

list_for_each_entry(ev, &memcg->oom_notify, list)

4345

eventfd_signal(ev->eventfd, 1);

4340

eventfd_signal(ev->eventfd, 1);

4346

return 0;

4341

return 0;

4347

}

4342

}

4348

4343

4349

static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)

4344

static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)

4350

{

4345

{

4351

struct mem_cgroup *iter;

4346

struct mem_cgroup *iter;

4352

4347

4353

for_each_mem_cgroup_tree(iter, memcg)

4348

for_each_mem_cgroup_tree(iter, memcg)

4354

mem_cgroup_oom_notify_cb(iter);

4349

mem_cgroup_oom_notify_cb(iter);

4355

}

4350

}

4356

4351

4357

static int mem_cgroup_usage_register_event(struct cgroup *cgrp,

4352

static int mem_cgroup_usage_register_event(struct cgroup *cgrp,

4358

struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)

4353

struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)

4359

{

4354

{

4360

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4355

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4361

struct mem_cgroup_thresholds *thresholds;

4356

struct mem_cgroup_thresholds *thresholds;

4362

struct mem_cgroup_threshold_ary *new;

4357

struct mem_cgroup_threshold_ary *new;

4363

int type = MEMFILE_TYPE(cft->private);

4358

int type = MEMFILE_TYPE(cft->private);

4364

u64 threshold, usage;

4359

u64 threshold, usage;

4365

int i, size, ret;

4360

int i, size, ret;

4366

4361

4367

ret = res_counter_memparse_write_strategy(args, &threshold);

4362

ret = res_counter_memparse_write_strategy(args, &threshold);

4368

if (ret)

4363

if (ret)

4369

return ret;

4364

return ret;

4370

4365

4371

mutex_lock(&memcg->thresholds_lock);

4366

mutex_lock(&memcg->thresholds_lock);

4372

4367

4373

if (type == _MEM)

4368

if (type == _MEM)

4374

thresholds = &memcg->thresholds;

4369

thresholds = &memcg->thresholds;

4375

else if (type == _MEMSWAP)

4370

else if (type == _MEMSWAP)

4376

thresholds = &memcg->memsw_thresholds;

4371

thresholds = &memcg->memsw_thresholds;

4377

else

4372

else

4378

BUG();

4373

BUG();

4379

4374

4380

usage = mem_cgroup_usage(memcg, type == _MEMSWAP);

4375

usage = mem_cgroup_usage(memcg, type == _MEMSWAP);

4381

4376

4382

/* Check if a threshold crossed before adding a new one */

4377

/* Check if a threshold crossed before adding a new one */

4383

if (thresholds->primary)

4378

if (thresholds->primary)

4384

__mem_cgroup_threshold(memcg, type == _MEMSWAP);

4379

__mem_cgroup_threshold(memcg, type == _MEMSWAP);

4385

4380

4386

size = thresholds->primary ? thresholds->primary->size + 1 : 1;

4381

size = thresholds->primary ? thresholds->primary->size + 1 : 1;

4387

4382

4388

/* Allocate memory for new array of thresholds */

4383

/* Allocate memory for new array of thresholds */

4389

new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),

4384

new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),

4390

GFP_KERNEL);

4385

GFP_KERNEL);

4391

if (!new) {

4386

if (!new) {

4392

ret = -ENOMEM;

4387

ret = -ENOMEM;

4393

goto unlock;

4388

goto unlock;

4394

}

4389

}

4395

new->size = size;

4390

new->size = size;

4396

4391

4397

/* Copy thresholds (if any) to new array */

4392

/* Copy thresholds (if any) to new array */

4398

if (thresholds->primary) {

4393

if (thresholds->primary) {

4399

memcpy(new->entries, thresholds->primary->entries, (size - 1) *

4394

memcpy(new->entries, thresholds->primary->entries, (size - 1) *

4400

sizeof(struct mem_cgroup_threshold));

4395

sizeof(struct mem_cgroup_threshold));

4401

}

4396

}

4402

4397

4403

/* Add new threshold */

4398

/* Add new threshold */

4404

new->entries[size - 1].eventfd = eventfd;

4399

new->entries[size - 1].eventfd = eventfd;

4405

new->entries[size - 1].threshold = threshold;

4400

new->entries[size - 1].threshold = threshold;

4406

4401

4407

/* Sort thresholds. Registering of new threshold isn't time-critical */

4402

/* Sort thresholds. Registering of new threshold isn't time-critical */

4408

sort(new->entries, size, sizeof(struct mem_cgroup_threshold),

4403

sort(new->entries, size, sizeof(struct mem_cgroup_threshold),

4409

compare_thresholds, NULL);

4404

compare_thresholds, NULL);

4410

4405

4411

/* Find current threshold */

4406

/* Find current threshold */

4412

new->current_threshold = -1;

4407

new->current_threshold = -1;

4413

for (i = 0; i < size; i++) {

4408

for (i = 0; i < size; i++) {

4414

if (new->entries[i].threshold <= usage) {

4409

if (new->entries[i].threshold <= usage) {

4415

/*

4410

/*

4416

* new->current_threshold will not be used until

4411

* new->current_threshold will not be used until

4417

* rcu_assign_pointer(), so it's safe to increment

4412

* rcu_assign_pointer(), so it's safe to increment

4418

* it here.

4413

* it here.

4419

*/

4414

*/

4420

++new->current_threshold;

4415

++new->current_threshold;

4421

} else

4416

} else

4422

break;

4417

break;

4423

}

4418

}

4424

4419

4425

/* Free old spare buffer and save old primary buffer as spare */

4420

/* Free old spare buffer and save old primary buffer as spare */

4426

kfree(thresholds->spare);

4421

kfree(thresholds->spare);

4427

thresholds->spare = thresholds->primary;

4422

thresholds->spare = thresholds->primary;

4428

4423

4429

rcu_assign_pointer(thresholds->primary, new);

4424

rcu_assign_pointer(thresholds->primary, new);

4430

4425

4431

/* To be sure that nobody uses thresholds */

4426

/* To be sure that nobody uses thresholds */

4432

synchronize_rcu();

4427

synchronize_rcu();

4433

4428

4434

unlock:

4429

unlock:

4435

mutex_unlock(&memcg->thresholds_lock);

4430

mutex_unlock(&memcg->thresholds_lock);

4436

4431

4437

return ret;

4432

return ret;

4438

}

4433

}

4439

4434

4440

static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,

4435

static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,

4441

struct cftype *cft, struct eventfd_ctx *eventfd)

4436

struct cftype *cft, struct eventfd_ctx *eventfd)

4442

{

4437

{

4443

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4438

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4444

struct mem_cgroup_thresholds *thresholds;

4439

struct mem_cgroup_thresholds *thresholds;

4445

struct mem_cgroup_threshold_ary *new;

4440

struct mem_cgroup_threshold_ary *new;

4446

int type = MEMFILE_TYPE(cft->private);

4441

int type = MEMFILE_TYPE(cft->private);

4447

u64 usage;

4442

u64 usage;

4448

int i, j, size;

4443

int i, j, size;

4449

4444

4450

mutex_lock(&memcg->thresholds_lock);

4445

mutex_lock(&memcg->thresholds_lock);

4451

if (type == _MEM)

4446

if (type == _MEM)

4452

thresholds = &memcg->thresholds;

4447

thresholds = &memcg->thresholds;

4453

else if (type == _MEMSWAP)

4448

else if (type == _MEMSWAP)

4454

thresholds = &memcg->memsw_thresholds;

4449

thresholds = &memcg->memsw_thresholds;

4455

else

4450

else

4456

BUG();

4451

BUG();

4457

4452

4458

if (!thresholds->primary)

4453

if (!thresholds->primary)

4459

goto unlock;

4454

goto unlock;

4460

4455

4461

usage = mem_cgroup_usage(memcg, type == _MEMSWAP);

4456

usage = mem_cgroup_usage(memcg, type == _MEMSWAP);

4462

4457

4463

/* Check if a threshold crossed before removing */

4458

/* Check if a threshold crossed before removing */

4464

__mem_cgroup_threshold(memcg, type == _MEMSWAP);

4459

__mem_cgroup_threshold(memcg, type == _MEMSWAP);

4465

4460

4466

/* Calculate new number of threshold */

4461

/* Calculate new number of threshold */

4467

size = 0;

4462

size = 0;

4468

for (i = 0; i < thresholds->primary->size; i++) {

4463

for (i = 0; i < thresholds->primary->size; i++) {

4469

if (thresholds->primary->entries[i].eventfd != eventfd)

4464

if (thresholds->primary->entries[i].eventfd != eventfd)

4470

size++;

4465

size++;

4471

}

4466

}

4472

4467

4473

new = thresholds->spare;

4468

new = thresholds->spare;

4474

4469

4475

/* Set thresholds array to NULL if we don't have thresholds */

4470

/* Set thresholds array to NULL if we don't have thresholds */

4476

if (!size) {

4471

if (!size) {

4477

kfree(new);

4472

kfree(new);

4478

new = NULL;

4473

new = NULL;

4479

goto swap_buffers;

4474

goto swap_buffers;

4480

}

4475

}

4481

4476

4482

new->size = size;

4477

new->size = size;

4483

4478

4484

/* Copy thresholds and find current threshold */

4479

/* Copy thresholds and find current threshold */

4485

new->current_threshold = -1;

4480

new->current_threshold = -1;

4486

for (i = 0, j = 0; i < thresholds->primary->size; i++) {

4481

for (i = 0, j = 0; i < thresholds->primary->size; i++) {

4487

if (thresholds->primary->entries[i].eventfd == eventfd)

4482

if (thresholds->primary->entries[i].eventfd == eventfd)

4488

continue;

4483

continue;

4489

4484

4490

new->entries[j] = thresholds->primary->entries[i];

4485

new->entries[j] = thresholds->primary->entries[i];

4491

if (new->entries[j].threshold <= usage) {

4486

if (new->entries[j].threshold <= usage) {

4492

/*

4487

/*

4493

* new->current_threshold will not be used

4488

* new->current_threshold will not be used

4494

* until rcu_assign_pointer(), so it's safe to increment

4489

* until rcu_assign_pointer(), so it's safe to increment

4495

* it here.

4490

* it here.

4496

*/

4491

*/

4497

++new->current_threshold;

4492

++new->current_threshold;

4498

}

4493

}

4499

j++;

4494

j++;

4500

}

4495

}

4501

4496

4502

swap_buffers:

4497

swap_buffers:

4503

/* Swap primary and spare array */

4498

/* Swap primary and spare array */

4504

thresholds->spare = thresholds->primary;

4499

thresholds->spare = thresholds->primary;

4505

/* If all events are unregistered, free the spare array */

4500

/* If all events are unregistered, free the spare array */

4506

if (!new) {

4501

if (!new) {

4507

kfree(thresholds->spare);

4502

kfree(thresholds->spare);

4508

thresholds->spare = NULL;

4503

thresholds->spare = NULL;

4509

}

4504

}

4510

4505

4511

rcu_assign_pointer(thresholds->primary, new);

4506

rcu_assign_pointer(thresholds->primary, new);

4512

4507

4513

/* To be sure that nobody uses thresholds */

4508

/* To be sure that nobody uses thresholds */

4514

synchronize_rcu();

4509

synchronize_rcu();

4515

unlock:

4510

unlock:

4516

mutex_unlock(&memcg->thresholds_lock);

4511

mutex_unlock(&memcg->thresholds_lock);

4517

}

4512

}

4518

4513

4519

static int mem_cgroup_oom_register_event(struct cgroup *cgrp,

4514

static int mem_cgroup_oom_register_event(struct cgroup *cgrp,

4520

struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)

4515

struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)

4521

{

4516

{

4522

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4517

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4523

struct mem_cgroup_eventfd_list *event;

4518

struct mem_cgroup_eventfd_list *event;

4524

int type = MEMFILE_TYPE(cft->private);

4519

int type = MEMFILE_TYPE(cft->private);

4525

4520

4526

BUG_ON(type != _OOM_TYPE);

4521

BUG_ON(type != _OOM_TYPE);

4527

event = kmalloc(sizeof(*event), GFP_KERNEL);

4522

event = kmalloc(sizeof(*event), GFP_KERNEL);

4528

if (!event)

4523

if (!event)

4529

return -ENOMEM;

4524

return -ENOMEM;

4530

4525

4531

spin_lock(&memcg_oom_lock);

4526

spin_lock(&memcg_oom_lock);

4532

4527

4533

event->eventfd = eventfd;

4528

event->eventfd = eventfd;

4534

list_add(&event->list, &memcg->oom_notify);

4529

list_add(&event->list, &memcg->oom_notify);

4535

4530

4536

/* already in OOM ? */

4531

/* already in OOM ? */

4537

if (atomic_read(&memcg->under_oom))

4532

if (atomic_read(&memcg->under_oom))

4538

eventfd_signal(eventfd, 1);

4533

eventfd_signal(eventfd, 1);

4539

spin_unlock(&memcg_oom_lock);

4534

spin_unlock(&memcg_oom_lock);

4540

4535

4541

return 0;

4536

return 0;

4542

}

4537

}

4543

4538

4544

static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,

4539

static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,

4545

struct cftype *cft, struct eventfd_ctx *eventfd)

4540

struct cftype *cft, struct eventfd_ctx *eventfd)

4546

{

4541

{

4547

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4542

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4548

struct mem_cgroup_eventfd_list *ev, *tmp;

4543

struct mem_cgroup_eventfd_list *ev, *tmp;

4549

int type = MEMFILE_TYPE(cft->private);

4544

int type = MEMFILE_TYPE(cft->private);

4550

4545

4551

BUG_ON(type != _OOM_TYPE);

4546

BUG_ON(type != _OOM_TYPE);

4552

4547

4553

spin_lock(&memcg_oom_lock);

4548

spin_lock(&memcg_oom_lock);

4554

4549

4555

list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {

4550

list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {

4556

if (ev->eventfd == eventfd) {

4551

if (ev->eventfd == eventfd) {

4557

list_del(&ev->list);

4552

list_del(&ev->list);

4558

kfree(ev);

4553

kfree(ev);

4559

}

4554

}

4560

}

4555

}

4561

4556

4562

spin_unlock(&memcg_oom_lock);

4557

spin_unlock(&memcg_oom_lock);

4563

}

4558

}

4564

4559

4565

static int mem_cgroup_oom_control_read(struct cgroup *cgrp,

4560

static int mem_cgroup_oom_control_read(struct cgroup *cgrp,

4566

struct cftype *cft, struct cgroup_map_cb *cb)

4561

struct cftype *cft, struct cgroup_map_cb *cb)

4567

{

4562

{

4568

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4563

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

4569

4564

4570

cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);

4565

cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);

4571

4566

4572

if (atomic_read(&memcg->under_oom))

4567

if (atomic_read(&memcg->under_oom))

4573

cb->fill(cb, "under_oom", 1);

4568

cb->fill(cb, "under_oom", 1);

4574

else

4569

else

4575

cb->fill(cb, "under_oom", 0);

4570

cb->fill(cb, "under_oom", 0);

4576

return 0;

4571

return 0;

4577

}

4572

}

4578

4573

4579

/*
 * Set or clear oom_kill_disable on @cgrp's memcg.
 *
 * Returns -EINVAL when @cgrp is the root cgroup, when @val is neither 0
 * nor 1, or when the setting would have to differ inside a hierarchy
 * (see below).  Returns 0 on success.
 */
static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
	struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup *parent;
	int ret = -EINVAL;

	/* cannot set to root cgroup and only 0 and 1 are allowed */
	if (!cgrp->parent || val > 1)
		return -EINVAL;

	parent = mem_cgroup_from_cont(cgrp->parent);

	cgroup_lock();

	/*
	 * oom-kill-disable is a flag for subhierarchy: refuse the write if
	 * the parent uses hierarchy, or if this group uses hierarchy and
	 * already has children.
	 */
	if (parent->use_hierarchy)
		goto unlock;
	if (memcg->use_hierarchy && !list_empty(&cgrp->children))
		goto unlock;

	memcg->oom_kill_disable = val;
	if (!val)
		memcg_oom_recover(memcg);
	ret = 0;

unlock:
	cgroup_unlock();
	return ret;
}

4604

4599

4605

#ifdef CONFIG_MEMCG_KMEM
/*
 * Kernel-memory setup for a new memcg: delegates to
 * mem_cgroup_sockets_init() and returns its result.
 */
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
	return mem_cgroup_sockets_init(memcg, ss);
}

/* Kernel-memory teardown counterpart of memcg_init_kmem(). */
static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
{
	mem_cgroup_sockets_destroy(memcg);
}
#else
/* Stub used without CONFIG_MEMCG_KMEM: nothing to set up, always succeeds. */
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
	return 0;
}

/* Stub used without CONFIG_MEMCG_KMEM: nothing to tear down. */
static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
{
}
#endif

4625

4620

4626

static struct cftype mem_cgroup_files[] = {

4621

static struct cftype mem_cgroup_files[] = {

4627

{

4622

{

4628

.name = "usage_in_bytes",

4623

.name = "usage_in_bytes",

4629

.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),

4624

.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),

4630

.read = mem_cgroup_read,

4625

.read = mem_cgroup_read,

4631

.register_event = mem_cgroup_usage_register_event,

4626

.register_event = mem_cgroup_usage_register_event,

4632

.unregister_event = mem_cgroup_usage_unregister_event,

4627

.unregister_event = mem_cgroup_usage_unregister_event,

4633

},

4628

},

4634

{

4629

{

4635

.name = "max_usage_in_bytes",

4630

.name = "max_usage_in_bytes",

4636

.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),

4631

.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),

4637

.trigger = mem_cgroup_reset,

4632

.trigger = mem_cgroup_reset,

4638

.read = mem_cgroup_read,

4633

.read = mem_cgroup_read,

4639

},

4634

},

4640

{

4635

{

4641

.name = "limit_in_bytes",

4636

.name = "limit_in_bytes",

4642

.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),

4637

.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),

4643

.write_string = mem_cgroup_write,

4638

.write_string = mem_cgroup_write,

4644

.read = mem_cgroup_read,

4639

.read = mem_cgroup_read,

4645

},

4640

},

4646

{

4641

{

4647

.name = "soft_limit_in_bytes",

4642

.name = "soft_limit_in_bytes",

4648

.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),

4643

.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),

4649

.write_string = mem_cgroup_write,

4644

.write_string = mem_cgroup_write,

4650

.read = mem_cgroup_read,

4645

.read = mem_cgroup_read,

4651

},

4646

},

4652

{

4647

{

4653

.name = "failcnt",

4648

.name = "failcnt",

4654

.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),

4649

.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),

4655

.trigger = mem_cgroup_reset,

4650

.trigger = mem_cgroup_reset,

4656

.read = mem_cgroup_read,

4651

.read = mem_cgroup_read,

4657

},

4652

},

4658

{

4653

{

4659

.name = "stat",

4654

.name = "stat",

4660

.read_seq_string = memcg_stat_show,

4655

.read_seq_string = memcg_stat_show,

4661

},

4656

},

4662

{

4657

{

4663

.name = "force_empty",

4658

.name = "force_empty",

4664

.trigger = mem_cgroup_force_empty_write,

4659

.trigger = mem_cgroup_force_empty_write,

4665

},

4660

},

4666

{

4661

{

4667

.name = "use_hierarchy",

4662

.name = "use_hierarchy",

4668

.write_u64 = mem_cgroup_hierarchy_write,

4663

.write_u64 = mem_cgroup_hierarchy_write,

4669

.read_u64 = mem_cgroup_hierarchy_read,

4664

.read_u64 = mem_cgroup_hierarchy_read,

4670

},

4665

},

4671

{

4666

{

4672

.name = "swappiness",

4667

.name = "swappiness",

4673

.read_u64 = mem_cgroup_swappiness_read,

4668

.read_u64 = mem_cgroup_swappiness_read,

4674

.write_u64 = mem_cgroup_swappiness_write,

4669

.write_u64 = mem_cgroup_swappiness_write,

4675

},

4670

},

4676

{

4671

{

4677

.name = "move_charge_at_immigrate",

4672

.name = "move_charge_at_immigrate",

4678

.read_u64 = mem_cgroup_move_charge_read,

4673

.read_u64 = mem_cgroup_move_charge_read,

4679

.write_u64 = mem_cgroup_move_charge_write,

4674

.write_u64 = mem_cgroup_move_charge_write,

4680

},

4675

},

4681

{

4676

{

4682

.name = "oom_control",

4677

.name = "oom_control",

4683

.read_map = mem_cgroup_oom_control_read,

4678

.read_map = mem_cgroup_oom_control_read,

4684

.write_u64 = mem_cgroup_oom_control_write,

4679

.write_u64 = mem_cgroup_oom_control_write,

4685

.register_event = mem_cgroup_oom_register_event,

4680

.register_event = mem_cgroup_oom_register_event,

4686

.unregister_event = mem_cgroup_oom_unregister_event,

4681

.unregister_event = mem_cgroup_oom_unregister_event,

4687

.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),

4682

.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),

4688

},

4683

},

4689

#ifdef CONFIG_NUMA

4684

#ifdef CONFIG_NUMA

4690

{

4685

{

4691

.name = "numa_stat",

4686

.name = "numa_stat",

4692

.read_seq_string = memcg_numa_stat_show,

4687

.read_seq_string = memcg_numa_stat_show,

4693

},

4688

},

4694

#endif

4689

#endif

4695

#ifdef CONFIG_MEMCG_SWAP

4690

#ifdef CONFIG_MEMCG_SWAP

4696

{

4691

{

4697

.name = "memsw.usage_in_bytes",

4692

.name = "memsw.usage_in_bytes",

4698

.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),

4693

.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),

4699

.read = mem_cgroup_read,

4694

.read = mem_cgroup_read,

4700

.register_event = mem_cgroup_usage_register_event,

4695

.register_event = mem_cgroup_usage_register_event,

4701

.unregister_event = mem_cgroup_usage_unregister_event,

4696

.unregister_event = mem_cgroup_usage_unregister_event,

4702

},

4697

},

4703

{

4698

{

4704

.name = "memsw.max_usage_in_bytes",

4699

.name = "memsw.max_usage_in_bytes",

4705

.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),

4700

.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),

4706

.trigger = mem_cgroup_reset,

4701

.trigger = mem_cgroup_reset,

4707

.read = mem_cgroup_read,

4702

.read = mem_cgroup_read,

4708

},

4703

},

4709

{

4704

{

4710

.name = "memsw.limit_in_bytes",

4705

.name = "memsw.limit_in_bytes",

4711

.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),

4706

.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),

4712

.write_string = mem_cgroup_write,

4707

.write_string = mem_cgroup_write,

4713

.read = mem_cgroup_read,

4708

.read = mem_cgroup_read,

4714

},

4709

},

4715

{

4710

{

4716

.name = "memsw.failcnt",

4711

.name = "memsw.failcnt",

4717

.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),

4712

.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),

4718

.trigger = mem_cgroup_reset,

4713

.trigger = mem_cgroup_reset,

4719

.read = mem_cgroup_read,

4714

.read = mem_cgroup_read,

4720

},

4715

},

4721

#endif

4716

#endif

4722

{ }, /* terminate */

4717

{ }, /* terminate */

4723

};

4718

};

4724

4719

4725

static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)

4720

static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)

4726

{

4721

{

4727

struct mem_cgroup_per_node *pn;

4722

struct mem_cgroup_per_node *pn;

4728

struct mem_cgroup_per_zone *mz;

4723

struct mem_cgroup_per_zone *mz;

4729

int zone, tmp = node;

4724

int zone, tmp = node;

4730

/*

4725

/*

4731

* This routine is called against possible nodes.

4726

* This routine is called against possible nodes.

4732

* But it's BUG to call kmalloc() against offline node.

4727

* But it's BUG to call kmalloc() against offline node.

4733

*

4728

*

4734

* TODO: this routine can waste much memory for nodes which will

4729

* TODO: this routine can waste much memory for nodes which will

4735

* never be onlined. It's better to use memory hotplug callback

4730

* never be onlined. It's better to use memory hotplug callback

4736

* function.

4731

* function.

4737

*/

4732

*/

4738

if (!node_state(node, N_NORMAL_MEMORY))

4733

if (!node_state(node, N_NORMAL_MEMORY))

4739

tmp = -1;

4734

tmp = -1;

4740

pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);

4735

pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);

4741

if (!pn)

4736

if (!pn)

4742

return 1;

4737

return 1;

4743

4738

4744

for (zone = 0; zone < MAX_NR_ZONES; zone++) {

4739

for (zone = 0; zone < MAX_NR_ZONES; zone++) {

4745

mz = &pn->zoneinfo[zone];

4740

mz = &pn->zoneinfo[zone];

4746

lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);

4741

lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);

4747

mz->usage_in_excess = 0;

4742

mz->usage_in_excess = 0;

4748

mz->on_tree = false;

4743

mz->on_tree = false;

4749

mz->memcg = memcg;

4744

mz->memcg = memcg;

4750

}

4745

}

4751

memcg->info.nodeinfo[node] = pn;

4746

memcg->info.nodeinfo[node] = pn;

4752

return 0;

4747

return 0;

4753

}

4748

}

4754

4749

4755

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)

4750

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)

4756

{

4751

{

4757

kfree(memcg->info.nodeinfo[node]);

4752

kfree(memcg->info.nodeinfo[node]);

4758

}

4753

}

4759

4754

4760

static struct mem_cgroup *mem_cgroup_alloc(void)

4755

static struct mem_cgroup *mem_cgroup_alloc(void)

4761

{

4756

{

4762

struct mem_cgroup *memcg;

4757

struct mem_cgroup *memcg;

4763

int size = sizeof(struct mem_cgroup);

4758

int size = sizeof(struct mem_cgroup);

4764

4759

4765

/* Can be very big if MAX_NUMNODES is very big */

4760

/* Can be very big if MAX_NUMNODES is very big */

4766

if (size < PAGE_SIZE)

4761

if (size < PAGE_SIZE)

4767

memcg = kzalloc(size, GFP_KERNEL);

4762

memcg = kzalloc(size, GFP_KERNEL);

4768

else

4763

else

4769

memcg = vzalloc(size);

4764

memcg = vzalloc(size);

4770

4765

4771

if (!memcg)

4766

if (!memcg)

4772

return NULL;

4767

return NULL;

4773

4768

4774

memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);

4769

memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);

4775

if (!memcg->stat)

4770

if (!memcg->stat)

4776

goto out_free;

4771

goto out_free;

4777

spin_lock_init(&memcg->pcp_counter_lock);

4772

spin_lock_init(&memcg->pcp_counter_lock);

4778

return memcg;

4773

return memcg;

4779

4774

4780

out_free:

4775

out_free:

4781

if (size < PAGE_SIZE)

4776

if (size < PAGE_SIZE)

4782

kfree(memcg);

4777

kfree(memcg);

4783

else

4778

else

4784

vfree(memcg);

4779

vfree(memcg);

4785

return NULL;

4780

return NULL;

4786

}

4781

}

4787

4782

4788

/*

4783

/*

4789

* Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,

4784

* Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,

4790

* but in process context. The work_freeing structure is overlaid

4785

* but in process context. The work_freeing structure is overlaid

4791

* on the rcu_freeing structure, which itself is overlaid on memsw.

4786

* on the rcu_freeing structure, which itself is overlaid on memsw.

4792

*/

4787

*/

4793

static void free_work(struct work_struct *work)

4788

static void free_work(struct work_struct *work)

4794

{

4789

{

4795

struct mem_cgroup *memcg;

4790

struct mem_cgroup *memcg;

4796

int size = sizeof(struct mem_cgroup);

4791

int size = sizeof(struct mem_cgroup);

4797

4792

4798

memcg = container_of(work, struct mem_cgroup, work_freeing);

4793

memcg = container_of(work, struct mem_cgroup, work_freeing);

4799

/*

4794

/*

4800

* We need to make sure that (at least for now), the jump label

4795

* We need to make sure that (at least for now), the jump label

4801

* destruction code runs outside of the cgroup lock. This is because

4796

* destruction code runs outside of the cgroup lock. This is because

4802

* get_online_cpus(), which is called from the static_branch update,

4797

* get_online_cpus(), which is called from the static_branch update,

4803

* can't be called inside the cgroup_lock. cpusets are the ones

4798

* can't be called inside the cgroup_lock. cpusets are the ones

4804

* enforcing this dependency, so if they ever change, we might as well.

4799

* enforcing this dependency, so if they ever change, we might as well.

4805

*

4800

*

4806

* schedule_work() will guarantee this happens. Be careful if you need

4801

* schedule_work() will guarantee this happens. Be careful if you need

4807

* to move this code around, and make sure it is outside

4802

* to move this code around, and make sure it is outside

4808

* the cgroup_lock.

4803

* the cgroup_lock.

4809

*/

4804

*/

4810

disarm_sock_keys(memcg);

4805

disarm_sock_keys(memcg);

4811

if (size < PAGE_SIZE)

4806

if (size < PAGE_SIZE)

4812

kfree(memcg);

4807

kfree(memcg);

4813

else

4808

else

4814

vfree(memcg);

4809

vfree(memcg);

4815

}

4810

}

4816

4811

4817

static void free_rcu(struct rcu_head *rcu_head)

4812

static void free_rcu(struct rcu_head *rcu_head)

4818

{

4813

{

4819

struct mem_cgroup *memcg;

4814

struct mem_cgroup *memcg;

4820

4815

4821

memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);

4816

memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);

4822

INIT_WORK(&memcg->work_freeing, free_work);

4817

INIT_WORK(&memcg->work_freeing, free_work);

4823

schedule_work(&memcg->work_freeing);

4818

schedule_work(&memcg->work_freeing);

4824

}

4819

}

4825

4820

4826

/*

4821

/*

4827

* At destroying mem_cgroup, references from swap_cgroup can remain.

4822

* At destroying mem_cgroup, references from swap_cgroup can remain.

4828

* (scanning all at force_empty is too costly...)

4823

* (scanning all at force_empty is too costly...)

4829

*

4824

*

4830

* Instead of clearing all references at force_empty, we remember

4825

* Instead of clearing all references at force_empty, we remember

4831

* the number of reference from swap_cgroup and free mem_cgroup when

4826

* the number of reference from swap_cgroup and free mem_cgroup when

4832

* it goes down to 0.

4827

* it goes down to 0.

4833

*

4828

*

4834

* Removal of cgroup itself succeeds regardless of refs from swap.

4829

* Removal of cgroup itself succeeds regardless of refs from swap.

4835

*/

4830

*/

4836

4831

4837

static void __mem_cgroup_free(struct mem_cgroup *memcg)

4832

static void __mem_cgroup_free(struct mem_cgroup *memcg)

4838

{

4833

{

4839

int node;

4834

int node;

4840

4835

4841

mem_cgroup_remove_from_trees(memcg);

4836

mem_cgroup_remove_from_trees(memcg);

4842

free_css_id(&mem_cgroup_subsys, &memcg->css);

4837

free_css_id(&mem_cgroup_subsys, &memcg->css);

4843

4838

4844

for_each_node(node)

4839

for_each_node(node)

4845

free_mem_cgroup_per_zone_info(memcg, node);

4840

free_mem_cgroup_per_zone_info(memcg, node);

4846

4841

4847

free_percpu(memcg->stat);

4842

free_percpu(memcg->stat);

4848

call_rcu(&memcg->rcu_freeing, free_rcu);

4843

call_rcu(&memcg->rcu_freeing, free_rcu);

4849

}

4844

}

4850

4845

4851

static void mem_cgroup_get(struct mem_cgroup *memcg)

4846

static void mem_cgroup_get(struct mem_cgroup *memcg)

4852

{

4847

{

4853

atomic_inc(&memcg->refcnt);

4848

atomic_inc(&memcg->refcnt);

4854

}

4849

}

4855

4850

4856

static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)

4851

static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)

4857

{

4852

{

4858

if (atomic_sub_and_test(count, &memcg->refcnt)) {

4853

if (atomic_sub_and_test(count, &memcg->refcnt)) {

4859

struct mem_cgroup *parent = parent_mem_cgroup(memcg);

4854

struct mem_cgroup *parent = parent_mem_cgroup(memcg);

4860

__mem_cgroup_free(memcg);

4855

__mem_cgroup_free(memcg);

4861

if (parent)

4856

if (parent)

4862

mem_cgroup_put(parent);

4857

mem_cgroup_put(parent);

4863

}

4858

}

4864

}

4859

}

4865

4860

4866

/* Drop a single reference on @memcg; see __mem_cgroup_put(). */
static void mem_cgroup_put(struct mem_cgroup *memcg)
{
	__mem_cgroup_put(memcg, 1);
}

4870

4865

4871

/*

4866

/*

4872

* Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.

4867

* Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.

4873

*/

4868

*/

4874

struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)

4869

struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)

4875

{

4870

{

4876

if (!memcg->res.parent)

4871

if (!memcg->res.parent)

4877

return NULL;

4872

return NULL;

4878

return mem_cgroup_from_res_counter(memcg->res.parent, res);

4873

return mem_cgroup_from_res_counter(memcg->res.parent, res);

4879

}

4874

}

4880

EXPORT_SYMBOL(parent_mem_cgroup);

4875

EXPORT_SYMBOL(parent_mem_cgroup);

4881

4876

4882

#ifdef CONFIG_MEMCG_SWAP

4877

#ifdef CONFIG_MEMCG_SWAP

4883

static void __init enable_swap_cgroup(void)

4878

static void __init enable_swap_cgroup(void)

4884

{

4879

{

4885

if (!mem_cgroup_disabled() && really_do_swap_account)

4880

if (!mem_cgroup_disabled() && really_do_swap_account)

4886

do_swap_account = 1;

4881

do_swap_account = 1;

4887

}

4882

}

4888

#else

4883

#else

4889

static void __init enable_swap_cgroup(void)

4884

static void __init enable_swap_cgroup(void)

4890

{

4885

{

4891

}

4886

}

4892

#endif

4887

#endif

4893

4888

4894

static int mem_cgroup_soft_limit_tree_init(void)

4889

static int mem_cgroup_soft_limit_tree_init(void)

4895

{

4890

{

4896

struct mem_cgroup_tree_per_node *rtpn;

4891

struct mem_cgroup_tree_per_node *rtpn;

4897

struct mem_cgroup_tree_per_zone *rtpz;

4892

struct mem_cgroup_tree_per_zone *rtpz;

4898

int tmp, node, zone;

4893

int tmp, node, zone;

4899

4894

4900

for_each_node(node) {

4895

for_each_node(node) {

4901

tmp = node;

4896

tmp = node;

4902

if (!node_state(node, N_NORMAL_MEMORY))

4897

if (!node_state(node, N_NORMAL_MEMORY))

4903

tmp = -1;

4898

tmp = -1;

4904

rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);

4899

rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);

4905

if (!rtpn)

4900

if (!rtpn)

4906

goto err_cleanup;

4901

goto err_cleanup;

4907

4902

4908

soft_limit_tree.rb_tree_per_node[node] = rtpn;

4903

soft_limit_tree.rb_tree_per_node[node] = rtpn;

4909

4904

4910

for (zone = 0; zone < MAX_NR_ZONES; zone++) {

4905

for (zone = 0; zone < MAX_NR_ZONES; zone++) {

4911

rtpz = &rtpn->rb_tree_per_zone[zone];

4906

rtpz = &rtpn->rb_tree_per_zone[zone];

4912

rtpz->rb_root = RB_ROOT;

4907

rtpz->rb_root = RB_ROOT;

4913

spin_lock_init(&rtpz->lock);

4908

spin_lock_init(&rtpz->lock);

4914

}

4909

}

4915

}

4910

}

4916

return 0;

4911

return 0;

4917

4912

4918

err_cleanup:

4913

err_cleanup:

4919

for_each_node(node) {

4914

for_each_node(node) {

4920

if (!soft_limit_tree.rb_tree_per_node[node])

4915

if (!soft_limit_tree.rb_tree_per_node[node])

4921

break;

4916

break;

4922

kfree(soft_limit_tree.rb_tree_per_node[node]);

4917

kfree(soft_limit_tree.rb_tree_per_node[node]);

4923

soft_limit_tree.rb_tree_per_node[node] = NULL;

4918

soft_limit_tree.rb_tree_per_node[node] = NULL;

4924

}

4919

}

4925

return 1;

4920

return 1;

4926

4921

4927

}

4922

}

4928

4923

4929

static struct cgroup_subsys_state * __ref

4924

static struct cgroup_subsys_state * __ref

4930

mem_cgroup_create(struct cgroup *cont)

4925

mem_cgroup_create(struct cgroup *cont)

4931

{

4926

{

4932

struct mem_cgroup *memcg, *parent;

4927

struct mem_cgroup *memcg, *parent;

4933

long error = -ENOMEM;

4928

long error = -ENOMEM;

4934

int node;

4929

int node;

4935

4930

4936

memcg = mem_cgroup_alloc();

4931

memcg = mem_cgroup_alloc();

4937

if (!memcg)

4932

if (!memcg)

4938

return ERR_PTR(error);

4933

return ERR_PTR(error);

4939

4934

4940

for_each_node(node)

4935

for_each_node(node)

4941

if (alloc_mem_cgroup_per_zone_info(memcg, node))

4936

if (alloc_mem_cgroup_per_zone_info(memcg, node))

4942

goto free_out;

4937

goto free_out;

4943

4938

4944

/* root ? */

4939

/* root ? */

4945

if (cont->parent == NULL) {

4940

if (cont->parent == NULL) {

4946

int cpu;

4941

int cpu;

4947

enable_swap_cgroup();

4942

enable_swap_cgroup();

4948

parent = NULL;

4943

parent = NULL;

4949

if (mem_cgroup_soft_limit_tree_init())

4944

if (mem_cgroup_soft_limit_tree_init())

4950

goto free_out;

4945

goto free_out;

4951

root_mem_cgroup = memcg;

4946

root_mem_cgroup = memcg;

4952

for_each_possible_cpu(cpu) {

4947

for_each_possible_cpu(cpu) {

4953

struct memcg_stock_pcp *stock =

4948

struct memcg_stock_pcp *stock =

4954

&per_cpu(memcg_stock, cpu);

4949

&per_cpu(memcg_stock, cpu);

4955

INIT_WORK(&stock->work, drain_local_stock);

4950

INIT_WORK(&stock->work, drain_local_stock);

4956

}

4951

}

4957

hotcpu_notifier(memcg_cpu_hotplug_callback, 0);

4952

hotcpu_notifier(memcg_cpu_hotplug_callback, 0);

4958

} else {

4953

} else {

4959

parent = mem_cgroup_from_cont(cont->parent);

4954

parent = mem_cgroup_from_cont(cont->parent);

4960

memcg->use_hierarchy = parent->use_hierarchy;

4955

memcg->use_hierarchy = parent->use_hierarchy;

4961

memcg->oom_kill_disable = parent->oom_kill_disable;

4956

memcg->oom_kill_disable = parent->oom_kill_disable;

4962

}

4957

}

4963

4958

4964

if (parent && parent->use_hierarchy) {

4959

if (parent && parent->use_hierarchy) {

4965

res_counter_init(&memcg->res, &parent->res);

4960

res_counter_init(&memcg->res, &parent->res);

4966

res_counter_init(&memcg->memsw, &parent->memsw);

4961

res_counter_init(&memcg->memsw, &parent->memsw);

4967

/*

4962

/*

4968

* We increment refcnt of the parent to ensure that we can

4963

* We increment refcnt of the parent to ensure that we can

4969

* safely access it on res_counter_charge/uncharge.

4964

* safely access it on res_counter_charge/uncharge.

4970

* This refcnt will be decremented when freeing this

4965

* This refcnt will be decremented when freeing this

4971

* mem_cgroup(see mem_cgroup_put).

4966

* mem_cgroup(see mem_cgroup_put).

4972

*/

4967

*/

4973

mem_cgroup_get(parent);

4968

mem_cgroup_get(parent);

4974

} else {

4969

} else {

4975

res_counter_init(&memcg->res, NULL);

4970

res_counter_init(&memcg->res, NULL);

4976

res_counter_init(&memcg->memsw, NULL);

4971

res_counter_init(&memcg->memsw, NULL);

4977

}

4972

}

4978

memcg->last_scanned_node = MAX_NUMNODES;

4973

memcg->last_scanned_node = MAX_NUMNODES;

4979

INIT_LIST_HEAD(&memcg->oom_notify);

4974

INIT_LIST_HEAD(&memcg->oom_notify);

4980

4975

4981

if (parent)

4976

if (parent)

4982

memcg->swappiness = mem_cgroup_swappiness(parent);

4977

memcg->swappiness = mem_cgroup_swappiness(parent);

4983

atomic_set(&memcg->refcnt, 1);

4978

atomic_set(&memcg->refcnt, 1);

4984

memcg->move_charge_at_immigrate = 0;

4979

memcg->move_charge_at_immigrate = 0;

4985

mutex_init(&memcg->thresholds_lock);

4980

mutex_init(&memcg->thresholds_lock);

4986

spin_lock_init(&memcg->move_lock);

4981

spin_lock_init(&memcg->move_lock);

4987

4982

4988

error = memcg_init_kmem(memcg, &mem_cgroup_subsys);

4983

error = memcg_init_kmem(memcg, &mem_cgroup_subsys);

4989

if (error) {

4984

if (error) {

4990

/*

4985

/*

4991

* We call put now because our (and parent's) refcnts

4986

* We call put now because our (and parent's) refcnts

4992

* are already in place. mem_cgroup_put() will internally

4987

* are already in place. mem_cgroup_put() will internally

4993

* call __mem_cgroup_free, so return directly

4988

* call __mem_cgroup_free, so return directly

4994

*/

4989

*/

4995

mem_cgroup_put(memcg);

4990

mem_cgroup_put(memcg);

4996

return ERR_PTR(error);

4991

return ERR_PTR(error);

4997

}

4992

}

4998

return &memcg->css;

4993

return &memcg->css;

4999

free_out:

4994

free_out:

5000

__mem_cgroup_free(memcg);

4995

__mem_cgroup_free(memcg);

5001

return ERR_PTR(error);

4996

return ERR_PTR(error);

5002

}

4997

}

5003

4998

5004

static int mem_cgroup_pre_destroy(struct cgroup *cont)

4999

static int mem_cgroup_pre_destroy(struct cgroup *cont)

5005

{

5000

{

5006

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

5001

struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

5007

5002

5008

return mem_cgroup_force_empty(memcg, false);

5003

return mem_cgroup_force_empty(memcg, false);

5009

}

5004

}

5010

5005

5011

/*
 * destroy hook: run the kmem teardown for @cont's memcg, then drop one
 * reference with mem_cgroup_put().
 */
static void mem_cgroup_destroy(struct cgroup *cont)
{
	struct mem_cgroup *dying = mem_cgroup_from_cont(cont);

	kmem_cgroup_destroy(dying);
	mem_cgroup_put(dying);
}

5019

5014

5020

#ifdef CONFIG_MMU

5015

#ifdef CONFIG_MMU

5021

/* Handlers for move charge at task migration. */

5016

/* Handlers for move charge at task migration. */

5022

#define PRECHARGE_COUNT_AT_ONCE 256

5017

#define PRECHARGE_COUNT_AT_ONCE 256

5023

static int mem_cgroup_do_precharge(unsigned long count)

5018

static int mem_cgroup_do_precharge(unsigned long count)

5024

{

5019

{

5025

int ret = 0;

5020

int ret = 0;

5026

int batch_count = PRECHARGE_COUNT_AT_ONCE;

5021

int batch_count = PRECHARGE_COUNT_AT_ONCE;

5027

struct mem_cgroup *memcg = mc.to;

5022

struct mem_cgroup *memcg = mc.to;

5028

5023

5029

if (mem_cgroup_is_root(memcg)) {

5024

if (mem_cgroup_is_root(memcg)) {

5030

mc.precharge += count;

5025

mc.precharge += count;

5031

/* we don't need css_get for root */

5026

/* we don't need css_get for root */

5032

return ret;

5027

return ret;

5033

}

5028

}

5034

/* try to charge at once */

5029

/* try to charge at once */

5035

if (count > 1) {

5030

if (count > 1) {

5036

struct res_counter *dummy;

5031

struct res_counter *dummy;

5037

/*

5032

/*

5038

* "memcg" cannot be under rmdir() because we've already checked

5033

* "memcg" cannot be under rmdir() because we've already checked

5039

* by cgroup_lock_live_cgroup() that it is not removed and we

5034

* by cgroup_lock_live_cgroup() that it is not removed and we

5040

* are still under the same cgroup_mutex. So we can postpone

5035

* are still under the same cgroup_mutex. So we can postpone

5041

* css_get().

5036

* css_get().

5042

*/

5037

*/

5043

if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))

5038

if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))

5044

goto one_by_one;

5039

goto one_by_one;

5045

if (do_swap_account && res_counter_charge(&memcg->memsw,

5040

if (do_swap_account && res_counter_charge(&memcg->memsw,

5046

PAGE_SIZE * count, &dummy)) {

5041

PAGE_SIZE * count, &dummy)) {

5047

res_counter_uncharge(&memcg->res, PAGE_SIZE * count);

5042

res_counter_uncharge(&memcg->res, PAGE_SIZE * count);

5048

goto one_by_one;

5043

goto one_by_one;

5049

}

5044

}

5050

mc.precharge += count;

5045

mc.precharge += count;

5051

return ret;

5046

return ret;

5052

}

5047

}

5053

one_by_one:

5048

one_by_one:

5054

/* fall back to one by one charge */

5049

/* fall back to one by one charge */

5055

while (count--) {

5050

while (count--) {

5056

if (signal_pending(current)) {

5051

if (signal_pending(current)) {

5057

ret = -EINTR;

5052

ret = -EINTR;

5058

break;

5053

break;

5059

}

5054

}

5060

if (!batch_count--) {

5055

if (!batch_count--) {

5061

batch_count = PRECHARGE_COUNT_AT_ONCE;

5056

batch_count = PRECHARGE_COUNT_AT_ONCE;

5062

cond_resched();

5057

cond_resched();

5063

}

5058

}

5064

ret = __mem_cgroup_try_charge(NULL,

5059

ret = __mem_cgroup_try_charge(NULL,

5065

GFP_KERNEL, 1, &memcg, false);

5060

GFP_KERNEL, 1, &memcg, false);

5066

if (ret)

5061

if (ret)

5067

/* mem_cgroup_clear_mc() will do uncharge later */

5062

/* mem_cgroup_clear_mc() will do uncharge later */

5068

return ret;

5063

return ret;

5069

mc.precharge++;

5064

mc.precharge++;

5070

}

5065

}

5071

return ret;

5066

return ret;

5072

}

5067

}

5073

5068

5074

/**

5069

/**

5075

* get_mctgt_type - get target type of moving charge

5070

* get_mctgt_type - get target type of moving charge

5076

* @vma: the vma the pte to be checked belongs

5071

* @vma: the vma the pte to be checked belongs

5077

* @addr: the address corresponding to the pte to be checked

5072

* @addr: the address corresponding to the pte to be checked

5078

* @ptent: the pte to be checked

5073

* @ptent: the pte to be checked

5079

* @target: the pointer the target page or swap ent will be stored(can be NULL)

5074

* @target: the pointer the target page or swap ent will be stored(can be NULL)

5080

*

5075

*

5081

* Returns

5076

* Returns

5082

* 0(MC_TARGET_NONE): if the pte is not a target for move charge.

5077

* 0(MC_TARGET_NONE): if the pte is not a target for move charge.

5083

* 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for

5078

* 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for

5084

* move charge. if @target is not NULL, the page is stored in target->page

5079

* move charge. if @target is not NULL, the page is stored in target->page

5085

* with extra refcnt got(Callers should handle it).

5080

* with extra refcnt got(Callers should handle it).

5086

* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a

5081

* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a

5087

* target for charge migration. if @target is not NULL, the entry is stored

5082

* target for charge migration. if @target is not NULL, the entry is stored

5088

* in target->ent.

5083

* in target->ent.

5089

*

5084

*

5090

* Called with pte lock held.

5085

* Called with pte lock held.

5091

*/

5086

*/

5092

union mc_target {

5087

union mc_target {

5093

struct page *page;

5088

struct page *page;

5094

swp_entry_t ent;

5089

swp_entry_t ent;

5095

};

5090

};

5096

5091

5097

/* How a pte (or huge pmd) is classified when looking for charges to move. */
enum mc_target_type {
	MC_TARGET_NONE = 0,	/* nothing to move for this entry */
	MC_TARGET_PAGE,		/* a page whose charge belongs to mc.from */
	MC_TARGET_SWAP,		/* a swap entry accounted to mc.from */
};

5102

5097

5103

/*
 * Resolve a present pte to the page it maps, if that page is eligible for
 * charge moving.
 *
 * Returns the page with an extra reference taken via get_page_unless_zero(),
 * or NULL when there is no normal page, the page is not mapped, the relevant
 * move_anon()/move_file() switch is off, or the reference could not be taken.
 */
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
						unsigned long addr, pte_t ptent)
{
	struct page *page = vm_normal_page(vma, addr, ptent);

	if (!page || !page_mapped(page))
		return NULL;
	if (PageAnon(page)) {
		/*
		 * Anon pages are moved only when move_anon() allows it.
		 * NOTE(review): the original comment here read "we don't move
		 * shared anon", but no mapcount check is visible in this
		 * function - confirm whether that comment is stale.
		 */
		if (!move_anon())
			return NULL;
	} else if (!move_file())
		/* we ignore mapcount for file pages */
		return NULL;
	if (!get_page_unless_zero(page))
		return NULL;

	return page;
}

5122

5117

5123

#ifdef CONFIG_SWAP

5118

#ifdef CONFIG_SWAP

5124

static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,

5119

static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,

5125

unsigned long addr, pte_t ptent, swp_entry_t *entry)

5120

unsigned long addr, pte_t ptent, swp_entry_t *entry)

5126

{

5121

{

5127

struct page *page = NULL;

5122

struct page *page = NULL;

5128

swp_entry_t ent = pte_to_swp_entry(ptent);

5123

swp_entry_t ent = pte_to_swp_entry(ptent);

5129

5124

5130

if (!move_anon() || non_swap_entry(ent))

5125

if (!move_anon() || non_swap_entry(ent))

5131

return NULL;

5126

return NULL;

5132

/*

5127

/*

5133

* Because lookup_swap_cache() updates some statistics counter,

5128

* Because lookup_swap_cache() updates some statistics counter,

5134

* we call find_get_page() with swapper_space directly.

5129

* we call find_get_page() with swapper_space directly.

5135

*/

5130

*/

5136

page = find_get_page(&swapper_space, ent.val);

5131

page = find_get_page(&swapper_space, ent.val);

5137

if (do_swap_account)

5132

if (do_swap_account)

5138

entry->val = ent.val;

5133

entry->val = ent.val;

5139

5134

5140

return page;

5135

return page;

5141

}

5136

}

5142

#else

5137

#else

5143

static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,

5138

static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,

5144

unsigned long addr, pte_t ptent, swp_entry_t *entry)

5139

unsigned long addr, pte_t ptent, swp_entry_t *entry)

5145

{

5140

{

5146

return NULL;

5141

return NULL;

5147

}

5142

}

5148

#endif

5143

#endif

5149

5144

5150

/*
 * Handle a none/file pte in a file-backed vma: look up the page cache page
 * that backs @addr in the vma's mapping.
 *
 * Returns the page found by find_get_page() (may be NULL), or NULL for an
 * anonymous vma or when move_file() is off.  With CONFIG_SWAP, an exceptional
 * radix-tree entry is converted to a swap entry, stored in @entry when swap
 * accounting is on, and the swap cache is searched instead.
 */
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
			unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
	struct page *page = NULL;
	struct address_space *mapping;
	pgoff_t pgoff;

	if (!vma->vm_file) /* anonymous vma */
		return NULL;
	if (!move_file())
		return NULL;

	mapping = vma->vm_file->f_mapping;
	if (pte_none(ptent))
		pgoff = linear_page_index(vma, addr);
	else /* pte_file(ptent) is true */
		pgoff = pte_to_pgoff(ptent);

	/* page is moved even if it's not RSS of this task(page-faulted). */
	page = find_get_page(mapping, pgoff);

#ifdef CONFIG_SWAP
	/* shmem/tmpfs may report page out on swap: account for that too. */
	if (radix_tree_exceptional_entry(page)) {
		swp_entry_t swap = radix_to_swp_entry(page);

		if (do_swap_account)
			*entry = swap;
		page = find_get_page(&swapper_space, swap.val);
	}
#endif
	return page;
}

5182

5177

5183

/*
 * Classify one pte as a move-charge target.  Dispatches on the pte state to
 * the present/swap/file helpers, then checks whether the resulting page or
 * swap entry is charged to mc.from.
 *
 * When MC_TARGET_PAGE is returned and @target is non-NULL, the page reference
 * obtained by the helper is handed to the caller through target->page; in
 * every other case the reference is dropped here.
 */
static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
		unsigned long addr, pte_t ptent, union mc_target *target)
{
	struct page *page = NULL;
	struct page_cgroup *pc;
	enum mc_target_type ret = MC_TARGET_NONE;
	swp_entry_t ent = { .val = 0 };

	if (pte_present(ptent))
		page = mc_handle_present_pte(vma, addr, ptent);
	else if (is_swap_pte(ptent))
		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
	else if (pte_none(ptent) || pte_file(ptent))
		page = mc_handle_file_pte(vma, addr, ptent, &ent);

	if (!page && !ent.val)
		return ret;
	if (page) {
		pc = lookup_page_cgroup(page);
		/*
		 * Do only loose check w/o page_cgroup lock.
		 * mem_cgroup_move_account() checks the pc is valid or not under
		 * the lock.
		 */
		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
			ret = MC_TARGET_PAGE;
			if (target)
				target->page = page;
		}
		/* keep the reference only when it is passed on to the caller */
		if (!ret || !target)
			put_page(page);
	}
	/* There is a swap entry and a page doesn't exist or isn't charged */
	if (ent.val && !ret &&
			css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
		ret = MC_TARGET_SWAP;
		if (target)
			target->ent = ent;
	}
	return ret;
}

5224

5219

5225

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

5220

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

5226

/*

5221

/*

5227

* We don't consider swapping or file mapped pages because THP does not

5222

* We don't consider swapping or file mapped pages because THP does not

5228

* support them for now.

5223

* support them for now.

5229

* Caller should make sure that pmd_trans_huge(pmd) is true.

5224

* Caller should make sure that pmd_trans_huge(pmd) is true.

5230

*/

5225

*/

5231

static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,

5226

static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,

5232

unsigned long addr, pmd_t pmd, union mc_target *target)

5227

unsigned long addr, pmd_t pmd, union mc_target *target)

5233

{

5228

{

5234

struct page *page = NULL;

5229

struct page *page = NULL;

5235

struct page_cgroup *pc;

5230

struct page_cgroup *pc;

5236

enum mc_target_type ret = MC_TARGET_NONE;

5231

enum mc_target_type ret = MC_TARGET_NONE;

5237

5232

5238

page = pmd_page(pmd);

5233

page = pmd_page(pmd);

5239

VM_BUG_ON(!page || !PageHead(page));

5234

VM_BUG_ON(!page || !PageHead(page));

5240

if (!move_anon())

5235

if (!move_anon())

5241

return ret;

5236

return ret;

5242

pc = lookup_page_cgroup(page);

5237

pc = lookup_page_cgroup(page);

5243

if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {

5238

if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {

5244

ret = MC_TARGET_PAGE;

5239

ret = MC_TARGET_PAGE;

5245

if (target) {

5240

if (target) {

5246

get_page(page);

5241

get_page(page);

5247

target->page = page;

5242

target->page = page;

5248

}

5243

}

5249

}

5244

}

5250

return ret;

5245

return ret;

5251

}

5246

}

5252

#else

5247

#else

5253

static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,

5248

static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,

5254

unsigned long addr, pmd_t pmd, union mc_target *target)

5249

unsigned long addr, pmd_t pmd, union mc_target *target)

5255

{

5250

{

5256

return MC_TARGET_NONE;

5251

return MC_TARGET_NONE;

5257

}

5252

}

5258

#endif

5253

#endif

5259

5254

5260

/*
 * pmd_entry callback for the precharge-counting page walk.  Adds to
 * mc.precharge the number of pages in [addr, end) that are move-charge
 * targets: HPAGE_PMD_NR for a target huge pmd, one per target pte otherwise.
 * walk->private carries the vma being walked.  Always returns 0.
 */
static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
					unsigned long addr, unsigned long end,
					struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	pte_t *pte;
	spinlock_t *ptl;

	if (pmd_trans_huge_lock(pmd, vma) == 1) {
		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
			mc.precharge += HPAGE_PMD_NR;
		spin_unlock(&vma->vm_mm->page_table_lock);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE)
		if (get_mctgt_type(vma, addr, *pte, NULL))
			mc.precharge++;	/* increment precharge temporarily */
	/* pte was advanced one past the last entry examined */
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	return 0;
}

5286

5281

5287

static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)

5282

static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)

5288

{

5283

{

5289

unsigned long precharge;

5284

unsigned long precharge;

5290

struct vm_area_struct *vma;

5285

struct vm_area_struct *vma;

5291

5286

5292

down_read(&mm->mmap_sem);

5287

down_read(&mm->mmap_sem);

5293

for (vma = mm->mmap; vma; vma = vma->vm_next) {

5288

for (vma = mm->mmap; vma; vma = vma->vm_next) {

5294

struct mm_walk mem_cgroup_count_precharge_walk = {

5289

struct mm_walk mem_cgroup_count_precharge_walk = {

5295

.pmd_entry = mem_cgroup_count_precharge_pte_range,

5290

.pmd_entry = mem_cgroup_count_precharge_pte_range,

5296

.mm = mm,

5291

.mm = mm,

5297

.private = vma,

5292

.private = vma,

5298

};

5293

};

5299

if (is_vm_hugetlb_page(vma))

5294

if (is_vm_hugetlb_page(vma))

5300

continue;

5295

continue;

5301

walk_page_range(vma->vm_start, vma->vm_end,

5296

walk_page_range(vma->vm_start, vma->vm_end,

5302

&mem_cgroup_count_precharge_walk);

5297

&mem_cgroup_count_precharge_walk);

5303

}

5298

}

5304

up_read(&mm->mmap_sem);

5299

up_read(&mm->mmap_sem);

5305

5300

5306

precharge = mc.precharge;

5301

precharge = mc.precharge;

5307

mc.precharge = 0;

5302

mc.precharge = 0;

5308

5303

5309

return precharge;

5304

return precharge;

5310

}

5305

}

5311

5306

5312

static int mem_cgroup_precharge_mc(struct mm_struct *mm)

5307

static int mem_cgroup_precharge_mc(struct mm_struct *mm)

5313

{

5308

{

5314

unsigned long precharge = mem_cgroup_count_precharge(mm);

5309

unsigned long precharge = mem_cgroup_count_precharge(mm);

5315

5310

5316

VM_BUG_ON(mc.moving_task);

5311

VM_BUG_ON(mc.moving_task);

5317

mc.moving_task = current;

5312

mc.moving_task = current;

5318

return mem_cgroup_do_precharge(precharge);

5313

return mem_cgroup_do_precharge(precharge);

5319

}

5314

}

5320

5315

5321

/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */

5316

/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */

5322

static void __mem_cgroup_clear_mc(void)

5317

static void __mem_cgroup_clear_mc(void)

5323

{

5318

{

5324

struct mem_cgroup *from = mc.from;

5319

struct mem_cgroup *from = mc.from;

5325

struct mem_cgroup *to = mc.to;

5320

struct mem_cgroup *to = mc.to;

5326

5321

5327

/* we must uncharge all the leftover precharges from mc.to */

5322

/* we must uncharge all the leftover precharges from mc.to */

5328

if (mc.precharge) {

5323

if (mc.precharge) {

5329

__mem_cgroup_cancel_charge(mc.to, mc.precharge);

5324

__mem_cgroup_cancel_charge(mc.to, mc.precharge);

5330

mc.precharge = 0;

5325

mc.precharge = 0;

5331

}

5326

}

5332

/*

5327

/*

5333

* we didn't uncharge from mc.from at mem_cgroup_move_account(), so

5328

* we didn't uncharge from mc.from at mem_cgroup_move_account(), so

5334

* we must uncharge here.

5329

* we must uncharge here.

5335

*/

5330

*/

5336

if (mc.moved_charge) {

5331

if (mc.moved_charge) {

5337

__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);

5332

__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);

5338

mc.moved_charge = 0;

5333

mc.moved_charge = 0;

5339

}

5334

}

5340

/* we must fixup refcnts and charges */

5335

/* we must fixup refcnts and charges */

5341

if (mc.moved_swap) {

5336

if (mc.moved_swap) {

5342

/* uncharge swap account from the old cgroup */

5337

/* uncharge swap account from the old cgroup */

5343

if (!mem_cgroup_is_root(mc.from))

5338

if (!mem_cgroup_is_root(mc.from))

5344

res_counter_uncharge(&mc.from->memsw,

5339

res_counter_uncharge(&mc.from->memsw,

5345

PAGE_SIZE * mc.moved_swap);

5340

PAGE_SIZE * mc.moved_swap);

5346

__mem_cgroup_put(mc.from, mc.moved_swap);

5341

__mem_cgroup_put(mc.from, mc.moved_swap);

5347

5342

5348

if (!mem_cgroup_is_root(mc.to)) {

5343

if (!mem_cgroup_is_root(mc.to)) {

5349

/*

5344

/*

5350

* we charged both to->res and to->memsw, so we should

5345

* we charged both to->res and to->memsw, so we should

5351

* uncharge to->res.

5346

* uncharge to->res.

5352

*/

5347

*/

5353

res_counter_uncharge(&mc.to->res,

5348

res_counter_uncharge(&mc.to->res,

5354

PAGE_SIZE * mc.moved_swap);

5349

PAGE_SIZE * mc.moved_swap);

5355

}

5350

}

5356

/* we've already done mem_cgroup_get(mc.to) */

5351

/* we've already done mem_cgroup_get(mc.to) */

5357

mc.moved_swap = 0;

5352

mc.moved_swap = 0;

5358

}

5353

}

5359

memcg_oom_recover(from);

5354

memcg_oom_recover(from);

5360

memcg_oom_recover(to);

5355

memcg_oom_recover(to);

5361

wake_up_all(&mc.waitq);

5356

wake_up_all(&mc.waitq);

5362

}

5357

}

5363

5358

5364

static void mem_cgroup_clear_mc(void)

5359

static void mem_cgroup_clear_mc(void)

5365

{

5360

{

5366

struct mem_cgroup *from = mc.from;

5361

struct mem_cgroup *from = mc.from;

5367

5362

5368

/*

5363

/*

5369

* we must clear moving_task before waking up waiters at the end of

5364

* we must clear moving_task before waking up waiters at the end of

5370

* task migration.

5365

* task migration.

5371

*/

5366

*/

5372

mc.moving_task = NULL;

5367

mc.moving_task = NULL;

5373

__mem_cgroup_clear_mc();

5368

__mem_cgroup_clear_mc();

5374

spin_lock(&mc.lock);

5369

spin_lock(&mc.lock);

5375

mc.from = NULL;

5370

mc.from = NULL;

5376

mc.to = NULL;

5371

mc.to = NULL;

5377

spin_unlock(&mc.lock);

5372

spin_unlock(&mc.lock);

5378

mem_cgroup_end_move(from);

5373

mem_cgroup_end_move(from);

5379

}

5374

}

5380

5375

5381

static int mem_cgroup_can_attach(struct cgroup *cgroup,

5376

static int mem_cgroup_can_attach(struct cgroup *cgroup,

5382

struct cgroup_taskset *tset)

5377

struct cgroup_taskset *tset)

5383

{

5378

{

5384

struct task_struct *p = cgroup_taskset_first(tset);

5379

struct task_struct *p = cgroup_taskset_first(tset);

5385

int ret = 0;

5380

int ret = 0;

5386

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);

5381

struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);

5387

5382

5388

if (memcg->move_charge_at_immigrate) {

5383

if (memcg->move_charge_at_immigrate) {

5389

struct mm_struct *mm;

5384

struct mm_struct *mm;

5390

struct mem_cgroup *from = mem_cgroup_from_task(p);

5385

struct mem_cgroup *from = mem_cgroup_from_task(p);

5391

5386

5392

VM_BUG_ON(from == memcg);

5387

VM_BUG_ON(from == memcg);

5393

5388

5394

mm = get_task_mm(p);

5389

mm = get_task_mm(p);

5395

if (!mm)

5390

if (!mm)

5396

return 0;

5391

return 0;

5397

/* We move charges only when we move a owner of the mm */

5392

/* We move charges only when we move a owner of the mm */

5398

if (mm->owner == p) {

5393

if (mm->owner == p) {

5399

VM_BUG_ON(mc.from);

5394

VM_BUG_ON(mc.from);

5400

VM_BUG_ON(mc.to);

5395

VM_BUG_ON(mc.to);

5401

VM_BUG_ON(mc.precharge);

5396

VM_BUG_ON(mc.precharge);

5402

VM_BUG_ON(mc.moved_charge);

5397

VM_BUG_ON(mc.moved_charge);

5403

VM_BUG_ON(mc.moved_swap);

5398

VM_BUG_ON(mc.moved_swap);

5404

mem_cgroup_start_move(from);

5399

mem_cgroup_start_move(from);

5405

spin_lock(&mc.lock);

5400

spin_lock(&mc.lock);

5406

mc.from = from;

5401

mc.from = from;

5407

mc.to = memcg;

5402

mc.to = memcg;

5408

spin_unlock(&mc.lock);

5403

spin_unlock(&mc.lock);

5409

/* We set mc.moving_task later */

5404

/* We set mc.moving_task later */

5410

5405

5411

ret = mem_cgroup_precharge_mc(mm);

5406

ret = mem_cgroup_precharge_mc(mm);

5412

if (ret)

5407

if (ret)

5413

mem_cgroup_clear_mc();

5408

mem_cgroup_clear_mc();

5414

}

5409

}

5415

mmput(mm);

5410

mmput(mm);

5416

}

5411

}

5417

return ret;

5412

return ret;

5418

}

5413

}

5419

5414

5420

/* cancel_attach callback: tear down the move context set up by can_attach. */
static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
				     struct cgroup_taskset *tset)
{
	mem_cgroup_clear_mc();
}

5425

5420

5426

/*
 * pmd_entry callback for the charge-moving page walk.  For each target in
 * [addr, end) moves the page or swap charge from mc.from to mc.to, consuming
 * one unit of mc.precharge per page (HPAGE_PMD_NR for a huge pmd).
 * walk->private carries the vma being walked.
 *
 * Returns 0, or the error from mem_cgroup_do_precharge() when the precharge
 * ran out mid-range and one more page could not be charged.
 */
static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	int ret = 0;
	struct vm_area_struct *vma = walk->private;
	pte_t *pte;
	spinlock_t *ptl;
	enum mc_target_type target_type;
	union mc_target target;
	struct page *page;
	struct page_cgroup *pc;

	/*
	 * We don't take compound_lock() here but no race with splitting thp
	 * happens because:
	 *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
	 *    under splitting, which means there's no concurrent thp split,
	 *  - if another thread runs into split_huge_page() just after we
	 *    entered this if-block, the thread must wait for page table lock
	 *    to be unlocked in __split_huge_page_splitting(), where the main
	 *    part of thp split is not executed yet.
	 */
	if (pmd_trans_huge_lock(pmd, vma) == 1) {
		/* not enough precharge left to cover a whole huge page */
		if (mc.precharge < HPAGE_PMD_NR) {
			spin_unlock(&vma->vm_mm->page_table_lock);
			return 0;
		}
		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
		if (target_type == MC_TARGET_PAGE) {
			page = target.page;
			if (!isolate_lru_page(page)) {
				pc = lookup_page_cgroup(page);
				if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
							pc, mc.from, mc.to)) {
					mc.precharge -= HPAGE_PMD_NR;
					mc.moved_charge += HPAGE_PMD_NR;
				}
				putback_lru_page(page);
			}
			/* drop the reference taken by get_mctgt_type_thp() */
			put_page(page);
		}
		spin_unlock(&vma->vm_mm->page_table_lock);
		return 0;
	}

	if (pmd_trans_unstable(pmd))
		return 0;
retry:
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; addr += PAGE_SIZE) {
		pte_t ptent = *(pte++);
		swp_entry_t ent;

		/* out of precharge: leave with addr != end so we can refill */
		if (!mc.precharge)
			break;

		switch (get_mctgt_type(vma, addr, ptent, &target)) {
		case MC_TARGET_PAGE:
			page = target.page;
			if (isolate_lru_page(page))
				goto put;
			pc = lookup_page_cgroup(page);
			if (!mem_cgroup_move_account(page, 1, pc,
						     mc.from, mc.to)) {
				mc.precharge--;
				/* we uncharge from mc.from later. */
				mc.moved_charge++;
			}
			putback_lru_page(page);
put:			/* get_mctgt_type() gets the page */
			put_page(page);
			break;
		case MC_TARGET_SWAP:
			ent = target.ent;
			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
				mc.precharge--;
				/* we fixup refcnts and charges later. */
				mc.moved_swap++;
			}
			break;
		default:
			break;
		}
	}
	/* pte was advanced one past the last entry read */
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (addr != end) {
		/*
		 * We have consumed all precharges we got in can_attach().
		 * We try charge one by one, but don't do any additional
		 * charges to mc.to if we have failed in charge once in attach()
		 * phase.
		 */
		ret = mem_cgroup_do_precharge(1);
		if (!ret)
			goto retry;
	}

	return ret;
}

5528

5523

5529

static void mem_cgroup_move_charge(struct mm_struct *mm)

5524

static void mem_cgroup_move_charge(struct mm_struct *mm)

5530

{

5525

{

5531

struct vm_area_struct *vma;

5526

struct vm_area_struct *vma;

5532

5527

5533

lru_add_drain_all();

5528

lru_add_drain_all();

5534

retry:

5529

retry:

5535

if (unlikely(!down_read_trylock(&mm->mmap_sem))) {

5530

if (unlikely(!down_read_trylock(&mm->mmap_sem))) {

5536

/*

5531

/*

5537

* Someone who are holding the mmap_sem might be waiting in

5532

* Someone who are holding the mmap_sem might be waiting in

5538

* waitq. So we cancel all extra charges, wake up all waiters,

5533

* waitq. So we cancel all extra charges, wake up all waiters,

5539

* and retry. Because we cancel precharges, we might not be able

5534

* and retry. Because we cancel precharges, we might not be able

5540

* to move enough charges, but moving charge is a best-effort

5535

* to move enough charges, but moving charge is a best-effort

5541

* feature anyway, so it wouldn't be a big problem.

5536

* feature anyway, so it wouldn't be a big problem.

5542

*/

5537

*/

5543

__mem_cgroup_clear_mc();

5538

__mem_cgroup_clear_mc();

5544

cond_resched();

5539

cond_resched();

5545

goto retry;

5540

goto retry;

5546

}

5541

}

5547

for (vma = mm->mmap; vma; vma = vma->vm_next) {

5542

for (vma = mm->mmap; vma; vma = vma->vm_next) {

5548

int ret;

5543

int ret;

5549

struct mm_walk mem_cgroup_move_charge_walk = {

5544

struct mm_walk mem_cgroup_move_charge_walk = {

5550

.pmd_entry = mem_cgroup_move_charge_pte_range,

5545

.pmd_entry = mem_cgroup_move_charge_pte_range,

5551

.mm = mm,

5546

.mm = mm,

5552

.private = vma,

5547

.private = vma,

5553

};

5548

};

5554

if (is_vm_hugetlb_page(vma))

5549

if (is_vm_hugetlb_page(vma))

5555

continue;

5550

continue;

5556

ret = walk_page_range(vma->vm_start, vma->vm_end,

5551

ret = walk_page_range(vma->vm_start, vma->vm_end,

5557

&mem_cgroup_move_charge_walk);

5552

&mem_cgroup_move_charge_walk);

5558

if (ret)

5553

if (ret)

5559

/*

5554

/*

5560

* means we have consumed all precharges and failed in

5555

* means we have consumed all precharges and failed in

5561

* doing additional charge. Just abandon here.

5556

* doing additional charge. Just abandon here.

5562

*/

5557

*/

5563

break;

5558

break;

5564

}

5559

}

5565

up_read(&mm->mmap_sem);

5560

up_read(&mm->mmap_sem);

5566

}

5561

}

5567

5562

5568

static void mem_cgroup_move_task(struct cgroup *cont,

5563

static void mem_cgroup_move_task(struct cgroup *cont,

5569

struct cgroup_taskset *tset)

5564

struct cgroup_taskset *tset)

5570

{

5565

{

5571

struct task_struct *p = cgroup_taskset_first(tset);

5566

struct task_struct *p = cgroup_taskset_first(tset);

5572

struct mm_struct *mm = get_task_mm(p);

5567

struct mm_struct *mm = get_task_mm(p);

5573

5568

5574

if (mm) {

5569

if (mm) {

5575

if (mc.to)

5570

if (mc.to)

5576

mem_cgroup_move_charge(mm);

5571

mem_cgroup_move_charge(mm);

5577

mmput(mm);

5572

mmput(mm);

5578

}

5573

}

5579

if (mc.to)

5574

if (mc.to)

5580

mem_cgroup_clear_mc();

5575

mem_cgroup_clear_mc();

5581

}

5576

}

5582

#else /* !CONFIG_MMU */

5577

#else /* !CONFIG_MMU */

5583

/* !CONFIG_MMU stub: nothing to precharge, attaching is always allowed. */
static int mem_cgroup_can_attach(struct cgroup *cgroup,
				 struct cgroup_taskset *tset)
{
	return 0;
}

5588

/* !CONFIG_MMU stub: no move context exists, so there is nothing to cancel. */
static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
				     struct cgroup_taskset *tset)
{
}

5592

/* !CONFIG_MMU stub: charges are not moved on task migration. */
static void mem_cgroup_move_task(struct cgroup *cont,
				 struct cgroup_taskset *tset)
{
}

5596

#endif

5591

#endif

5597

5592

5598

struct cgroup_subsys mem_cgroup_subsys = {

5593

struct cgroup_subsys mem_cgroup_subsys = {

5599

.name = "memory",

5594

.name = "memory",

5600

.subsys_id = mem_cgroup_subsys_id,

5595

.subsys_id = mem_cgroup_subsys_id,

5601

.create = mem_cgroup_create,

5596

.create = mem_cgroup_create,

5602

.pre_destroy = mem_cgroup_pre_destroy,

5597

.pre_destroy = mem_cgroup_pre_destroy,

5603

.destroy = mem_cgroup_destroy,

5598

.destroy = mem_cgroup_destroy,

5604

.can_attach = mem_cgroup_can_attach,

5599

.can_attach = mem_cgroup_can_attach,

5605

.cancel_attach = mem_cgroup_cancel_attach,

5600

.cancel_attach = mem_cgroup_cancel_attach,

5606

.attach = mem_cgroup_move_task,

5601

.attach = mem_cgroup_move_task,

5607

.base_cftypes = mem_cgroup_files,

5602

.base_cftypes = mem_cgroup_files,

5608

.early_init = 0,

5603

.early_init = 0,

5609

.use_id = 1,

5604

.use_id = 1,

5610

.__DEPRECATED_clear_css_refs = true,

5605

.__DEPRECATED_clear_css_refs = true,

5611

};

5606

};

5612

5607

5613

#ifdef CONFIG_MEMCG_SWAP
/*
 * Handler for the "swapaccount=" boot parameter.
 * @s: the text following "swapaccount=" on the command line.
 *
 * "1" sets really_do_swap_account, "0" clears it; any other value leaves the
 * existing setting unchanged.  Always returns 1.
 */
static int __init enable_swap_account(char *s)
{
	if (!strcmp(s, "1"))
		really_do_swap_account = 1;
	else if (!strcmp(s, "0"))
		really_do_swap_account = 0;
	return 1;
}
__setup("swapaccount=", enable_swap_account);

#endif

5626

5621

GITLAB

mm: memcg: only check for PageSwapCache when uncharging anon

 /* memcontrol.c - Memory Controller
  *
  * Copyright IBM Corporation, 2007
  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
  *
  * Copyright 2007 OpenVZ SWsoft Inc
  * Author: Pavel Emelianov <xemul@openvz.org>
  *
  * Memory thresholds
  * Copyright (C) 2009 Nokia Corporation
  * Author: Kirill A. Shutemov
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
  * (at your option) any later version.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
  */
 #include <linux/res_counter.h>
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
 #include <linux/smp.h>
 #include <linux/page-flags.h>
 #include <linux/backing-dev.h>
 #include <linux/bit_spinlock.h>
 #include <linux/rcupdate.h>
 #include <linux/limits.h>
 #include <linux/export.h>
 #include <linux/mutex.h>
 #include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/spinlock.h>
 #include <linux/eventfd.h>
 #include <linux/sort.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/vmalloc.h>
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
 #include <linux/cpu.h>
 #include <linux/oom.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/tcp_memcontrol.h>
 #include <asm/uaccess.h>
 #include <trace/events/vmscan.h>
 /* Tentative definition; the initialized subsystem descriptor appears later in this file. */
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 /* NOTE(review): users of this retry count are outside this excerpt. */
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 /* The memcg substituted when callers such as mem_cgroup_iter() pass a NULL root. */
 static struct mem_cgroup *root_mem_cgroup __read_mostly;
 #ifdef CONFIG_MEMCG_SWAP
 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
 int do_swap_account __read_mostly;
 /*
  * Remembers the boot-time choice. The build-time default follows
  * CONFIG_MEMCG_SWAP_ENABLED.
  */
 #ifdef CONFIG_MEMCG_SWAP_ENABLED
 static int really_do_swap_account __initdata = 1;
 #else
 static int really_do_swap_account __initdata = 0;
 #endif
 #else
 /* Without CONFIG_MEMCG_SWAP, do_swap_account is the compile-time constant 0. */
 #define do_swap_account		0
 #endif
 /*
  * Statistics for memory cgroup.
  *
  * The values index mem_cgroup_stat_cpu.count[]; MEM_CGROUP_STAT_NSTATS is
  * the size of that array and must stay last. mem_cgroup_stat_names[] lists
  * the matching names in the same order.
  */
 enum mem_cgroup_stat_index {
 	/*
 	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
 	 */
 	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
 	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
 	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
 	MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
 	MEM_CGROUP_STAT_NSTATS,
 };
 /*
  * Names of the statistics, listed in the same order as
  * enum mem_cgroup_stat_index (MEM_CGROUP_STAT_NSTATS has no entry).
  */
 static const char * const mem_cgroup_stat_names[] = {
 	"cache",
 	"rss",
 	"mapped_file",
 	"swap",
 };
 /*
  * Event counters kept per memcg. The values index
  * mem_cgroup_stat_cpu.events[]; MEM_CGROUP_EVENTS_NSTATS is the size of
  * that array and must stay last.
  */
 enum mem_cgroup_events_index {
 	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
 	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
 	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
 	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
 	MEM_CGROUP_EVENTS_NSTATS,
 };
 /*
  * Names of the event counters, listed in the same order as
  * enum mem_cgroup_events_index (MEM_CGROUP_EVENTS_NSTATS has no entry).
  */
 static const char * const mem_cgroup_events_names[] = {
 	"pgpgin",
 	"pgpgout",
 	"pgfault",
 	"pgmajfault",
 };
 /*
  * Per memcg event counter is incremented at every pagein/pageout. With THP,
  * it will be incremented by the number of pages. This counter is used
  * to trigger some periodic events. This is straightforward and better
  * than using jiffies etc. to handle periodic memcg events.
  *
  * Each target below indexes mem_cgroup_stat_cpu.targets[];
  * MEM_CGROUP_NTARGETS is the size of that array and must stay last.
  */
 enum mem_cgroup_events_target {
 	MEM_CGROUP_TARGET_THRESH,
 	MEM_CGROUP_TARGET_SOFTLIMIT,
 	MEM_CGROUP_TARGET_NUMAINFO,
 	MEM_CGROUP_NTARGETS,
 };
 /*
  * Re-arm distance of each events target, in page events: when a target
  * fires, mem_cgroup_event_ratelimit() sets the next trigger point this
  * many nr_page_events ahead of the current count.
  */
 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
 #define NUMAINFO_EVENTS_TARGET	1024
 /*
  * One set of memcg counters. struct mem_cgroup holds a __percpu instance
  * ("stat") plus a plain one ("nocpu_base").
  */
 struct mem_cgroup_stat_cpu {
 	/* indexed by enum mem_cgroup_stat_index; signed because uncharges add negative page counts */
 	long count[MEM_CGROUP_STAT_NSTATS];
 	/* indexed by enum mem_cgroup_events_index */
 	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
 	/* pages charged plus pages uncharged; compared against targets[] */
 	unsigned long nr_page_events;
 	/* nr_page_events value at which each events target fires next */
 	unsigned long targets[MEM_CGROUP_NTARGETS];
 };
 /*
  * Shared position of a reclaim hierarchy walk. struct mem_cgroup_per_zone
  * keeps one per reclaim priority, and mem_cgroup_iter() reads and advances
  * it when called with a reclaim cookie.
  */
 struct mem_cgroup_reclaim_iter {
 	/* css_id of the last scanned hierarchy member */
 	int position;
 	/* scan generation, increased every round-trip */
 	unsigned int generation;
 };
 /*
  * per-zone information in memory controller.
  */
 struct mem_cgroup_per_zone {
 	struct lruvec		lruvec;
 	/* pages on each LRU list; adjusted by mem_cgroup_update_lru_size() */
 	unsigned long		lru_size[NR_LRU_LISTS];
 	/* one shared walk position per reclaim priority, see mem_cgroup_iter() */
 	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
 	struct rb_node		tree_node;	/* RB tree node */
 	unsigned long long	usage_in_excess;/* Set to the value by which */
 						/* the soft limit is exceeded*/
 	/* true while tree_node is linked into the soft limit tree */
 	bool			on_tree;
 	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
 						/* use container_of	   */
 };
 /* Per-node container: one mem_cgroup_per_zone for every possible zone index. */
 struct mem_cgroup_per_node {
 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
 /* Per-memcg table of per-node info, indexed by node id (see mem_cgroup_zoneinfo()). */
 struct mem_cgroup_lru_info {
 	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
 };
 /*
  * Cgroups above their limits are maintained in a RB-Tree, independent of
  * their hierarchy representation
  */
 struct mem_cgroup_tree_per_zone {
 	/* nodes are mem_cgroup_per_zone.tree_node, ordered by usage_in_excess */
 	struct rb_root rb_root;
 	/* taken around insertions into and removals from rb_root */
 	spinlock_t lock;
 };
 /* One soft limit tree per zone index of a node. */
 struct mem_cgroup_tree_per_node {
 	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
 };
 /* Table of per-node soft limit trees, indexed by node id. */
 struct mem_cgroup_tree {
 	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
 };
 /* The single global instance, looked up via soft_limit_tree_node_zone(). */
 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 /* One registered threshold: a usage value paired with an eventfd context. */
 struct mem_cgroup_threshold {
 	/* NOTE(review): presumably signalled when usage crosses @threshold; the signalling code is outside this excerpt */
 	struct eventfd_ctx *eventfd;
 	u64 threshold;
 };
 /* For threshold */
 struct mem_cgroup_threshold_ary {
 	/* An array index points to threshold just below or equal to usage. */
 	int current_threshold;
 	/* Size of entries[] */
 	unsigned int size;
 	/*
 	 * Array of thresholds. Declared as a C99 flexible array member rather
 	 * than a GNU zero-length array: the struct size is unchanged, but the
 	 * compiler can now reject misuse such as placing it mid-struct.
 	 */
 	struct mem_cgroup_threshold entries[];
 };
 /*
  * A pair of threshold arrays. struct mem_cgroup embeds one pair for memory
  * usage and one for mem+swap usage.
  */
 struct mem_cgroup_thresholds {
 	/* Primary thresholds array */
 	struct mem_cgroup_threshold_ary *primary;
 	/*
 	 * Spare threshold array.
 	 * This is needed to make mem_cgroup_unregister_event() "never fail".
 	 * It must be able to store at least primary->size - 1 entries.
 	 */
 	struct mem_cgroup_threshold_ary *spare;
 };
 /* for OOM: one eventfd entry on a list (struct mem_cgroup has an oom_notify list head) */
 struct mem_cgroup_eventfd_list {
 	struct list_head list;
 	struct eventfd_ctx *eventfd;
 };
 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 /*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
  * to help the administrator determine what knobs to tune.
  *
  * TODO: Add a water mark for the memory controller. Reclaim will begin when
  * we hit the water mark. May be even add a low water mark, such that
  * no reclaim occurs from a cgroup at its low water mark, this is
  * a feature that will be implemented much later in the future.
  */
 struct mem_cgroup {
 	/* embedded cgroup state; container_of() on it recovers the mem_cgroup */
 	struct cgroup_subsys_state css;
 	/*
 	 * the counter to account for memory usage
 	 */
 	struct res_counter res;
 	union {
 		/*
 		 * the counter to account for mem+swap usage.
 		 */
 		struct res_counter memsw;
 		/*
 		 * rcu_freeing is used only when freeing struct mem_cgroup,
 		 * so put it into a union to avoid wasting more memory.
 		 * It must be disjoint from the css field.  It could be
 		 * in a union with the res field, but res plays a much
 		 * larger part in mem_cgroup life than memsw, and might
 		 * be of interest, even at time of free, when debugging.
 		 * So share rcu_head with the less interesting memsw.
 		 */
 		struct rcu_head rcu_freeing;
 		/*
 		 * We also need some space for a worker in deferred freeing.
 		 * By the time we call it, rcu_freeing is no longer in use.
 		 */
 		struct work_struct work_freeing;
 	};
 	/*
 	 * Per cgroup active and inactive list, similar to the
 	 * per zone LRU lists.
 	 */
 	struct mem_cgroup_lru_info info;
 	int last_scanned_node;
 #if MAX_NUMNODES > 1
 	nodemask_t	scan_nodes;
 	/* incremented from memcg_check_events() when the NUMAINFO target fires */
 	atomic_t	numainfo_events;
 	atomic_t	numainfo_updating;
 #endif
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
 	bool use_hierarchy;
 	bool		oom_lock;
 	atomic_t	under_oom;
 	atomic_t	refcnt;
 	int	swappiness;
 	/* OOM-Killer disable */
 	int		oom_kill_disable;
 	/* set when res.limit == memsw.limit */
 	bool		memsw_is_minimum;
 	/* protect arrays of thresholds */
 	struct mutex thresholds_lock;
 	/* thresholds for memory usage. RCU-protected */
 	struct mem_cgroup_thresholds thresholds;
 	/* thresholds for mem+swap usage. RCU-protected */
 	struct mem_cgroup_thresholds memsw_thresholds;
 	/* For oom notifier event fd */
 	struct list_head oom_notify;
 	/*
 	 * Should we move charges of a task when a task is moved into this
 	 * mem_cgroup ? And what type of charges should we move ?
 	 */
 	unsigned long 	move_charge_at_immigrate;
 	/*
 	 * set > 0 if pages under this cgroup are moving to other cgroup.
 	 */
 	atomic_t	moving_account;
 	/* taken only while moving_account > 0 */
 	spinlock_t	move_lock;
 	/*
 	 * percpu counter.
 	 */
 	struct mem_cgroup_stat_cpu __percpu *stat;
 	/*
 	 * used when a cpu is offlined or other synchronizations
 	 * See mem_cgroup_read_stat().
 	 */
 	struct mem_cgroup_stat_cpu nocpu_base;
 	/* held while reading nocpu_base in mem_cgroup_read_stat()/mem_cgroup_read_events() */
 	spinlock_t pcp_counter_lock;
 #ifdef CONFIG_INET
 	struct tcp_memcontrol tcp_mem;
 #endif
 };
 /* Definitions for moving charges at task migration. */
 /*
  * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
  * left-shifted bitmap of these types: move_anon() and move_file() test the
  * bit numbered by the matching enumerator.
  */
 enum move_type {
 	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
 	MOVE_CHARGE_TYPE_FILE,	/* file page(including tmpfs) and swap of it */
 	NR_MOVE_TYPE,
 };
 /*
  * "mc" and its members are protected by cgroup_mutex.
  * Single global record of a charge move; move_anon()/move_file() read the
  * destination's flags through mc.to.
  */
 static struct move_charge_struct {
 	spinlock_t	  lock; /* for from, to */
 	struct mem_cgroup *from;
 	struct mem_cgroup *to;
 	unsigned long precharge;
 	unsigned long moved_charge;
 	unsigned long moved_swap;
 	struct task_struct *moving_task;	/* a task moving charges */
 	wait_queue_head_t waitq;		/* a waitq for other context */
 } mc = {
 	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
 	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 };
 /*
  * Report whether the destination memcg of the current move (mc.to) has the
  * MOVE_CHARGE_TYPE_ANON bit set in move_charge_at_immigrate.
  */
 static bool move_anon(void)
 {
 	unsigned long *flags = &mc.to->move_charge_at_immigrate;
 	return test_bit(MOVE_CHARGE_TYPE_ANON, flags);
 }
 /*
  * Report whether the destination memcg of the current move (mc.to) has the
  * MOVE_CHARGE_TYPE_FILE bit set in move_charge_at_immigrate.
  */
 static bool move_file(void)
 {
 	unsigned long *flags = &mc.to->move_charge_at_immigrate;
 	return test_bit(MOVE_CHARGE_TYPE_FILE, flags);
 }
 /*
  * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
  * limit reclaim to prevent infinite loops, if they ever occur.
  * NOTE(review): the code that consumes these limits is outside this excerpt.
  */
 #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
 #define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
 /*
  * Kind of charge being made or dropped. NR_CHARGE_TYPE counts the
  * enumerators and must stay last.
  */
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 	MEM_CGROUP_CHARGE_TYPE_ANON,
 	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
 	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
 	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
 	NR_CHARGE_TYPE,
 };
 /*
  * Encoding of cft->private for control files: the file type lives in the
  * upper 16 bits and the attribute in the lower 16 bits.
  */
 #define _MEM			(0)
 #define _MEMSWAP		(1)
 #define _OOM_TYPE		(2)
 /* Pack type @x and attribute @val into one value. */
 #define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
 /* Extract the type packed by MEMFILE_PRIVATE(). */
 #define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
 /* Extract the attribute packed by MEMFILE_PRIVATE(). */
 #define MEMFILE_ATTR(val)	((val) & 0xffff)
 /* Used for OOM notifier */
 #define OOM_CONTROL		(0)
 /*
  * Reclaim flags for mem_cgroup_hierarchical_reclaim.
  * Each *_BIT macro is a bit position; the unsuffixed macro is the mask.
  */
 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0
 #define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
 #define MEM_CGROUP_RECLAIM_SHRINK_BIT	1
 #define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
 static void mem_cgroup_get(struct mem_cgroup *memcg);
 static void mem_cgroup_put(struct mem_cgroup *memcg);
 /* Writing them here to avoid exposing memcg's inner layout */
 #ifdef CONFIG_MEMCG_KMEM
 #include <net/sock.h>
 #include <net/ip.h>
 static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
 /*
  * Attach @sk to memcg socket accounting.
  *
  * Does nothing unless mem_cgroup_sockets_enabled. A socket that already has
  * sk_cgrp set only gains an extra reference on that memcg. Otherwise the
  * current task's memcg is looked up under RCU and, when it is not the root
  * and its cg_proto is active, a reference is taken and sk->sk_cgrp is set.
  */
 void sock_update_memcg(struct sock *sk)
 {
 	struct mem_cgroup *memcg;
 	struct cg_proto *cg_proto;
 	if (!mem_cgroup_sockets_enabled)
 		return;
 	BUG_ON(!sk->sk_prot->proto_cgroup);
 	/*
 	 * Socket cloning can bring us here with sk_cgrp already filled, and
 	 * not necessarily from process context, so testing the current
 	 * task's memcg for root would not help. Respecting the original
 	 * socket's memcg is the better decision in this case.
 	 */
 	if (sk->sk_cgrp) {
 		BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
 		mem_cgroup_get(sk->sk_cgrp->memcg);
 		return;
 	}
 	rcu_read_lock();
 	memcg = mem_cgroup_from_task(current);
 	cg_proto = sk->sk_prot->proto_cgroup(memcg);
 	/* Keep the root test first: it short-circuits before cg_proto is inspected. */
 	if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
 		mem_cgroup_get(memcg);
 		sk->sk_cgrp = cg_proto;
 	}
 	rcu_read_unlock();
 }
 EXPORT_SYMBOL(sock_update_memcg);
 /*
  * Drop the memcg reference held through @sk->sk_cgrp, if socket accounting
  * is enabled and the socket has one. Warns when sk_cgrp has no memcg.
  */
 void sock_release_memcg(struct sock *sk)
 {
 	struct mem_cgroup *memcg;
 	if (!mem_cgroup_sockets_enabled || !sk->sk_cgrp)
 		return;
 	memcg = sk->sk_cgrp->memcg;
 	WARN_ON(!memcg);
 	mem_cgroup_put(memcg);
 }
 #ifdef CONFIG_INET
 /*
  * Return the TCP cg_proto embedded in @memcg, or NULL when @memcg is NULL
  * or is the root memcg.
  */
 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 {
 	if (memcg && !mem_cgroup_is_root(memcg))
 		return &memcg->tcp_mem.cg_proto;
 	return NULL;
 }
 EXPORT_SYMBOL(tcp_proto_cgroup);
 #endif /* CONFIG_INET */
 #endif /* CONFIG_MEMCG_KMEM */
 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
 /*
  * Decrement the memcg_socket_limit_enabled static key when
  * memcg_proto_activated() reports true for @memcg's TCP cg_proto.
  */
 static void disarm_sock_keys(struct mem_cgroup *memcg)
 {
 	if (memcg_proto_activated(&memcg->tcp_mem.cg_proto))
 		static_key_slow_dec(&memcg_socket_limit_enabled);
 }
 #else
 /* No socket accounting configured: nothing to disarm. */
 static void disarm_sock_keys(struct mem_cgroup *memcg)
 {
 }
 #endif
 static void drain_all_stock_async(struct mem_cgroup *memcg);
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
 {
 	return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
 }
 /* Return the cgroup_subsys_state embedded in @memcg. */
 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
 {
 	struct cgroup_subsys_state *css = &memcg->css;
 	return css;
 }
 /* Return @memcg's per-zone info for the node and zone that @page sits in. */
 static struct mem_cgroup_per_zone *
 page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 {
 	return mem_cgroup_zoneinfo(memcg, page_to_nid(page),
 				   page_zonenum(page));
 }
 static struct mem_cgroup_tree_per_zone *
 soft_limit_tree_node_zone(int nid, int zid)
 {
 	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
 }
 /* Return the global soft limit tree for the node and zone of @page. */
 static struct mem_cgroup_tree_per_zone *
 soft_limit_tree_from_page(struct page *page)
 {
 	return soft_limit_tree_node_zone(page_to_nid(page),
 					 page_zonenum(page));
 }
 /*
  * Link @mz into the soft limit tree @mctz, keyed by @new_usage_in_excess.
  *
  * Does nothing if @mz is already on the tree. Otherwise the new excess is
  * recorded in @mz first; a zero excess is recorded but not inserted.
  * This function takes no lock itself; mem_cgroup_update_tree() calls it
  * with mctz->lock held.
  */
 static void
 __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
 				struct mem_cgroup_per_zone *mz,
 				struct mem_cgroup_tree_per_zone *mctz,
 				unsigned long long new_usage_in_excess)
 {
 	struct rb_node **link = &mctz->rb_root.rb_node;
 	struct rb_node *parent = NULL;
 	if (mz->on_tree)
 		return;
 	mz->usage_in_excess = new_usage_in_excess;
 	if (!new_usage_in_excess)
 		return;
 	while (*link) {
 		struct mem_cgroup_per_zone *cur;
 		parent = *link;
 		cur = rb_entry(parent, struct mem_cgroup_per_zone, tree_node);
 		/*
 		 * Smaller excess descends left. Equal excess cannot be
 		 * avoided, so it descends right with the larger ones.
 		 */
 		if (mz->usage_in_excess < cur->usage_in_excess)
 			link = &parent->rb_left;
 		else
 			link = &parent->rb_right;
 	}
 	rb_link_node(&mz->tree_node, parent, link);
 	rb_insert_color(&mz->tree_node, &mctz->rb_root);
 	mz->on_tree = true;
 }
 /*
  * Unlink @mz from the soft limit tree @mctz if it is currently on it.
  * This function takes no lock itself; mem_cgroup_remove_exceeded() is the
  * variant that takes mctz->lock.
  */
 static void
 __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
 				struct mem_cgroup_per_zone *mz,
 				struct mem_cgroup_tree_per_zone *mctz)
 {
 	if (mz->on_tree) {
 		rb_erase(&mz->tree_node, &mctz->rb_root);
 		mz->on_tree = false;
 	}
 }
 static void
 mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
 				struct mem_cgroup_per_zone *mz,
 				struct mem_cgroup_tree_per_zone *mctz)
 {
 	spin_lock(&mctz->lock);
 	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
 	spin_unlock(&mctz->lock);
 }
 /*
  * Refresh the soft limit tree position of @memcg and all of its ancestors
  * for the node/zone that @page belongs to.
  *
  * Ancestors have to be walked when hierarchy is used, because their own
  * event counters are not touched by the charge that got us here.
  */
 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 {
 	struct mem_cgroup_tree_per_zone *mctz = soft_limit_tree_from_page(page);
 	int nid = page_to_nid(page);
 	int zid = page_zonenum(page);
 	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
 		struct mem_cgroup_per_zone *mz;
 		unsigned long long excess;
 		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 		excess = res_counter_soft_limit_excess(&memcg->res);
 		/* Nothing to do unless mz is on the tree or over its soft limit. */
 		if (!excess && !mz->on_tree)
 			continue;
 		spin_lock(&mctz->lock);
 		/* The helper is a no-op when mz is not on the tree. */
 		__mem_cgroup_remove_exceeded(memcg, mz, mctz);
 		/*
 		 * Re-insert at the right place; usage_in_excess is updated.
 		 * With a zero excess no tree operation happens.
 		 */
 		__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
 		spin_unlock(&mctz->lock);
 	}
 }
 /* Unlink @memcg's per-zone entries from every node/zone soft limit tree. */
 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 {
 	int nid, zid;
 	for_each_node(nid) {
 		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 			struct mem_cgroup_per_zone *mz;
 			struct mem_cgroup_tree_per_zone *mctz;
 			mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 			mctz = soft_limit_tree_node_zone(nid, zid);
 			mem_cgroup_remove_exceeded(memcg, mz, mctz);
 		}
 	}
 }
 /*
  * Detach and return the entry with the largest soft limit excess in @mctz.
  *
  * Entries are taken from the rightmost end of the tree. An entry whose memcg
  * is no longer over its soft limit, or whose css reference cannot be taken,
  * stays detached and the next rightmost entry is tried. Returns NULL when
  * the tree runs empty; otherwise the returned entry's memcg holds a css
  * reference obtained with css_tryget(). This function takes no lock itself.
  */
 static struct mem_cgroup_per_zone *
 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 {
 	for (;;) {
 		struct mem_cgroup_per_zone *mz;
 		struct rb_node *last = rb_last(&mctz->rb_root);
 		if (!last)
 			return NULL;	/* Nothing to reclaim from */
 		mz = rb_entry(last, struct mem_cgroup_per_zone, tree_node);
 		/*
 		 * Remove the node now; someone else may add it back, and it
 		 * is re-added at its correct position at the end of reclaim.
 		 */
 		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
 		if (res_counter_soft_limit_excess(&mz->memcg->res) &&
 		    css_tryget(&mz->memcg->css))
 			return mz;
 	}
 }
 /* Locked wrapper around __mem_cgroup_largest_soft_limit_node(). */
 static struct mem_cgroup_per_zone *
 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 {
 	struct mem_cgroup_per_zone *victim;
 	spin_lock(&mctz->lock);
 	victim = __mem_cgroup_largest_soft_limit_node(mctz);
 	spin_unlock(&mctz->lock);
 	return victim;
 }
 /*
  * Implementation Note: reading percpu statistics for memcg.
  *
  * Both of vmstat[] and percpu_counter has threshold and do periodic
  * synchronization to implement "quick" read. There are trade-off between
  * reading cost and precision of value. Then, we may have a chance to implement
  * a periodic synchronizion of counter in memcg's counter.
  *
  * But this _read() function is used for user interface now. The user accounts
  * memory usage by memory cgroup and he _always_ requires exact value because
  * he accounts memory. Even if we provide quick-and-fuzzy read, we always
  * have to visit all online cpus and make sum. So, for now, unnecessary
  * synchronization is not implemented. (just implemented for cpu hotplug)
  *
  * If there are kernel internal actions which can make use of some not-exact
  * value, and reading all cpu value can be performance bottleneck in some
  * common workload, threashold and synchonization as vmstat[] should be
  * implemented.
  */
 static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
 				 enum mem_cgroup_stat_index idx)
 {
 	long val = 0;
 	int cpu;
 	get_online_cpus();
 	for_each_online_cpu(cpu)
 		val += per_cpu(memcg->stat->count[idx], cpu);
 #ifdef CONFIG_HOTPLUG_CPU
 	spin_lock(&memcg->pcp_counter_lock);
 	val += memcg->nocpu_base.count[idx];
 	spin_unlock(&memcg->pcp_counter_lock);
 #endif
 	put_online_cpus();
 	return val;
 }
 /* Add 1 (@charge true) or -1 (@charge false) to this cpu's swap counter of @memcg. */
 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
 					 bool charge)
 {
 	int delta;
 	if (charge)
 		delta = 1;
 	else
 		delta = -1;
 	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], delta);
 }
 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 					    enum mem_cgroup_events_index idx)
 {
 	unsigned long val = 0;
 	int cpu;
 	for_each_online_cpu(cpu)
 		val += per_cpu(memcg->stat->events[idx], cpu);
 #ifdef CONFIG_HOTPLUG_CPU
 	spin_lock(&memcg->pcp_counter_lock);
 	val += memcg->nocpu_base.events[idx];
 	spin_unlock(&memcg->pcp_counter_lock);
 #endif
 	return val;
 }
 /*
  * Account a charge (@nr_pages > 0) or uncharge (@nr_pages <= 0) in this
  * cpu's counters of @memcg.
  *
  * @anon selects the RSS counter, otherwise the CACHE counter is used. One
  * PGPGIN or PGPGOUT event is recorded per call, and nr_page_events advances
  * by the absolute number of pages. Runs with preemption disabled.
  */
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 					 bool anon, int nr_pages)
 {
 	/*
 	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
 	 * counted as CACHE even if it's on ANON LRU.
 	 */
 	enum mem_cgroup_stat_index idx = anon ? MEM_CGROUP_STAT_RSS :
 						MEM_CGROUP_STAT_CACHE;
 	preempt_disable();
 	__this_cpu_add(memcg->stat->count[idx], nr_pages);
 	/* pagein of a big page is a single event, so the page size is ignored */
 	if (nr_pages > 0) {
 		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
 	} else {
 		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
 		/* the event counter below wants a magnitude */
 		nr_pages = -nr_pages;
 	}
 	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
 	preempt_enable();
 }
 /*
  * Return the recorded size of list @lru for the mem_cgroup_per_zone that
  * embeds @lruvec.
  */
 unsigned long
 mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
 	return container_of(lruvec, struct mem_cgroup_per_zone,
 			    lruvec)->lru_size[lru];
 }
 /*
  * Sum the sizes of the LRU lists selected by @lru_mask (one bit per
  * enum lru_list value) for @memcg in node @nid, zone index @zid.
  */
 static unsigned long
 mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
 			unsigned int lru_mask)
 {
 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 	unsigned long nr = 0;
 	enum lru_list lru;
 	for_each_lru(lru) {
 		if (!(BIT(lru) & lru_mask))
 			continue;
 		nr += mz->lru_size[lru];
 	}
 	return nr;
 }
 /* Sum mem_cgroup_zone_nr_lru_pages() over every zone index of node @nid. */
 static unsigned long
 mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 			int nid, unsigned int lru_mask)
 {
 	u64 sum = 0;
 	int zid = 0;
 	while (zid < MAX_NR_ZONES) {
 		sum += mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, lru_mask);
 		zid++;
 	}
 	return sum;
 }
 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
 			unsigned int lru_mask)
 {
 	int nid;
 	u64 total = 0;
 	for_each_node_state(nid, N_HIGH_MEMORY)
 		total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
 	return total;
 }
 /*
  * Check whether this cpu's page event count of @memcg has passed @target.
  *
  * When it has, the target is re-armed a fixed number of events ahead of the
  * current count and true is returned; otherwise returns false. Uses the
  * __this_cpu accessors; memcg_check_events() calls it with preemption
  * disabled.
  */
 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 				       enum mem_cgroup_events_target target)
 {
 	unsigned long now = __this_cpu_read(memcg->stat->nr_page_events);
 	unsigned long next = __this_cpu_read(memcg->stat->targets[target]);
 	/* wrap-safe comparison, from time_after() in jiffies.h */
 	if ((long)next - (long)now >= 0)
 		return false;
 	switch (target) {
 	case MEM_CGROUP_TARGET_THRESH:
 		next = now + THRESHOLDS_EVENTS_TARGET;
 		break;
 	case MEM_CGROUP_TARGET_SOFTLIMIT:
 		next = now + SOFTLIMIT_EVENTS_TARGET;
 		break;
 	case MEM_CGROUP_TARGET_NUMAINFO:
 		next = now + NUMAINFO_EVENTS_TARGET;
 		break;
 	default:
 		break;
 	}
 	__this_cpu_write(memcg->stat->targets[target], next);
 	return true;
 }
 /*
  * Check events in order.
  *
  * The threshold target is the finest grained one. Only when it fires are
  * the soft limit and (on multi-node builds) NUMA info targets consulted.
  * All rate-limit checks run with preemption disabled; the resulting
  * actions run after preemption is re-enabled.
  */
 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 {
 	bool do_softlimit;
 	bool do_numainfo __maybe_unused;
 	preempt_disable();
 	if (likely(!mem_cgroup_event_ratelimit(memcg,
 					       MEM_CGROUP_TARGET_THRESH))) {
 		preempt_enable();
 		return;
 	}
 	do_softlimit = mem_cgroup_event_ratelimit(memcg,
 					MEM_CGROUP_TARGET_SOFTLIMIT);
 #if MAX_NUMNODES > 1
 	do_numainfo = mem_cgroup_event_ratelimit(memcg,
 					MEM_CGROUP_TARGET_NUMAINFO);
 #endif
 	preempt_enable();
 	mem_cgroup_threshold(memcg);
 	if (unlikely(do_softlimit))
 		mem_cgroup_update_tree(memcg, page);
 #if MAX_NUMNODES > 1
 	if (unlikely(do_numainfo))
 		atomic_inc(&memcg->numainfo_events);
 #endif
 }
 /* Map cgroup @cont to the mem_cgroup embedding its memory subsystem state. */
 struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 {
 	struct cgroup_subsys_state *css;
 	css = cgroup_subsys_state(cont, mem_cgroup_subsys_id);
 	return container_of(css, struct mem_cgroup, css);
 }
 /*
  * Map task @p to the mem_cgroup embedding its memory subsystem state.
  * Returns NULL when @p is NULL.
  */
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 {
 	struct cgroup_subsys_state *css;
 	/*
 	 * mm_update_next_owner() may clear mm->owner to NULL if it races
 	 * with swapoff, page migration, etc., so callers passing mm->owner
 	 * can hand us p == NULL.
 	 */
 	if (unlikely(!p))
 		return NULL;
 	css = task_subsys_state(p, mem_cgroup_subsys_id);
 	return container_of(css, struct mem_cgroup, css);
 }
 /*
  * Look up the memcg of @mm's owner and take a css reference on it.
  *
  * Returns NULL when @mm is NULL or no memcg can be found for the owner;
  * otherwise the returned memcg carries a reference obtained with
  * css_tryget().
  */
 struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
 	struct mem_cgroup *memcg;
 	if (!mm)
 		return NULL;
 	/*
 	 * No locks are held, so mm->owner may be in the middle of moving to
 	 * another cgroup. Retry with css_tryget() rather than adding locks,
 	 * even if that looks pessimistic.
 	 */
 	rcu_read_lock();
 	for (;;) {
 		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
 		if (unlikely(!memcg))
 			break;
 		if (css_tryget(&memcg->css))
 			break;
 	}
 	rcu_read_unlock();
 	return memcg;
 }
 /**
  * mem_cgroup_iter - iterate over memory cgroup hierarchy
  * @root: hierarchy root
  * @prev: previously returned memcg, NULL on first invocation
  * @reclaim: cookie for shared reclaim walks, NULL for full walks
  *
  * Returns references to children of the hierarchy below @root, or
  * @root itself, or %NULL after a full round-trip.
  *
  * Caller must pass the return value in @prev on subsequent
  * invocations for reference counting, or use mem_cgroup_iter_break()
  * to cancel a hierarchy walk before the round-trip is complete.
  *
  * Reclaimers can specify a zone and a priority level in @reclaim to
  * divide up the memcgs in the hierarchy among all concurrent
  * reclaimers operating on the same zone and priority.
  */
 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				   struct mem_cgroup *prev,
 				   struct mem_cgroup_reclaim_cookie *reclaim)
 {
 	struct mem_cgroup *memcg = NULL;
 	int id = 0;
 	if (mem_cgroup_disabled())
 		return NULL;
 	/* a NULL root means "walk from root_mem_cgroup" */
 	if (!root)
 		root = root_mem_cgroup;
 	/* full walk: continue after @prev's css id; read it before the put below */
 	if (prev && !reclaim)
 		id = css_id(&prev->css);
 	/* drop the reference held on @prev; @root is returned without one (see below) */
 	if (prev && prev != root)
 		css_put(&prev->css);
 	/* a non-hierarchical root other than root_mem_cgroup is the only member of its walk */
 	if (!root->use_hierarchy && root != root_mem_cgroup) {
 		if (prev)
 			return NULL;
 		return root;
 	}
 	while (!memcg) {
 		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
 		struct cgroup_subsys_state *css;
 		if (reclaim) {
 			int nid = zone_to_nid(reclaim->zone);
 			int zid = zone_idx(reclaim->zone);
 			struct mem_cgroup_per_zone *mz;
 			/* shared walk: the position lives in @root's per-zone, per-priority iterator */
 			mz = mem_cgroup_zoneinfo(root, nid, zid);
 			iter = &mz->reclaim_iter[reclaim->priority];
 			/* the shared iterator moved on to another round-trip: end this walk */
 			if (prev && reclaim->generation != iter->generation)
 				return NULL;
 			id = iter->position;
 		}
 		rcu_read_lock();
 		css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
 		if (css) {
 			/* no reference is taken on @root itself; others need a successful css_tryget() */
 			if (css == &root->css || css_tryget(css))
 				memcg = container_of(css,
 						     struct mem_cgroup, css);
 		} else
 			id = 0;
 		rcu_read_unlock();
 		if (reclaim) {
 			iter->position = id;
 			/* ran off the end: a round-trip is complete */
 			if (!css)
 				iter->generation++;
 			/* first call of a walk: remember which generation it belongs to */
 			else if (!prev && memcg)
 				reclaim->generation = iter->generation;
 		}
 		/* end of the hierarchy reached in a continued walk */
 		if (prev && !css)
 			return NULL;
 	}
 	return memcg;
 }
 /**
  * mem_cgroup_iter_break - abort a hierarchy walk prematurely
  * @root: hierarchy root
  * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
  *
  * Drops the css reference held on @prev, unless @prev is NULL or is the
  * walk's root (a NULL @root stands for root_mem_cgroup).
  */
 void mem_cgroup_iter_break(struct mem_cgroup *root,
 			   struct mem_cgroup *prev)
 {
 	if (!prev)
 		return;
 	if (!root)
 		root = root_mem_cgroup;
 	if (prev != root)
 		css_put(&prev->css);
 }
 /*
  * Iteration constructs for visiting all cgroups (under a tree).  If
  * loops are exited prematurely (break), mem_cgroup_iter_break() must
  * be used for reference counting.
  *
  * for_each_mem_cgroup_tree() does a full (non-reclaim) walk below @root;
  * for_each_mem_cgroup() passes a NULL root, which mem_cgroup_iter()
  * replaces with root_mem_cgroup.
  */
 #define for_each_mem_cgroup_tree(iter, root)		\
 	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
 	     iter != NULL;				\
 	     iter = mem_cgroup_iter(root, iter, NULL))
 #define for_each_mem_cgroup(iter)			\
 	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
 	     iter != NULL;				\
 	     iter = mem_cgroup_iter(NULL, iter, NULL))
 /* True when @memcg is the root memory cgroup. */
 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 {
 	return root_mem_cgroup == memcg;
 }
 /*
  * Count a page fault event against the memcg of @mm's owner.
  *
  * Only PGFAULT and PGMAJFAULT are accepted for @idx; any other value hits
  * BUG(). Does nothing when @mm is NULL or no memcg is found for the owner.
  */
 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 {
 	struct mem_cgroup *memcg;
 	if (!mm)
 		return;
 	rcu_read_lock();
 	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
 	if (likely(memcg)) {
 		switch (idx) {
 		case PGFAULT:
 			this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
 			break;
 		case PGMAJFAULT:
 			this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
 			break;
 		default:
 			BUG();
 		}
 	}
 	rcu_read_unlock();
 }
 EXPORT_SYMBOL(mem_cgroup_count_vm_event);
 /**
  * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
  * @zone: zone of the wanted lruvec
  * @memcg: memcg of the wanted lruvec
  *
  * Returns the lru list vector holding pages for the given @zone and
  * @mem.  This can be the global zone lruvec, if the memory controller
  * is disabled.
  */
 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
 				      struct mem_cgroup *memcg)
 {
 	struct mem_cgroup_per_zone *mz;
 	if (mem_cgroup_disabled())
 		return &zone->lruvec;
 	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
 	return &mz->lruvec;
 }
 /*
  * Following LRU functions are allowed to be used without PCG_LOCK.
  * Operations are called by routine of global LRU independently from memcg.
  * What we have to take care of here is the validity of pc->mem_cgroup.
  *
  * Changes to pc->mem_cgroup happens when
  * 1. charge
  * 2. moving account
  * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
  * It is added to LRU before charge.
  * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
  * When moving account, the page is not on LRU. It's isolated.
  */
 /**
  * mem_cgroup_page_lruvec - return lruvec for adding an lru page
  * @page: the page
  * @zone: zone of the page
  *
  * Returns the global @zone lruvec when the memory controller is disabled,
  * otherwise the lruvec of the page's memcg for the page's node and zone.
  * May rewrite pc->mem_cgroup to root_mem_cgroup as a side effect (see the
  * comment in the body).
  */
 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
 {
 	struct mem_cgroup_per_zone *mz;
 	struct mem_cgroup *memcg;
 	struct page_cgroup *pc;
 	if (mem_cgroup_disabled())
 		return &zone->lruvec;
 	pc = lookup_page_cgroup(page);
 	memcg = pc->mem_cgroup;
 	/*
 	 * Surreptitiously switch any uncharged offlist page to root:
 	 * an uncharged page off lru does nothing to secure
 	 * its former mem_cgroup from sudden removal.
 	 *
 	 * Our caller holds lru_lock, and PageCgroupUsed is updated
 	 * under page_cgroup lock: between them, they make all uses
 	 * of pc->mem_cgroup safe.
 	 */
 	if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
 		pc->mem_cgroup = memcg = root_mem_cgroup;
 	/* resolve against the (possibly just switched) memcg */
 	mz = page_cgroup_zoneinfo(memcg, page);
 	return &mz->lruvec;
 }
 /**
  * mem_cgroup_update_lru_size - account for adding or removing an lru page
  * @lruvec: mem_cgroup per zone lru vector
  * @lru: index of lru list the page is sitting on
  * @nr_pages: positive when adding or negative when removing
  *
  * This function must be called when a page is added to or removed from an
  * lru list. It does nothing when the memory controller is disabled.
  */
 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 				int nr_pages)
 {
 	struct mem_cgroup_per_zone *mz;
 	if (mem_cgroup_disabled())
 		return;
 	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
 	mz->lru_size[lru] += nr_pages;
 	/* a size that reads negative as a signed value means the accounting underflowed */
 	VM_BUG_ON((long)(mz->lru_size[lru]) < 0);
 }
 /*
  * Check whether @memcg is @root_memcg itself or lies in @root_memcg's
  * hierarchy subtree. A subtree match requires @root_memcg to use hierarchy.
  * This function takes no RCU lock itself; mem_cgroup_same_or_subtree() is
  * the variant that wraps it in rcu_read_lock().
  */
 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
 				  struct mem_cgroup *memcg)
 {
 	/* identity is checked first, before either pointer is dereferenced */
 	if (root_memcg == memcg)
 		return true;
 	if (root_memcg->use_hierarchy && memcg)
 		return css_is_ancestor(&memcg->css, &root_memcg->css);
 	return false;
 }
 /* RCU-protected wrapper around __mem_cgroup_same_or_subtree(). */
 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
 				       struct mem_cgroup *memcg)
 {
 	bool in_subtree;
 	rcu_read_lock();
 	in_subtree = __mem_cgroup_same_or_subtree(root_memcg, memcg);
 	rcu_read_unlock();
 	return in_subtree;
 }
 /*
  * task_in_mem_cgroup - test whether @task is charged to @memcg or to a
  * group in its hierarchy
  *
  * Returns the result of mem_cgroup_same_or_subtree(@memcg, <task's memcg>),
  * or 0 when no memcg could be found for the task.
  */
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
 {
 	struct task_struct *mm_task = find_lock_task_mm(task);
 	struct mem_cgroup *task_memcg;
 	int match;

 	if (mm_task) {
 		task_memcg = try_get_mem_cgroup_from_mm(mm_task->mm);
 		task_unlock(mm_task);
 	} else {
 		/*
 		 * All threads may have already detached their mm's, but the oom
 		 * killer still needs to detect if they have already been oom
 		 * killed to prevent needlessly killing additional tasks.
 		 */
 		task_lock(task);
 		task_memcg = mem_cgroup_from_task(task);
 		if (task_memcg)
 			css_get(&task_memcg->css);
 		task_unlock(task);
 	}

 	if (!task_memcg)
 		return 0;

 	/*
 	 * The use_hierarchy flag that matters is the one of "memcg", not of
 	 * the task's group: testing the latter would report a match whenever
 	 * the task's group has hierarchy enabled and is a child of "memcg"
 	 * in the *cgroup* hierarchy, even if use_hierarchy is disabled in
 	 * "memcg".
 	 */
 	match = mem_cgroup_same_or_subtree(memcg, task_memcg);
 	css_put(&task_memcg->css);

 	return match;
 }
 /*
  * Returns non-zero when the inactive anon list of @lruvec, scaled by a
  * ratio, is smaller than the active anon list.  The ratio is
  * int_sqrt(10 * gb), where gb is the combined size of both lists
  * shifted right by (30 - PAGE_SHIFT), and 1 when that shift yields 0.
  */
 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
 {
 	unsigned long nr_inactive =
 		mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
 	unsigned long nr_active =
 		mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
 	unsigned long size_gb = (nr_inactive + nr_active) >> (30 - PAGE_SHIFT);
 	unsigned long ratio = size_gb ? int_sqrt(10 * size_gb) : 1;

 	return nr_inactive * ratio < nr_active;
 }
 /*
  * Returns non-zero when the active file list of @lruvec holds more pages
  * than its inactive file list.
  */
 int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
 {
 	unsigned long nr_inactive =
 		mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
 	unsigned long nr_active =
 		mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);

 	return nr_active > nr_inactive;
 }
 /*
  * Convert a pointer to a res_counter that is embedded in struct mem_cgroup
  * as field @member back into a pointer to the containing mem_cgroup.
  */
 #define mem_cgroup_from_res_counter(counter, member)	\
 	container_of(counter, struct mem_cgroup, member)
 /**
  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
  * @memcg: the memory cgroup
  *
  * Returns the maximum amount of memory @memcg can be charged with, in
  * pages.  When swap accounting is on, the smaller of the memory margin
  * and the memory+swap margin is used.
  */
 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
 {
 	unsigned long long room = res_counter_margin(&memcg->res);

 	if (do_swap_account) {
 		unsigned long long memsw_room =
 			res_counter_margin(&memcg->memsw);

 		if (memsw_room < room)
 			room = memsw_room;
 	}

 	/* the margins are shifted down by PAGE_SHIFT to yield pages */
 	return room >> PAGE_SHIFT;
 }
 int mem_cgroup_swappiness(struct mem_cgroup *memcg)
 {
 	struct cgroup *cgrp = memcg->css.cgroup;
 	/* root ? */
 	if (cgrp->parent == NULL)
 		return vm_swappiness;
 	return memcg->swappiness;
 }
 /*
  * memcg->moving_account is used for checking possibility that some thread is
  * calling move_account(). When a thread on CPU-A starts moving pages under
  * a memcg, other threads should check memcg->moving_account under
  * rcu_read_lock(), like this:
  *
  *         CPU-A                                    CPU-B
  *                                              rcu_read_lock()
  *         memcg->moving_account+1              if (memcg->moving_account)
  *                                                   take heavy locks.
  *         synchronize_rcu()                    update something.
  *                                              rcu_read_unlock()
  *         start move here.
  */
 /*
  * For quick checking without looking up memcg: incremented in
  * mem_cgroup_start_move() and decremented in mem_cgroup_end_move().
  */
 atomic_t memcg_moving __read_mostly;
 /*
  * Mark the start of a move involving @memcg: raise the global
  * memcg_moving count and @memcg->moving_account, then wait in
  * synchronize_rcu() before returning to the caller.
  */
 static void mem_cgroup_start_move(struct mem_cgroup *memcg)
 {
 	atomic_inc(&memcg_moving);
 	atomic_inc(&memcg->moving_account);
 	/* both counters are raised before the grace period is waited for */
 	synchronize_rcu();
 }
 /*
  * Undo mem_cgroup_start_move(): drop the global memcg_moving count and
  * @memcg->moving_account.  A NULL @memcg is accepted and ignored.
  */
 static void mem_cgroup_end_move(struct mem_cgroup *memcg)
 {
 	/*
 	 * mem_cgroup_clear_mc() may call this function with NULL; the
 	 * check lives here in the callee rather than in every caller.
 	 */
 	if (!memcg)
 		return;

 	atomic_dec(&memcg_moving);
 	atomic_dec(&memcg->moving_account);
 }
 /*
  * 2 routines for checking whether "mem" is under move_account() or not.
  *
  * mem_cgroup_stolen() -  checking whether a cgroup is mc.from or not. This
  *			  is used for avoiding races in accounting.  If true,
  *			  pc->mem_cgroup may be overwritten.
  *
  * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
  *			  under hierarchy of moving cgroups. This is for
  *			  waiting at high memory pressure caused by "move".
  */
 static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
 {
 	/* callers must already be inside an RCU read-side critical section */
 	VM_BUG_ON(!rcu_read_lock_held());
 	/* true while at least one mem_cgroup_start_move() is outstanding */
 	return atomic_read(&memcg->moving_account) > 0;
 }
 /*
  * Returns true when a move is recorded in "mc" (mc.from is set) and
  * either mc.from or mc.to is @memcg itself or in @memcg's subtree, as
  * decided by mem_cgroup_same_or_subtree().
  */
 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *src;
 	struct mem_cgroup *dst;
 	bool involved = false;

 	/*
 	 * Unlike task_move routines, we access mc.to, mc.from not under
 	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
 	 */
 	spin_lock(&mc.lock);
 	src = mc.from;
 	dst = mc.to;
 	if (src)
 		involved = mem_cgroup_same_or_subtree(memcg, src) ||
 			   mem_cgroup_same_or_subtree(memcg, dst);
 	spin_unlock(&mc.lock);

 	return involved;
 }
 /*
  * If another task is moving charges and @memcg is involved in that move
  * (see mem_cgroup_under_move()), sleep on mc.waitq and return true.
  * Returns false without sleeping when no move is in flight, when the
  * mover is the current task, or when @memcg is not involved.
  */
 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
 {
 	DEFINE_WAIT(wait);

 	if (!mc.moving_task || current == mc.moving_task)
 		return false;
 	if (!mem_cgroup_under_move(memcg))
 		return false;

 	prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
 	/* moving charge context might have finished. */
 	if (mc.moving_task)
 		schedule();
 	finish_wait(&mc.waitq, &wait);

 	return true;
 }
 /*
  * Take this lock when
  * - a code tries to modify page's memcg while it's USED.
  * - a code tries to modify page state accounting in a memcg.
  * see mem_cgroup_stolen(), too.
  *
  * Acquires @memcg->move_lock with spin_lock_irqsave(); the saved
  * interrupt state is stored in *@flags and must be handed back to
  * move_unlock_mem_cgroup().
  */
 static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
 				  unsigned long *flags)
 {
 	spin_lock_irqsave(&memcg->move_lock, *flags);
 }
 /*
  * Release @memcg->move_lock and restore the interrupt state that
  * move_lock_mem_cgroup() saved in *@flags.
  */
 static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
 				unsigned long *flags)
 {
 	spin_unlock_irqrestore(&memcg->move_lock, *flags);
 }
 /**
  * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
  * @memcg: The memory cgroup that went over limit
  * @p: Task that is going to be killed
  *
  * Prints the cgroup path of @p and of @memcg when both can be resolved,
  * followed in every case by the usage, limit and failcnt of @memcg's
  * memory and memory+swap counters.  Returns silently if either argument
  * is NULL.
  *
  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
  * enabled
  */
 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
 	/*
 	 * Need a buffer in BSS, can't rely on allocations. The code relies
 	 * on the assumption that OOM is serialized for memory controller.
 	 * If this assumption is broken, revisit this code.
 	 */
 	static char memcg_name[PATH_MAX];
 	struct cgroup *limit_cgrp;
 	struct cgroup *victim_cgrp;
 	int err;

 	if (!memcg || !p)
 		return;

 	rcu_read_lock();
 	limit_cgrp = memcg->css.cgroup;
 	victim_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
 	err = cgroup_path(victim_cgrp, memcg_name, PATH_MAX);
 	rcu_read_unlock();

 	/*
 	 * Unfortunately, we may be unable to convert to a useful name.
 	 * But we'll still print out the usage information.
 	 */
 	if (err < 0)
 		goto done;

 	printk(KERN_INFO "Task in %s killed", memcg_name);

 	rcu_read_lock();
 	err = cgroup_path(limit_cgrp, memcg_name, PATH_MAX);
 	rcu_read_unlock();
 	if (err < 0)
 		goto done;

 	/*
 	 * Continues from above, so we don't need an KERN_ level
 	 */
 	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);

 done:
 	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
 		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
 		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
 		res_counter_read_u64(&memcg->res, RES_FAILCNT));
 	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
 		"failcnt %llu\n",
 		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
 		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
 		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
 }
 /*
  * This function returns the number of memcg under hierarchy tree. Returns
  * 1(self count) if no children.
  */
 static int mem_cgroup_count_children(struct mem_cgroup *memcg)
 {
 	int num = 0;
 	struct mem_cgroup *iter;
 	for_each_mem_cgroup_tree(iter, memcg)
 		num++;
 	return num;
 }
 /*
  * Return the memory (and swap, if configured) limit for a memcg.
  */
 static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
 {
 	u64 limit;
 	u64 memsw;
 	limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
 	limit += total_swap_pages << PAGE_SHIFT;
 	memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
 	/*
 	 * If memsw is finite and limits the amount of swap space available
 	 * to this memcg, return that limit.
 	 */
 	return min(limit, memsw);
 }
 /*
  * mem_cgroup_out_of_memory - select and kill a task because @memcg is
  * out of memory
  * @memcg: memcg whose hierarchy is scanned for a victim
  * @gfp_mask: passed through to check_panic_on_oom() and oom_kill_process()
  * @order: passed through to check_panic_on_oom() and oom_kill_process()
  *
  * Walks every memcg under @memcg and every task of each one, asks
  * oom_scan_process_thread() how to treat the task and oom_badness() for
  * its score, and hands the highest scoring task to oom_kill_process().
  * Returns without killing anything if current has a fatal signal
  * pending, if the scan asks to abort, or if no task was chosen.
  */
 void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 			      int order)
 {
 	struct mem_cgroup *iter;
 	unsigned long chosen_points = 0;
 	unsigned long totalpages;
 	unsigned int points = 0;
 	struct task_struct *chosen = NULL;
 	/*
 	 * If current has a pending SIGKILL, then automatically select it.  The
 	 * goal is to allow it to allocate so that it may quickly exit and free
 	 * its memory.
 	 */
 	if (fatal_signal_pending(current)) {
 		set_thread_flag(TIF_MEMDIE);
 		return;
 	}
 	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
 	/* limit in pages; "?: 1" keeps the divisor used below non-zero */
 	totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
 	for_each_mem_cgroup_tree(iter, memcg) {
 		struct cgroup *cgroup = iter->css.cgroup;
 		struct cgroup_iter it;
 		struct task_struct *task;
 		cgroup_iter_start(cgroup, &it);
 		while ((task = cgroup_iter_next(cgroup, &it))) {
 			switch (oom_scan_process_thread(task, totalpages, NULL,
 							false)) {
 			case OOM_SCAN_SELECT:
 				/* forced choice: swap the held reference over to it */
 				if (chosen)
 					put_task_struct(chosen);
 				chosen = task;
 				chosen_points = ULONG_MAX;
 				get_task_struct(chosen);
 				/* fall through */
 			case OOM_SCAN_CONTINUE:
 				/* "continue" applies to the enclosing while loop */
 				continue;
 			case OOM_SCAN_ABORT:
 				/* end both iterations and drop our reference */
 				cgroup_iter_end(cgroup, &it);
 				mem_cgroup_iter_break(memcg, iter);
 				if (chosen)
 					put_task_struct(chosen);
 				return;
 			case OOM_SCAN_OK:
 				break;
 			};
 			points = oom_badness(task, memcg, NULL, totalpages);
 			if (points > chosen_points) {
 				/* new best candidate: keep exactly one reference */
 				if (chosen)
 					put_task_struct(chosen);
 				chosen = task;
 				chosen_points = points;
 				get_task_struct(chosen);
 			}
 		}
 		cgroup_iter_end(cgroup, &it);
 	}
 	if (!chosen)
 		return;
 	/*
 	 * NOTE(review): when chosen_points is ULONG_MAX (OOM_SCAN_SELECT)
 	 * the multiplication below wraps, so the score handed on is not
 	 * meaningful in that case -- confirm how oom_kill_process() uses it.
 	 */
 	points = chosen_points * 1000 / totalpages;
 	/* presumably oom_kill_process() consumes the reference on chosen -- TODO confirm */
 	oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
 			 NULL, "Memory cgroup out of memory");
 }
 /*
  * mem_cgroup_reclaim - try to free pages charged to @memcg
  * @memcg: target memcg
  * @gfp_mask: passed to try_to_free_mem_cgroup_pages()
  * @flags: MEM_CGROUP_RECLAIM_NOSWAP and/or MEM_CGROUP_RECLAIM_SHRINK
  *
  * Runs up to MEM_CGROUP_MAX_RECLAIM_LOOPS reclaim passes and returns the
  * sum of what try_to_free_mem_cgroup_pages() reported.
  */
 static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 					gfp_t gfp_mask,
 					unsigned long flags)
 {
 	const bool shrinking = (flags & MEM_CGROUP_RECLAIM_SHRINK) != 0;
 	bool noswap = (flags & MEM_CGROUP_RECLAIM_NOSWAP) ||
 		      (!shrinking && memcg->memsw_is_minimum);
 	unsigned long reclaimed = 0;
 	int pass;

 	for (pass = 0; pass < MEM_CGROUP_MAX_RECLAIM_LOOPS; pass++) {
 		/* from the second pass on, also kick the per-cpu stocks */
 		if (pass > 0)
 			drain_all_stock_async(memcg);

 		reclaimed += try_to_free_mem_cgroup_pages(memcg, gfp_mask,
 							  noswap);
 		/*
 		 * Allow limit shrinkers, which are triggered directly
 		 * by userspace, to catch signals and stop reclaim
 		 * after minimal progress, regardless of the margin.
 		 */
 		if (reclaimed && shrinking)
 			break;
 		if (mem_cgroup_margin(memcg))
 			break;
 		/*
 		 * If nothing was reclaimed after two attempts, there
 		 * may be no reclaimable pages in this hierarchy.
 		 */
 		if (pass > 0 && !reclaimed)
 			break;
 	}

 	return reclaimed;
 }
 /**
  * test_mem_cgroup_node_reclaimable
  * @memcg: the target memcg
  * @nid: the node ID to be checked.
  * @noswap : specify true here if the user wants file only information.
  *
  * Returns true if @memcg has file LRU pages on node @nid, or if anon
  * pages may be counted (@noswap is false and total_swap_pages is
  * non-zero) and it has anon LRU pages there.  Returns false otherwise.
  */
 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
 		int nid, bool noswap)
 {
 	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
 		return true;

 	/* anon pages only count when swapping them out is an option */
 	if (noswap || !total_swap_pages)
 		return false;

 	return mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON) != 0;
 }
 #if MAX_NUMNODES > 1
 /*
  * Always updating the nodemask is not very good - even if we have an empty
  * list or the wrong list here, we can start from some node and traverse all
  * nodes based on the zonelist. So update the list loosely once per 10 secs.
  *
  * Rebuilds @memcg->scan_nodes as the set of N_HIGH_MEMORY nodes on which
  * test_mem_cgroup_node_reclaimable() finds reclaimable pages.
  */
 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
 {
 	int nid;
 	/*
 	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
 	 * pagein/pageout changes since the last update.
 	 */
 	if (!atomic_read(&memcg->numainfo_events))
 		return;
 	/*
 	 * Only the caller that raises numainfo_updating from 0 to 1 goes on;
 	 * anyone else backs off and the counter is reset to 0 below.
 	 */
 	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
 		return;
 	/* make a nodemask where this memcg uses memory from */
 	memcg->scan_nodes = node_states[N_HIGH_MEMORY];
 	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
 		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
 			node_clear(nid, memcg->scan_nodes);
 	}
 	/* events are cleared first, then the update marker is released */
 	atomic_set(&memcg->numainfo_events, 0);
 	atomic_set(&memcg->numainfo_updating, 0);
 }
 /*
  * Selecting a node where we start reclaim from. Because what we need is just
  * reducing usage counter, starting from anywhere is OK. Considering
  * memory reclaim from current node, there are pros and cons.
  *
  * Freeing memory from current node means freeing memory from a node which
  * we'll use or we've used. So, it may make LRU bad. And if several threads
  * hit limits, it will see a contention on a node. But freeing from remote
  * node means more costs for memory reclaim because of memory latency.
  *
  * Now, we use round-robin. Better algorithm is welcomed.
  *
  * Returns the chosen node id and records it in @memcg->last_scanned_node.
  */
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
 	int nid;

 	mem_cgroup_may_update_nodemask(memcg);

 	/* advance past the previous pick, wrapping to the first set node */
 	nid = next_node(memcg->last_scanned_node, memcg->scan_nodes);
 	if (nid == MAX_NUMNODES)
 		nid = first_node(memcg->scan_nodes);

 	/*
 	 * We call this when we hit limit, not when pages are added to LRU.
 	 * No LRU may hold pages because all pages are UNEVICTABLE or
 	 * memcg is too small and all pages are not on LRU. In that case,
 	 * we use the current node.
 	 */
 	if (unlikely(nid == MAX_NUMNODES))
 		nid = numa_node_id();

 	memcg->last_scanned_node = nid;
 	return nid;
 }
 /*
  * Check all nodes whether it contains reclaimable pages or not.
  * For quick scan, we make use of scan_nodes. This will allow us to skip
  * unused nodes. But scan_nodes is lazily updated and may not contain
  * enough new information. We need to do double check.
  *
  * Returns true as soon as test_mem_cgroup_node_reclaimable() succeeds for
  * any node, false when it fails for all of them.
  */
 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 {
 	int nid;

 	/*
 	 * Quick check first: only the nodes recorded in scan_nodes.
 	 */
 	if (!nodes_empty(memcg->scan_nodes)) {
 		for_each_node_mask(nid, memcg->scan_nodes) {
 			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
 				return true;
 		}
 	}

 	/*
 	 * Then the rest of the nodes, skipping those already tested above.
 	 */
 	for_each_node_state(nid, N_HIGH_MEMORY) {
 		if (!node_isset(nid, memcg->scan_nodes) &&
 		    test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
 			return true;
 	}

 	return false;
 }
 #else
 /* Variant for builds where MAX_NUMNODES is not > 1: always node 0. */
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 {
 	return 0;
 }
 /*
  * Variant for builds where MAX_NUMNODES is not > 1: only node 0 is
  * tested for reclaimable pages.
  */
 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
 {
 	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
 }
 #endif
 /*
  * mem_cgroup_soft_reclaim - reclaim from the hierarchy under @root_memcg
  * @root_memcg: top of the hierarchy to walk
  * @zone: zone handed to the iterator cookie and to the shrinker
  * @gfp_mask: passed to mem_cgroup_shrink_node_zone()
  * @total_scanned: incremented by the scan count of every shrink call
  *
  * Walks victims with mem_cgroup_iter() and shrinks each reclaimable one
  * until @root_memcg no longer exceeds its soft limit or the round limits
  * below are hit.  Returns the sum of mem_cgroup_shrink_node_zone()
  * results.
  */
 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
 				   struct zone *zone,
 				   gfp_t gfp_mask,
 				   unsigned long *total_scanned)
 {
 	struct mem_cgroup *victim = NULL;
 	int total = 0;
 	int loop = 0;
 	unsigned long excess;
 	unsigned long nr_scanned;
 	struct mem_cgroup_reclaim_cookie reclaim = {
 		.zone = zone,
 		.priority = 0,
 	};
 	/* soft limit excess of the root, converted from bytes to pages */
 	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
 	while (1) {
 		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
 		if (!victim) {
 			/* a NULL victim is counted as the end of one round */
 			loop++;
 			if (loop >= 2) {
 				/*
 				 * If we have not been able to reclaim
 				 * anything, it might because there are
 				 * no reclaimable pages under this hierarchy
 				 */
 				if (!total)
 					break;
 				/*
 				 * We want to do more targeted reclaim.
 				 * excess >> 2 is not to excessive so as to
 				 * reclaim too much, nor too less that we keep
 				 * coming back to reclaim from this cgroup
 				 */
 				if (total >= (excess >> 2) ||
 					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
 					break;
 			}
 			continue;
 		}
 		if (!mem_cgroup_reclaimable(victim, false))
 			continue;
 		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
 						     zone, &nr_scanned);
 		*total_scanned += nr_scanned;
 		/* stop as soon as the root is back under its soft limit */
 		if (!res_counter_soft_limit_excess(&root_memcg->res))
 			break;
 	}
 	/* victim may be NULL here when the loop ended on a finished round */
 	mem_cgroup_iter_break(root_memcg, victim);
 	return total;
 }
 /*
  * Check OOM-Killer is already running under our hierarchy.
  * If someone is running, return false.
  * Has to be called with memcg_oom_lock
  *
  * On success every memcg in the hierarchy under @memcg has oom_lock set
  * and true is returned.  If a memcg that is already locked is met, the
  * oom_lock flags set by this call are cleared again before returning
  * false.
  */
 static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter, *failed = NULL;
 	for_each_mem_cgroup_tree(iter, memcg) {
 		if (iter->oom_lock) {
 			/*
 			 * this subtree of our hierarchy is already locked
 			 * so we cannot give a lock.
 			 */
 			failed = iter;
 			mem_cgroup_iter_break(memcg, iter);
 			break;
 		} else
 			iter->oom_lock = true;
 	}
 	if (!failed)
 		return true;
 	/*
 	 * OK, we failed to lock the whole subtree so we have to clean up
 	 * what we set up to the failing subtree
 	 */
 	for_each_mem_cgroup_tree(iter, memcg) {
 		/* "failed" was locked by someone else: stop before touching it */
 		if (iter == failed) {
 			mem_cgroup_iter_break(memcg, iter);
 			break;
 		}
 		iter->oom_lock = false;
 	}
 	return false;
 }
 /*
  * Has to be called with memcg_oom_lock
  */
 static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;
 	for_each_mem_cgroup_tree(iter, memcg)
 		iter->oom_lock = false;
 	return 0;
 }
 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;
 	for_each_mem_cgroup_tree(iter, memcg)
 		atomic_inc(&iter->under_oom);
 }
 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;
 	/*
 	 * When a new child is created while the hierarchy is under oom,
 	 * mem_cgroup_oom_lock() may not be called. We have to use
 	 * atomic_add_unless() here.
 	 */
 	for_each_mem_cgroup_tree(iter, memcg)
 		atomic_add_unless(&iter->under_oom, -1, 0);
 }
 /* Held around mem_cgroup_oom_lock()/mem_cgroup_oom_unlock() calls. */
 static DEFINE_SPINLOCK(memcg_oom_lock);
 /* Wait queue that tasks blocked in mem_cgroup_handle_oom() sleep on. */
 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
 /*
  * Wait queue entry used on memcg_oom_waitq, tagged with the memcg the
  * sleeper waits for so that memcg_oom_wake_function() can filter wakeups.
  */
 struct oom_wait_info {
 	struct mem_cgroup *memcg;	/* memcg the waiter is blocked on */
 	wait_queue_t	wait;		/* entry queued on memcg_oom_waitq */
 };
 /*
  * Wake callback for entries on memcg_oom_waitq.  @arg is the memcg passed
  * to __wake_up() by memcg_wakeup_oom().  The waiter is woken only if its
  * memcg and the waking memcg are the same or one lies in the other's
  * subtree; otherwise 0 is returned and the waiter keeps sleeping.
  */
 static int memcg_oom_wake_function(wait_queue_t *wait,
 	unsigned mode, int sync, void *arg)
 {
 	struct oom_wait_info *info =
 		container_of(wait, struct oom_wait_info, wait);
 	struct mem_cgroup *waker = (struct mem_cgroup *)arg;
 	struct mem_cgroup *waiter = info->memcg;

 	/*
 	 * Both of oom_wait_info->memcg and wake_memcg are stable under us,
 	 * so the ancestry test needs no extra care from this function.
 	 */
 	if (mem_cgroup_same_or_subtree(waiter, waker) ||
 	    mem_cgroup_same_or_subtree(waker, waiter))
 		return autoremove_wake_function(wait, mode, sync, arg);

 	return 0;
 }
 /*
  * Wake waiters on memcg_oom_waitq; @memcg is handed to each entry's wake
  * function (memcg_oom_wake_function()) which decides who actually wakes.
  */
 static void memcg_wakeup_oom(struct mem_cgroup *memcg)
 {
 	/* for filtering, pass "memcg" as argument. */
 	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
 }
 /*
  * Wake OOM waiters of @memcg, but only if @memcg is non-NULL and its
  * under_oom counter is non-zero.
  */
 static void memcg_oom_recover(struct mem_cgroup *memcg)
 {
 	if (!memcg)
 		return;

 	if (atomic_read(&memcg->under_oom))
 		memcg_wakeup_oom(memcg);
 }
 /*
  * try to call OOM killer. returns false if we should exit memory-reclaim loop.
  *
  * @memcg: memcg that is over its limit
  * @mask, @order: passed on to mem_cgroup_out_of_memory()
  *
  * Either runs the OOM killer for @memcg (when the hierarchy lock was
  * obtained and oom_kill_disable is not set) or sleeps on memcg_oom_waitq
  * until woken.  Returns false if the current task has TIF_MEMDIE set or a
  * fatal signal pending afterwards, true otherwise.
  */
 static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
 				  int order)
 {
 	struct oom_wait_info owait;
 	bool locked, need_to_kill;
 	/* hand-built wait entry so the wake function can filter by memcg */
 	owait.memcg = memcg;
 	owait.wait.flags = 0;
 	owait.wait.func = memcg_oom_wake_function;
 	owait.wait.private = current;
 	INIT_LIST_HEAD(&owait.wait.task_list);
 	need_to_kill = true;
 	mem_cgroup_mark_under_oom(memcg);
 	/* At first, try to OOM lock hierarchy under memcg.*/
 	spin_lock(&memcg_oom_lock);
 	locked = mem_cgroup_oom_lock(memcg);
 	/*
 	 * Even if signal_pending(), we can't quit charge() loop without
 	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
 	 * under OOM is always welcomed, use TASK_KILLABLE here.
 	 */
 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
 	if (!locked || memcg->oom_kill_disable)
 		need_to_kill = false;
 	if (locked)
 		mem_cgroup_oom_notify(memcg);
 	spin_unlock(&memcg_oom_lock);
 	if (need_to_kill) {
 		/* we are the killer: leave the wait queue before killing */
 		finish_wait(&memcg_oom_waitq, &owait.wait);
 		mem_cgroup_out_of_memory(memcg, mask, order);
 	} else {
 		/* someone else holds the lock or killing is disabled: sleep */
 		schedule();
 		finish_wait(&memcg_oom_waitq, &owait.wait);
 	}
 	spin_lock(&memcg_oom_lock);
 	if (locked)
 		mem_cgroup_oom_unlock(memcg);
 	memcg_wakeup_oom(memcg);
 	spin_unlock(&memcg_oom_lock);
 	mem_cgroup_unmark_under_oom(memcg);
 	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
 		return false;
 	/* Give chance to dying process */
 	schedule_timeout_uninterruptible(1);
 	return true;
 }
 /*
  * Currently used to update mapped file statistics, but the routine can be
  * generalized to update other statistics as well.
  *
  * Notes: Race condition
  *
  * We usually use page_cgroup_lock() for accessing page_cgroup member but
  * it tends to be costly. But considering some conditions, we don't need
  * to do so _always_.
  *
  * Considering "charge", lock_page_cgroup() is not required because all
  * file-stat operations happen after a page is attached to radix-tree. There
  * are no race with "charge".
  *
  * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
  * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
  * if there are race with "uncharge". Statistics itself is properly handled
  * by flags.
  *
  * Considering "move", this is the only case where we see a race. To make
  * the race small, we check memcg->moving_account (via mem_cgroup_stolen())
  * and detect whether there is a possibility of a race. If there is, we
  * take a lock.
  *
  * On return *@locked is set to true only if the move lock of the page's
  * memcg was taken, in which case *@flags holds the saved irq state; it
  * is left untouched otherwise.
  */
 void __mem_cgroup_begin_update_page_stat(struct page *page,
 				bool *locked, unsigned long *flags)
 {
 	struct mem_cgroup *memcg;
 	struct page_cgroup *pc;
 	pc = lookup_page_cgroup(page);
 again:
 	memcg = pc->mem_cgroup;
 	if (unlikely(!memcg || !PageCgroupUsed(pc)))
 		return;
 	/*
 	 * If this memory cgroup is not under account moving, we don't
 	 * need to take move_lock_mem_cgroup(). Because we already hold
 	 * rcu_read_lock(), any calls to move_account will be delayed until
 	 * rcu_read_unlock() if mem_cgroup_stolen() == true.
 	 */
 	if (!mem_cgroup_stolen(memcg))
 		return;
 	move_lock_mem_cgroup(memcg, flags);
 	/* the page may have changed owner before we got the lock: retry */
 	if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
 		move_unlock_mem_cgroup(memcg, flags);
 		goto again;
 	}
 	*locked = true;
 }
 /*
  * Release the move lock of the memcg that @page's page_cgroup points to,
  * restoring the irq state from *@flags.  Counterpart of the lock taken in
  * __mem_cgroup_begin_update_page_stat().
  */
 void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
 {
 	struct page_cgroup *pc = lookup_page_cgroup(page);
 	/*
 	 * It's guaranteed that pc->mem_cgroup never changes while
 	 * lock is held because a routine modifies pc->mem_cgroup
 	 * should take move_lock_mem_cgroup().
 	 */
 	move_unlock_mem_cgroup(pc->mem_cgroup, flags);
 }
 /*
  * mem_cgroup_update_page_stat - adjust a per-memcg page statistic
  * @page: page whose owning memcg is updated
  * @idx: statistic to update; only MEMCG_NR_FILE_MAPPED is handled,
  *       any other value hits BUG()
  * @val: signed amount added to the per-cpu counter
  *
  * Does nothing when the memory controller is disabled or when the page
  * has no memcg recorded or is not marked as used.
  */
 void mem_cgroup_update_page_stat(struct page *page,
 				 enum mem_cgroup_page_stat_item idx, int val)
 {
 	struct mem_cgroup *memcg;
 	struct page_cgroup *pc;

 	if (mem_cgroup_disabled())
 		return;

 	/*
 	 * Look the page_cgroup up only once the controller is known to be
 	 * enabled, as mem_cgroup_page_lruvec() does.
 	 */
 	pc = lookup_page_cgroup(page);
 	memcg = pc->mem_cgroup;
 	if (unlikely(!memcg || !PageCgroupUsed(pc)))
 		return;

 	/* translate the public item into the internal stat index */
 	switch (idx) {
 	case MEMCG_NR_FILE_MAPPED:
 		idx = MEM_CGROUP_STAT_FILE_MAPPED;
 		break;
 	default:
 		BUG();
 	}

 	this_cpu_add(memcg->stat->count[idx], val);
 }
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
  * TODO: maybe necessary to use big numbers in big irons.
  */
 #define CHARGE_BATCH	32U
 /*
  * Per-cpu cache of pages that are already charged to a memcg's
  * res_counter(s) but not yet consumed; see consume_stock(),
  * refill_stock() and drain_stock().
  */
 struct memcg_stock_pcp {
 	struct mem_cgroup *cached; /* this never be root cgroup */
 	unsigned int nr_pages;	/* number of pre-charged pages held */
 	struct work_struct work;	/* queued by drain_all_stock() on remote cpus */
 	unsigned long flags;	/* bit field, see FLUSHING_CACHED_CHARGE */
 /* bit number in ->flags: set while a drain of this stock is pending */
 #define FLUSHING_CACHED_CHARGE	0
 };
 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
 /* taken by drain_all_stock_async() (trylock) and drain_all_stock_sync() */
 static DEFINE_MUTEX(percpu_charge_mutex);
 /*
  * Try to take one page out of this cpu's stock.  Succeeds, and returns
  * true, only if the stock is cached for @memcg and is not empty.  On
  * false the caller has to charge the res_counter itself; the stock gets
  * refilled later.
  */
 static bool consume_stock(struct mem_cgroup *memcg)
 {
 	struct memcg_stock_pcp *pcp = &get_cpu_var(memcg_stock);
 	bool hit = (pcp->cached == memcg) && pcp->nr_pages;

 	if (hit)
 		pcp->nr_pages--;
 	/* on a miss the caller needs to call res_counter_charge */
 	put_cpu_var(memcg_stock);

 	return hit;
 }
 /*
  * Give the pages held in @stock back to the res_counter(s) of the memcg
  * they were charged to, then forget which memcg the stock was caching.
  */
 static void drain_stock(struct memcg_stock_pcp *stock)
 {
 	struct mem_cgroup *owner = stock->cached;

 	if (stock->nr_pages) {
 		unsigned long nr_bytes = stock->nr_pages * PAGE_SIZE;

 		res_counter_uncharge(&owner->res, nr_bytes);
 		/* the memsw counter holds the same charge when swap accounting is on */
 		if (do_swap_account)
 			res_counter_uncharge(&owner->memsw, nr_bytes);
 		stock->nr_pages = 0;
 	}

 	stock->cached = NULL;
 }
 /*
  * This must be called under preempt disabled or must be called by
  * a thread which is pinned to local cpu.
  *
  * Drains the current cpu's stock and then clears its
  * FLUSHING_CACHED_CHARGE bit.  @dummy is not used; drain_all_stock()
  * passes the stock's work item.
  */
 static void drain_local_stock(struct work_struct *dummy)
 {
 	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
 	drain_stock(stock);
 	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 }
 /*
  * Park @nr_pages pages, already charged to @memcg's res_counter, in this
  * cpu's stock so that consume_stock() can hand them out later.  A stock
  * that currently caches another memcg is drained first.
  */
 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
 	struct memcg_stock_pcp *pcp;

 	pcp = &get_cpu_var(memcg_stock);
 	if (pcp->cached != memcg) {
 		/* switch owner: return the old owner's pages first */
 		drain_stock(pcp);
 		pcp->cached = memcg;
 	}
 	pcp->nr_pages += nr_pages;
 	put_cpu_var(memcg_stock);
 }
 /*
  * Drain the per-cpu charge caches that belong to @root_memcg or to a
  * memcg in the hierarchy under it.  The local cpu is drained directly,
  * other cpus get the stock's work item scheduled.  With @sync set, the
  * function also waits for every stock that is still marked as flushing.
  */
 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
 {
 	int cpu, local_cpu;

 	/* Notify other cpus that system-wide "drain" is running */
 	get_online_cpus();
 	local_cpu = get_cpu();
 	for_each_online_cpu(cpu) {
 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
 		struct mem_cgroup *owner = stock->cached;

 		if (!owner || !stock->nr_pages)
 			continue;
 		if (!mem_cgroup_same_or_subtree(root_memcg, owner))
 			continue;
 		/* a drain is already pending for this stock */
 		if (test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
 			continue;

 		if (cpu == local_cpu)
 			drain_local_stock(&stock->work);
 		else
 			schedule_work_on(cpu, &stock->work);
 	}
 	put_cpu();

 	if (sync) {
 		for_each_online_cpu(cpu) {
 			struct memcg_stock_pcp *stock =
 				&per_cpu(memcg_stock, cpu);

 			if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
 				flush_work(&stock->work);
 		}
 	}

 	put_online_cpus();
 }
 /*
  * Tries to drain stocked charges in other cpus. This function is
  * asynchronous and just puts a work per cpu for draining locally on each
  * cpu. The caller can expect some charges to come back to the res_counter
  * later but cannot wait for it.
  */
 static void drain_all_stock_async(struct mem_cgroup *root_memcg)
 {
 	/*
 	 * If someone is already draining, avoid adding more kworker runs:
 	 * only go ahead when the mutex can be taken without waiting.
 	 */
 	if (mutex_trylock(&percpu_charge_mutex)) {
 		drain_all_stock(root_memcg, false);
 		mutex_unlock(&percpu_charge_mutex);
 	}
 }
 /*
  * This is a synchronous drain interface: it waits for percpu_charge_mutex
  * and calls drain_all_stock() with sync set, so pending per-cpu drain
  * works are flushed before it returns.
  */
 static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
 {
 	/* called when force_empty is called */
 	mutex_lock(&percpu_charge_mutex);
 	drain_all_stock(root_memcg, true);
 	mutex_unlock(&percpu_charge_mutex);
 }
 /*
  * This function drains percpu counter value from DEAD cpu and
  * moves it into @memcg->nocpu_base. Note that this function can be
  * preempted.
  *
  * Both the count[] and events[] arrays of @cpu are folded in and zeroed
  * while @memcg->pcp_counter_lock is held.
  */
 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
 {
 	int idx;

 	spin_lock(&memcg->pcp_counter_lock);

 	for (idx = 0; idx < MEM_CGROUP_STAT_NSTATS; idx++) {
 		long leftover = per_cpu(memcg->stat->count[idx], cpu);

 		per_cpu(memcg->stat->count[idx], cpu) = 0;
 		memcg->nocpu_base.count[idx] += leftover;
 	}

 	for (idx = 0; idx < MEM_CGROUP_EVENTS_NSTATS; idx++) {
 		unsigned long leftover = per_cpu(memcg->stat->events[idx], cpu);

 		per_cpu(memcg->stat->events[idx], cpu) = 0;
 		memcg->nocpu_base.events[idx] += leftover;
 	}

 	spin_unlock(&memcg->pcp_counter_lock);
 }
 /*
  * CPU hotplug notifier.  For CPU_DEAD and CPU_DEAD_FROZEN the per-cpu
  * statistics of every memcg and the charge stock of the cpu given in
  * @hcpu are drained; every other action is ignored.  Always returns
  * NOTIFY_OK.
  */
 static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
 					unsigned long action,
 					void *hcpu)
 {
 	int cpu = (unsigned long)hcpu;
 	struct mem_cgroup *iter;
 	bool cpu_died = (action == CPU_DEAD || action == CPU_DEAD_FROZEN);

 	if (action == CPU_ONLINE || !cpu_died)
 		return NOTIFY_OK;

 	for_each_mem_cgroup(iter)
 		mem_cgroup_drain_pcp_counter(iter, cpu);

 	drain_stock(&per_cpu(memcg_stock, cpu));

 	return NOTIFY_OK;
 }
 /*
  * Result codes of mem_cgroup_do_charge().
  * See __mem_cgroup_try_charge() for details
  */
 enum {
 	CHARGE_OK,		/* success */
 	CHARGE_RETRY,		/* need to retry but retry is not bad */
 	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
 	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
 	CHARGE_OOM_DIE,		/* the current is killed because of OOM */
 };
 /*
  * mem_cgroup_do_charge - one attempt at charging @nr_pages to @memcg
  * @memcg: memcg to charge
  * @gfp_mask: allocation context; without __GFP_WAIT no reclaim is tried
  * @nr_pages: number of pages to charge
  * @oom_check: whether the OOM handler may be invoked on failure
  *
  * Charges the memory counter and, with swap accounting, the memory+swap
  * counter.  If either is over limit, reclaim is run against the memcg
  * owning the failing counter.  Returns one of the CHARGE_* codes.
  */
 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				unsigned int nr_pages, bool oom_check)
 {
 	unsigned long csize = nr_pages * PAGE_SIZE;
 	struct mem_cgroup *mem_over_limit;
 	struct res_counter *fail_res;
 	unsigned long flags = 0;
 	int ret;
 	ret = res_counter_charge(&memcg->res, csize, &fail_res);
 	if (likely(!ret)) {
 		if (!do_swap_account)
 			return CHARGE_OK;
 		ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
 		if (likely(!ret))
 			return CHARGE_OK;
 		/* memsw charge failed: roll back the memory charge made above */
 		res_counter_uncharge(&memcg->res, csize);
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
 		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 	} else
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
 	/*
 	 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
 	 * of regular pages (CHARGE_BATCH), or a single regular page (1).
 	 *
 	 * Never reclaim on behalf of optional batching, retry with a
 	 * single page instead.
 	 */
 	if (nr_pages == CHARGE_BATCH)
 		return CHARGE_RETRY;
 	if (!(gfp_mask & __GFP_WAIT))
 		return CHARGE_WOULDBLOCK;
 	/* ret now holds the reclaim result of mem_cgroup_reclaim() */
 	ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
 	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
 		return CHARGE_RETRY;
 	/*
 	 * Even though the limit is exceeded at this point, reclaim
 	 * may have been able to free some pages.  Retry the charge
 	 * before killing the task.
 	 *
 	 * Only for regular pages, though: huge pages are rather
 	 * unlikely to succeed so close to the limit, and we fall back
 	 * to regular pages anyway in case of failure.
 	 */
 	if (nr_pages == 1 && ret)
 		return CHARGE_RETRY;
 	/*
 	 * At task move, charge accounts can be doubly counted. So, it's
 	 * better to wait until the end of task_move if something is going on.
 	 */
 	if (mem_cgroup_wait_acct_move(mem_over_limit))
 		return CHARGE_RETRY;
 	/* If we don't need to call oom-killer at all, return immediately */
 	if (!oom_check)
 		return CHARGE_NOMEM;
 	/* check OOM */
 	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
 		return CHARGE_OOM_DIE;
 	return CHARGE_RETRY;
 }
 /*
  * __mem_cgroup_try_charge() does
  * 1. detect memcg to be charged against from passed *mm and *ptr,
  * 2. update res_counter
  * 3. call memory reclaim if necessary.
  *
  * In some special cases, if the task is fatal (fatal_signal_pending() or
  * TIF_MEMDIE is set), this function returns -EINTR while writing
  * root_mem_cgroup to *ptr. There are two reasons for this. 1: fatal threads
  * should quit as soon as possible without any hazards. 2: all pages should
  * have a valid pc->mem_cgroup. If mm is NULL and the caller doesn't pass a
  * valid memcg pointer, that is treated as a charge to root_mem_cgroup.
  *
  * @mm:       mm whose owner task selects the memcg when *ptr is NULL;
  *            may be NULL.
  * @gfp_mask: allocation flags handed on to mem_cgroup_do_charge().
  * @nr_pages: number of pages the caller wants charged.
  * @ptr:      in: memcg to charge, or NULL to derive it from @mm;
  *            out: see the return values below.
  * @oom:      if true, the oom-killer can be invoked.
  *
  * So __mem_cgroup_try_charge() will return
  *  0       ...  on success, filling *ptr with a valid memcg pointer.
  *  -ENOMEM ...  charge failure because of resource limits. *ptr is NULL.
  *  -EINTR  ...  if thread is fatal. *ptr is filled with root_mem_cgroup.
  *
  * Unlike the exported interface, an "oom" parameter is added. If oom==true,
  * the oom-killer can be invoked.
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
 				   gfp_t gfp_mask,
 				   unsigned int nr_pages,
 				   struct mem_cgroup **ptr,
 				   bool oom)
 {
 	/*
 	 * First attempt charges at least CHARGE_BATCH pages; whatever exceeds
 	 * nr_pages is handed to refill_stock() once the charge succeeded.
 	 */
 	unsigned int batch = max(CHARGE_BATCH, nr_pages);
 	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup *memcg = NULL;
 	int ret;
 	/*
 	 * Unlike global-vm's OOM-kill, we're not in memory shortage
 	 * at system level. So, allow a dying process to go ahead in addition
 	 * to a MEMDIE process.
 	 */
 	if (unlikely(test_thread_flag(TIF_MEMDIE)
 		     || fatal_signal_pending(current)))
 		goto bypass;
 	/*
 	 * We always charge the cgroup the mm_struct belongs to.
 	 * The mm_struct's mem_cgroup changes on task migration if the
 	 * thread group leader migrates. It's possible that mm is not
 	 * set, if so charge the root memcg (happens for pagecache usage).
 	 */
 	if (!*ptr && !mm)
 		*ptr = root_mem_cgroup;
 again:
 	if (*ptr) { /* css should be a valid one */
 		memcg = *ptr;
 		VM_BUG_ON(css_is_removed(&memcg->css));
 		/* the root cgroup is never charged: report success right away */
 		if (mem_cgroup_is_root(memcg))
 			goto done;
 		/* a single page may be satisfied from the stock */
 		if (nr_pages == 1 && consume_stock(memcg))
 			goto done;
 		css_get(&memcg->css);
 	} else {
 		struct task_struct *p;
 		rcu_read_lock();
 		p = rcu_dereference(mm->owner);
 		/*
 		 * Because we don't have task_lock(), "p" can exit.
 		 * In that case, "memcg" can point to root or p can be NULL with
 		 * race with swapoff. Then, we have a small risk of
 		 * mis-accounting. But this kind of mis-accounting by race can
 		 * always happen because we don't hold cgroup_mutex(). Taking it
 		 * would be overkill, so we allow that small race here.
 		 * (*) swapoff et al. will charge against mm-struct not against
 		 * task-struct. So, mm->owner can be NULL.
 		 */
 		memcg = mem_cgroup_from_task(p);
 		if (!memcg)
 			memcg = root_mem_cgroup;
 		if (mem_cgroup_is_root(memcg)) {
 			rcu_read_unlock();
 			goto done;
 		}
 		if (nr_pages == 1 && consume_stock(memcg)) {
 			/*
 			 * It seems dangerous to access memcg without css_get().
 			 * But considering how consume_stock works, it's not
 			 * necessary. If consume_stock succeeds, some charges
 			 * from this memcg are cached on this cpu. So, we
 			 * don't need to call css_get()/css_tryget() before
 			 * calling consume_stock().
 			 */
 			rcu_read_unlock();
 			goto done;
 		}
 		/* after here, we may be blocked. we need to get refcnt */
 		if (!css_tryget(&memcg->css)) {
 			rcu_read_unlock();
 			goto again;
 		}
 		rcu_read_unlock();
 	}
 	/* From here on a css reference on memcg is held; every exit drops it. */
 	do {
 		bool oom_check;
 		/* If killed, bypass charge */
 		if (fatal_signal_pending(current)) {
 			css_put(&memcg->css);
 			goto bypass;
 		}
 		/*
 		 * Only ask for the OOM check once the retry budget is used up,
 		 * then start a fresh budget.
 		 */
 		oom_check = false;
 		if (oom && !nr_oom_retries) {
 			oom_check = true;
 			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
 		}
 		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
 		switch (ret) {
 		case CHARGE_OK:
 			break;
 		case CHARGE_RETRY: /* not in OOM situation but retry */
 			/* give up batching and redo the memcg lookup from scratch */
 			batch = nr_pages;
 			css_put(&memcg->css);
 			memcg = NULL;
 			goto again;
 		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
 			css_put(&memcg->css);
 			goto nomem;
 		case CHARGE_NOMEM: /* OOM routine works */
 			if (!oom) {
 				css_put(&memcg->css);
 				goto nomem;
 			}
 			/* If oom, we never return -ENOMEM */
 			nr_oom_retries--;
 			break;
 		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
 			css_put(&memcg->css);
 			goto bypass;
 		}
 	} while (ret != CHARGE_OK);
 	/* keep the pages charged beyond the request in the stock */
 	if (batch > nr_pages)
 		refill_stock(memcg, batch - nr_pages);
 	css_put(&memcg->css);
 done:
 	*ptr = memcg;
 	return 0;
 nomem:
 	*ptr = NULL;
 	return -ENOMEM;
 bypass:
 	*ptr = root_mem_cgroup;
 	return -EINTR;
 }
 /*
  * Sometimes a charge obtained by try_charge() has to be undone.
  * Give nr_pages worth of bytes back to the "res" counter and, when swap
  * accounting is enabled, to the "memsw" counter too. Nothing is done for
  * the root cgroup.
  */
 static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
 				       unsigned int nr_pages)
 {
 	unsigned long size;
 	if (mem_cgroup_is_root(memcg))
 		return;
 	size = nr_pages * PAGE_SIZE;
 	res_counter_uncharge(&memcg->res, size);
 	if (!do_swap_account)
 		return;
 	res_counter_uncharge(&memcg->memsw, size);
 }
 /*
  * Cancel charges in this cgroup only; the uncharge is not propagated to
  * the parent cgroup. This is useful when moving usage to the parent cgroup.
  * Nothing is done for the root cgroup.
  */
 static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
 					unsigned int nr_pages)
 {
 	unsigned long size;
 	if (!mem_cgroup_is_root(memcg)) {
 		size = nr_pages * PAGE_SIZE;
 		res_counter_uncharge_until(&memcg->res,
 					   memcg->res.parent, size);
 		if (do_swap_account)
 			res_counter_uncharge_until(&memcg->memsw,
 						   memcg->memsw.parent, size);
 	}
 }
 /*
  * Translate a css ID into its mem_cgroup. Must be called under
  * rcu_read_lock(). The caller has to check css_is_removed() or similar if
  * that matters to it (dropping a refcnt from swap can be called against a
  * removed memcg).
  * Returns NULL for ID 0 or when no css is registered under the ID.
  */
 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
 {
 	struct cgroup_subsys_state *css = NULL;
 	/* ID 0 is an unused ID, so only look up non-zero ones */
 	if (id)
 		css = css_lookup(&mem_cgroup_subsys, id);
 	return css ? container_of(css, struct mem_cgroup, css) : NULL;
 }
 /*
  * Find the memcg a locked page is accounted to and take a css reference
  * on it.
  *
  * If the page_cgroup is marked Used, pc->mem_cgroup is taken. Otherwise,
  * for a swap cache page, the memcg is looked up through the ID recorded in
  * swap_cgroup for the page's swap entry.
  *
  * Returns the memcg with a reference obtained by css_tryget(), or NULL if
  * no memcg was found or the reference could not be taken.
  */
 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 {
 	struct mem_cgroup *memcg = NULL;
 	struct page_cgroup *pc;
 	unsigned short id;
 	swp_entry_t ent;
 	VM_BUG_ON(!PageLocked(page));
 	pc = lookup_page_cgroup(page);
 	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc)) {
 		memcg = pc->mem_cgroup;
 		if (memcg && !css_tryget(&memcg->css))
 			memcg = NULL;
 	} else if (PageSwapCache(page)) {
 		/* page_private() of a swap cache page holds its swap entry */
 		ent.val = page_private(page);
 		id = lookup_swap_cgroup_id(ent);
 		/* mem_cgroup_lookup() must run under rcu_read_lock() */
 		rcu_read_lock();
 		memcg = mem_cgroup_lookup(id);
 		if (memcg && !css_tryget(&memcg->css))
 			memcg = NULL;
 		rcu_read_unlock();
 	}
 	unlock_page_cgroup(pc);
 	return memcg;
 }
 /*
  * Bind @page to @memcg: store the memcg in the page's page_cgroup, mark
  * the page_cgroup Used and update the memcg statistics for @nr_pages.
  *
  * If the page_cgroup is already Used, nothing is committed and the charge
  * is handed back through __mem_cgroup_cancel_charge().
  *
  * @ctype:   only MEM_CGROUP_CHARGE_TYPE_ANON is distinguished here; it
  *           selects the "anon" side of the statistics.
  * @lrucare: the page may already sit on an LRU list; take it off under
  *           zone->lru_lock before pc->mem_cgroup changes and put it back
  *           afterwards.
  */
 static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 				       struct page *page,
 				       unsigned int nr_pages,
 				       enum charge_type ctype,
 				       bool lrucare)
 {
 	struct page_cgroup *pc = lookup_page_cgroup(page);
 	struct zone *uninitialized_var(zone);
 	struct lruvec *lruvec;
 	bool was_on_lru = false;
 	bool anon;
 	lock_page_cgroup(pc);
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
 		__mem_cgroup_cancel_charge(memcg, nr_pages);
 		return;
 	}
 	/*
 	 * we don't need page_cgroup_lock about tail pages, because they are not
 	 * accessed by any other context at this point.
 	 */
 	/*
 	 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
 	 * may already be on some other mem_cgroup's LRU.  Take care of it.
 	 */
 	if (lrucare) {
 		zone = page_zone(page);
 		spin_lock_irq(&zone->lru_lock);
 		if (PageLRU(page)) {
 			/* here pc->mem_cgroup is still the previous value */
 			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
 			ClearPageLRU(page);
 			del_page_from_lru_list(page, lruvec, page_lru(page));
 			was_on_lru = true;
 		}
 	}
 	pc->mem_cgroup = memcg;
 	/*
 	 * We access a page_cgroup asynchronously without lock_page_cgroup().
 	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
 	 * is accessed after testing USED bit. To make pc->mem_cgroup visible
 	 * before USED bit, we need memory barrier here.
 	 * See mem_cgroup_add_lru_list(), etc.
 	 */
 	smp_wmb();
 	SetPageCgroupUsed(pc);
 	if (lrucare) {
 		if (was_on_lru) {
 			/* pc->mem_cgroup now is the new memcg */
 			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
 			VM_BUG_ON(PageLRU(page));
 			SetPageLRU(page);
 			add_page_to_lru_list(page, lruvec, page_lru(page));
 		}
 		spin_unlock_irq(&zone->lru_lock);
 	}
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
 		anon = true;
 	else
 		anon = false;
 	mem_cgroup_charge_statistics(memcg, anon, nr_pages);
 	unlock_page_cgroup(pc);
 	/*
 	 * "charge_statistics" updated event counter. Then, check it.
 	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
 	 * if they exceeds softlimit.
 	 */
 	memcg_check_events(memcg, page);
 }
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
 /*
  * Because tail pages are not marked as "used", set it. We're under
  * zone->lru_lock, 'splitting on pmd' and compound_lock.
  * charge/uncharge will never happen and move_account() is done under
  * compound_lock(), so we don't have to take care of races.
  *
  * Every tail page_cgroup (index 1 .. HPAGE_PMD_NR - 1 after the head's)
  * inherits the head's mem_cgroup and the head's flags, minus the bits in
  * PCGF_NOCOPY_AT_SPLIT. Does nothing when the memory cgroup is disabled.
  */
 void mem_cgroup_split_huge_fixup(struct page *head)
 {
 	struct page_cgroup *head_pc = lookup_page_cgroup(head);
 	struct page_cgroup *pc;
 	int i;
 	if (mem_cgroup_disabled())
 		return;
 	for (i = 1; i < HPAGE_PMD_NR; i++) {
 		pc = head_pc + i;
 		pc->mem_cgroup = head_pc->mem_cgroup;
 		/* order the mem_cgroup store before the flags store */
 		smp_wmb();/* see __commit_charge() */
 		pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
 	}
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 /**
  * mem_cgroup_move_account - move account of the page
  * @page: the page
  * @nr_pages: number of regular pages (>1 for huge pages)
  * @pc:	page_cgroup of the page.
  * @from: mem_cgroup which the page is moved from.
  * @to:	mem_cgroup which the page is moved to. @from != @to.
  *
  * The caller must confirm following.
  * - page is not on LRU (isolate_page() is useful.)
  * - compound_lock is held when nr_pages > 1
  *
  * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
  * from old cgroup.
  *
  * Returns 0 on success, -EBUSY if @nr_pages > 1 but the page is no longer
  * a transparent huge page, -EINVAL if the page_cgroup is not in use or
  * does not belong to @from.
  */
 static int mem_cgroup_move_account(struct page *page,
 				   unsigned int nr_pages,
 				   struct page_cgroup *pc,
 				   struct mem_cgroup *from,
 				   struct mem_cgroup *to)
 {
 	unsigned long flags;
 	int ret;
 	bool anon = PageAnon(page);
 	VM_BUG_ON(from == to);
 	VM_BUG_ON(PageLRU(page));
 	/*
 	 * The page is isolated from LRU. So, collapse function
 	 * will not handle this page. But page splitting can happen.
 	 * Do this check under compound_page_lock(). The caller should
 	 * hold it.
 	 */
 	ret = -EBUSY;
 	if (nr_pages > 1 && !PageTransHuge(page))
 		goto out;
 	lock_page_cgroup(pc);
 	ret = -EINVAL;
 	if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
 		goto unlock;
 	move_lock_mem_cgroup(from, &flags);
 	if (!anon && page_mapped(page)) {
 		/* Update mapped_file data for mem_cgroup */
 		preempt_disable();
 		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		preempt_enable();
 	}
 	/* take the pages out of "from"'s statistics ... */
 	mem_cgroup_charge_statistics(from, anon, -nr_pages);
 	/* caller should have done css_get */
 	pc->mem_cgroup = to;
 	/* ... and account them to "to" */
 	mem_cgroup_charge_statistics(to, anon, nr_pages);
 	/*
 	 * We charge against "to" which may not have any tasks. Then, "to"
 	 * can be under rmdir(). But in current implementation, caller of
 	 * this function is just force_empty() and move charge, so it's
 	 * guaranteed that "to" is never removed. So, we don't check rmdir
 	 * status here.
 	 */
 	move_unlock_mem_cgroup(from, &flags);
 	ret = 0;
 unlock:
 	unlock_page_cgroup(pc);
 	/*
 	 * check events (also reached on the -EINVAL path above)
 	 */
 	memcg_check_events(to, page);
 	memcg_check_events(from, page);
 out:
 	return ret;
 }
 /*
  * Move the charge of @page from @child to its parent (or to
  * root_mem_cgroup when @child has no parent).
  *
  * The page is pinned and isolated from the LRU for the duration of the
  * move; for pages with hpage_nr_pages() > 1 the compound lock is held
  * around the move as well. On success the charge is cancelled locally in
  * @child.
  *
  * Returns -EINVAL if @child is the root cgroup, -EBUSY if the page could
  * not be pinned or isolated, otherwise the result of
  * mem_cgroup_move_account().
  */
 static int mem_cgroup_move_parent(struct page *page,
 				  struct page_cgroup *pc,
 				  struct mem_cgroup *child)
 {
 	struct mem_cgroup *parent;
 	unsigned int nr_pages;
 	unsigned long uninitialized_var(flags);
 	int ret;
 	/* Is ROOT ? */
 	if (mem_cgroup_is_root(child))
 		return -EINVAL;
 	ret = -EBUSY;
 	if (!get_page_unless_zero(page))
 		goto out;
 	if (isolate_lru_page(page))
 		goto put;
 	nr_pages = hpage_nr_pages(page);
 	parent = parent_mem_cgroup(child);
 	/*
 	 * If no parent, move charges to root cgroup.
 	 */
 	if (!parent)
 		parent = root_mem_cgroup;
 	/* "flags" is only written and read when nr_pages > 1 */
 	if (nr_pages > 1)
 		flags = compound_lock_irqsave(page);
 	ret = mem_cgroup_move_account(page, nr_pages,
 				pc, child, parent);
 	if (!ret)
 		__mem_cgroup_cancel_local_charge(child, nr_pages);
 	if (nr_pages > 1)
 		compound_unlock_irqrestore(page, flags);
 	putback_lru_page(page);
 put:
 	put_page(page);
 out:
 	return ret;
 }
 /*
  * Charge the memory controller for page usage.
  *
  * A transparent huge page is charged as 1 << compound_order(page) pages
  * and never triggers the OOM killer.
  *
  * Return
  * 0 if the charge was committed (any try_charge result other than -ENOMEM)
  * -ENOMEM if the try_charge step failed with -ENOMEM
  */
 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask, enum charge_type ctype)
 {
 	struct mem_cgroup *target = NULL;
 	unsigned int count = 1;
 	bool may_oom = true;
 	int err;
 	if (PageTransHuge(page)) {
 		count = 1U << compound_order(page);
 		VM_BUG_ON(!PageTransHuge(page));
 		/*
 		 * Never OOM-kill a process for a huge page.  The
 		 * fault handler will fall back to regular pages.
 		 */
 		may_oom = false;
 	}
 	err = __mem_cgroup_try_charge(mm, gfp_mask, count, &target, may_oom);
 	if (err != -ENOMEM) {
 		__mem_cgroup_commit_charge(target, page, count, ctype, false);
 		err = 0;
 	}
 	return err;
 }
 /*
  * Charge a page as anonymous memory (MEM_CGROUP_CHARGE_TYPE_ANON) against
  * @mm. Returns 0 without doing anything when the memory cgroup is
  * disabled, otherwise the result of mem_cgroup_charge_common().
  */
 int mem_cgroup_newpage_charge(struct page *page,
 			      struct mm_struct *mm, gfp_t gfp_mask)
 {
 	if (!mem_cgroup_disabled()) {
 		/* the page must be unmapped, not file-backed, and mm is required */
 		VM_BUG_ON(page_mapped(page));
 		VM_BUG_ON(page->mapping && !PageAnon(page));
 		VM_BUG_ON(!mm);
 		return mem_cgroup_charge_common(page, mm, gfp_mask,
 						MEM_CGROUP_CHARGE_TYPE_ANON);
 	}
 	return 0;
 }
 static void
 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 					enum charge_type ctype);
 /*
  * Charge a page cache / shmem page.
  *
  * Nothing is charged (and 0 is returned) when the memory cgroup is
  * disabled or the page is a compound page. A NULL @mm is replaced by
  * init_mm. Swap cache pages go through the swap-in try/commit pair, all
  * others through mem_cgroup_charge_common().
  */
 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask)
 {
 	struct mem_cgroup *memcg = NULL;
 	enum charge_type type;
 	int err;
 	if (mem_cgroup_disabled() || PageCompound(page))
 		return 0;
 	if (unlikely(!mm))
 		mm = &init_mm;
 	type = page_is_file_cache(page) ? MEM_CGROUP_CHARGE_TYPE_CACHE
 					: MEM_CGROUP_CHARGE_TYPE_SHMEM;
 	if (!PageSwapCache(page))
 		return mem_cgroup_charge_common(page, mm, gfp_mask, type);
 	/* page is swapcache/shmem */
 	err = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
 	if (err)
 		return err;
 	__mem_cgroup_commit_charge_swapin(page, memcg, type);
 	return 0;
 }
 /*
  * While swap-in, try_charge -> commit or cancel, the page is locked.
  * The memcg that was charged is handed back through *memcgp so that the
  * caller can finish with "commit()" or undo with "cancel()".
  *
  * Returns 0 on success (a -EINTR from the try_charge step is reported as
  * 0 as well), or another error from __mem_cgroup_try_charge().
  */
 int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
 				 struct page *page,
 				 gfp_t mask, struct mem_cgroup **memcgp)
 {
 	struct mem_cgroup *memcg;
 	int ret;
 	*memcgp = NULL;
 	if (mem_cgroup_disabled())
 		return 0;
 	/*
 	 * A racing thread's fault, or swapoff, may have already updated
 	 * the pte, and even removed page from swap cache: in those cases
 	 * do_swap_page()'s pte_same() test will fail; but there's also a
 	 * KSM case which does need to charge the page.
 	 * Hence the PageSwapCache() test before trusting the swap record.
 	 */
 	if (do_swap_account && PageSwapCache(page)) {
 		memcg = try_get_mem_cgroup_from_page(page);
 		if (memcg) {
 			*memcgp = memcg;
 			ret = __mem_cgroup_try_charge(NULL, mask, 1,
 						      memcgp, true);
 			/* drop the reference taken by the lookup above */
 			css_put(&memcg->css);
 			goto out;
 		}
 	}
 	/* no usable swap record: charge the current mm instead */
 	if (unlikely(!mm))
 		mm = &init_mm;
 	ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
 out:
 	return ret == -EINTR ? 0 : ret;
 }
 static void
 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
 					enum charge_type ctype)
 {
 	if (mem_cgroup_disabled())
 		return;
 	if (!memcg)
 		return;
 	cgroup_exclude_rmdir(&memcg->css);
 	__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
 	/*
 	 * Now swap is on-memory. This means this page may be
 	 * counted both as mem and swap....double count.
 	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
 	 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
 	 * may call delete_from_swap_cache() before reach here.
 	 */
 	if (do_swap_account && PageSwapCache(page)) {
 		swp_entry_t ent = {.val = page_private(page)};
 		mem_cgroup_uncharge_swap(ent);
 	}
 	/*
 	 * At swapin, we may charge account against cgroup which has no tasks.
 	 * So, rmdir()->pre_destroy() can be called while we do this charge.
 	 * In that case, we need to call pre_destroy() again. check it here.
 	 */
 	cgroup_release_and_wakeup_rmdir(&memcg->css);
 }
 void mem_cgroup_commit_charge_swapin(struct page *page,
 				     struct mem_cgroup *memcg)
 {
 	__mem_cgroup_commit_charge_swapin(page, memcg,
 					  MEM_CGROUP_CHARGE_TYPE_ANON);
 }
 /*
  * Undo a one-page charge made for swap-in. Does nothing when the memory
  * cgroup is disabled or @memcg is NULL.
  */
 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
 {
 	if (!mem_cgroup_disabled() && memcg)
 		__mem_cgroup_cancel_charge(memcg, 1);
 }
 /*
  * Give @nr_pages back to @memcg's counters, either right away or by
  * adding them to the current task's uncharge batch (current->memcg_batch),
  * which mem_cgroup_uncharge_end() flushes later.
  *
  * The "memsw" counter is left alone when swap accounting is off or when
  * @ctype is MEM_CGROUP_CHARGE_TYPE_SWAPOUT.
  */
 static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
 				   unsigned int nr_pages,
 				   const enum charge_type ctype)
 {
 	struct memcg_batch_info *batch = NULL;
 	bool uncharge_memsw = true;
 	/* If swapout, usage of swap doesn't decrease */
 	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
 		uncharge_memsw = false;
 	batch = &current->memcg_batch;
 	/*
 	 * In usual, we do css_get() when we remember memcg pointer.
 	 * But in this case, we keep res->usage until end of a series of
 	 * uncharges. Then, it's ok to ignore memcg's refcnt.
 	 */
 	if (!batch->memcg)
 		batch->memcg = memcg;
 	/*
 	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
 	 * In those cases, all pages freed continuously can be expected to be in
 	 * the same cgroup and we have chance to coalesce uncharges.
 	 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
 	 * because we want to do uncharge as soon as possible.
 	 */
 	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
 		goto direct_uncharge;
 	/* only single pages are batched */
 	if (nr_pages > 1)
 		goto direct_uncharge;
 	/*
 	 * In typical case, batch->memcg == memcg. This means we can
 	 * merge a series of uncharges to an uncharge of res_counter.
 	 * If not, we uncharge res_counter one by one.
 	 */
 	if (batch->memcg != memcg)
 		goto direct_uncharge;
 	/* remember freed charge and uncharge it later */
 	batch->nr_pages++;
 	if (uncharge_memsw)
 		batch->memsw_nr_pages++;
 	return;
 direct_uncharge:
 	res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
 	if (uncharge_memsw)
 		res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
 	/*
 	 * memcg_oom_recover() is called here only for a memcg other than the
 	 * batch's one.
 	 */
 	if (unlikely(batch->memcg != memcg))
 		memcg_oom_recover(memcg);
 }
 /*
  * uncharge if !page_mapped(page)
  *
  * Common uncharge path: clears the Used bit of the page's page_cgroup,
  * updates the memcg statistics and, unless @end_migration is set or the
  * memcg is the root one, gives the charge back via
  * mem_cgroup_do_uncharge().
  *
  * @page:          page to uncharge; must not be in swap cache.
  * @ctype:         charge type; selects which sanity checks below apply.
  * @end_migration: true when called from mem_cgroup_end_migration().
  *
  * Returns the memcg the page was accounted to, or NULL if nothing was
  * uncharged (memory cgroup disabled, page_cgroup not in use, or one of
  * the @ctype specific checks refused).
  */
 static struct mem_cgroup *
 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
 			     bool end_migration)
 {
 	struct mem_cgroup *memcg = NULL;
 	unsigned int nr_pages = 1;
 	struct page_cgroup *pc;
 	bool anon;
 	if (mem_cgroup_disabled())
 		return NULL;
 	VM_BUG_ON(PageSwapCache(page));
 	if (PageTransHuge(page)) {
 		nr_pages <<= compound_order(page);
 		VM_BUG_ON(!PageTransHuge(page));
 	}
 	/*
 	 * Check if our page_cgroup is valid
 	 */
 	pc = lookup_page_cgroup(page);
 	/* unlocked early check; repeated under the lock below */
 	if (unlikely(!PageCgroupUsed(pc)))
 		return NULL;
 	lock_page_cgroup(pc);
 	memcg = pc->mem_cgroup;
 	if (!PageCgroupUsed(pc))
 		goto unlock_out;
 	anon = PageAnon(page);
 	switch (ctype) {
 	case MEM_CGROUP_CHARGE_TYPE_ANON:
 		/*
 		 * Generally PageAnon tells if it's the anon statistics to be
 		 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
 		 * used before page reached the stage of being marked PageAnon.
 		 */
 		anon = true;
 		/* fallthrough */
 	case MEM_CGROUP_CHARGE_TYPE_DROP:
 		/* See mem_cgroup_prepare_migration() */
 		if (page_mapped(page))
 			goto unlock_out;
 		/*
 		 * Pages under migration may not be uncharged.  But
 		 * end_migration() /must/ be the one uncharging the
 		 * unused post-migration page and so it has to call
 		 * here with the migration bit still set.  See the
 		 * res_counter handling below.
 		 */
 		if (!end_migration && PageCgroupMigration(pc))
 			goto unlock_out;
 		break;
 	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
 		if (!PageAnon(page)) {	/* Shared memory */
 			if (page->mapping && !page_is_file_cache(page))
 				goto unlock_out;
 		} else if (page_mapped(page)) /* Anon */
 				goto unlock_out;
 		break;
 	default:
 		break;
 	}
 	mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
 	ClearPageCgroupUsed(pc);
 	/*
 	 * pc->mem_cgroup is not cleared here. It will be accessed when it's
 	 * freed from LRU. This is safe because uncharged page is expected not
 	 * to be reused (freed soon). Exception is SwapCache, it's handled by
 	 * special functions.
 	 */
 	unlock_page_cgroup(pc);
 	/*
 	 * even after unlock, we have memcg->res.usage here and this memcg
 	 * will never be freed.
 	 */
 	memcg_check_events(memcg, page);
 	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
 		mem_cgroup_swap_statistics(memcg, true);
 		mem_cgroup_get(memcg);
 	}
 	/*
 	 * Migration does not charge the res_counter for the
 	 * replacement page, so leave it alone when phasing out the
 	 * page that is unused after the migration.
 	 */
 	if (!end_migration && !mem_cgroup_is_root(memcg))
 		mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
 	return memcg;
 unlock_out:
 	unlock_page_cgroup(pc);
 	return NULL;
 }
 /*
  * Uncharge a page as MEM_CGROUP_CHARGE_TYPE_ANON. Pages that are still
  * mapped or that sit in the swap cache are left alone.
  */
 void mem_cgroup_uncharge_page(struct page *page)
 {
 	/* early check. */
 	if (!page_mapped(page)) {
 		VM_BUG_ON(page->mapping && !PageAnon(page));
 		if (!PageSwapCache(page))
 			__mem_cgroup_uncharge_common(page,
 					MEM_CGROUP_CHARGE_TYPE_ANON, false);
 	}
 }
 /*
  * Uncharge a page as MEM_CGROUP_CHARGE_TYPE_CACHE.
  *
  * The page must already be unmapped and detached from its mapping (both
  * are asserted). No PageSwapCache() test is made here;
  * __mem_cgroup_uncharge_common() asserts that itself.
  */
 void mem_cgroup_uncharge_cache_page(struct page *page)
 {
 	VM_BUG_ON(page_mapped(page));
 	VM_BUG_ON(page->mapping);
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
 }
 /*
  * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
  * In that cases, pages are freed continuously and we can expect pages
  * are in the same memcg. All these calls itself limits the number of
  * pages freed at once, then uncharge_start/end() is called properly.
  * This may be called prural(2) times in a context,
  */
 void mem_cgroup_uncharge_start(void)
 {
 	current->memcg_batch.do_batch++;
 	/* We can do nest. */
 	if (current->memcg_batch.do_batch == 1) {
 		current->memcg_batch.memcg = NULL;
 		current->memcg_batch.nr_pages = 0;
 		current->memcg_batch.memsw_nr_pages = 0;
 	}
 }
 void mem_cgroup_uncharge_end(void)
 {
 	struct memcg_batch_info *batch = &current->memcg_batch;
 	if (!batch->do_batch)
 		return;
 	batch->do_batch--;
 	if (batch->do_batch) /* If stacked, do nothing. */
 		return;
 	if (!batch->memcg)
 		return;
 	/*
 	 * This "batch->memcg" is valid without any css_get/put etc...
 	 * bacause we hide charges behind us.
 	 */
 	if (batch->nr_pages)
 		res_counter_uncharge(&batch->memcg->res,
 				     batch->nr_pages * PAGE_SIZE);
 	if (batch->memsw_nr_pages)
 		res_counter_uncharge(&batch->memcg->memsw,
 				     batch->memsw_nr_pages * PAGE_SIZE);
 	memcg_oom_recover(batch->memcg);
 	/* forget this pointer (for sanity check) */
 	batch->memcg = NULL;
 }
 #ifdef CONFIG_SWAP
 /*
  * called after __delete_from_swap_cache() and drop "page" account.
  * memcg information is recorded to swap_cgroup of "ent"
  *
  * @page:    page that was just removed from the swap cache.
  * @ent:     swap entry the page was stored under.
  * @swapout: true if the page really went out to swap
  *           (MEM_CGROUP_CHARGE_TYPE_SWAPOUT); false if the swap entry
  *           turned out to be unused (MEM_CGROUP_CHARGE_TYPE_DROP).
  *
  * No PageSwapCache() test is made here; __mem_cgroup_uncharge_common()
  * asserts that itself.
  */
 void
 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 {
 	struct mem_cgroup *memcg;
 	int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
 	if (!swapout) /* this was a swap cache but the swap is unused ! */
 		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
 	memcg = __mem_cgroup_uncharge_common(page, ctype, false);
 	/*
 	 * record memcg information,  if swapout && memcg != NULL,
 	 * mem_cgroup_get() was called in uncharge().
 	 */
 	if (do_swap_account && swapout && memcg)
 		swap_cgroup_record(ent, css_id(&memcg->css));
 }
 #endif
 #ifdef CONFIG_MEMCG_SWAP
 /*
  * called from swap_entry_free(). Clears the swap_cgroup record of "ent"
  * and uncharges the "memsw" account of the memcg that was recorded there.
  */
 void mem_cgroup_uncharge_swap(swp_entry_t ent)
 {
 	struct mem_cgroup *owner;
 	unsigned short old_id;
 	if (!do_swap_account)
 		return;
 	/* writing ID 0 hands back the ID recorded so far */
 	old_id = swap_cgroup_record(ent, 0);
 	rcu_read_lock();
 	owner = mem_cgroup_lookup(old_id);
 	if (!owner)
 		goto unlock;
 	/*
 	 * We uncharge this because swap is freed.
 	 * This memcg can be obsolete one. We avoid calling css_tryget
 	 */
 	if (!mem_cgroup_is_root(owner))
 		res_counter_uncharge(&owner->memsw, PAGE_SIZE);
 	mem_cgroup_swap_statistics(owner, false);
 	mem_cgroup_put(owner);
 unlock:
 	rcu_read_unlock();
 }
 /**
  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
  * @entry: swap entry to be moved
  * @from:  mem_cgroup which the entry is moved from
  * @to:  mem_cgroup which the entry is moved to
  *
  * It succeeds only when the swap_cgroup's record for this entry is the same
  * as the mem_cgroup's id of @from.
  *
  * Returns 0 on success, -EINVAL on failure.
  *
  * The caller must have charged to @to, IOW, called res_counter_charge() about
  * both res and memsw, and called css_get().
  */
 static int mem_cgroup_move_swap_account(swp_entry_t entry,
 				struct mem_cgroup *from, struct mem_cgroup *to)
 {
 	unsigned short from_id = css_id(&from->css);
 	unsigned short to_id = css_id(&to->css);
 	/* the record has to name @from for the exchange to count as success */
 	if (swap_cgroup_cmpxchg(entry, from_id, to_id) != from_id)
 		return -EINVAL;
 	mem_cgroup_swap_statistics(from, false);
 	mem_cgroup_swap_statistics(to, true);
 	/*
 	 * This function is only called from task migration context now.
 	 * It postpones res_counter and refcount handling till the end
 	 * of task migration(mem_cgroup_clear_mc()) for performance
 	 * improvement. But we cannot postpone mem_cgroup_get(to)
 	 * because if the process that has been moved to @to does
 	 * swap-in, the refcount of @to might be decreased to 0.
 	 */
 	mem_cgroup_get(to);
 	return 0;
 }
 #else
 /*
  * Stub for builds without CONFIG_MEMCG_SWAP: there is no swap record to
  * move, so every call reports -EINVAL.
  */
 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
 				struct mem_cgroup *from, struct mem_cgroup *to)
 {
 	return -EINVAL;
 }
 #endif
 /*
  * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
  * page belongs to.
  *
  * @page:    the page about to be migrated (must not be a transparent huge
  *           page).
  * @newpage: the replacement page.
  * @memcgp:  out: the memcg of @page with a css reference held, or NULL
  *           if @page is not charged or the memory cgroup is disabled.
  */
 void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
 				  struct mem_cgroup **memcgp)
 {
 	struct mem_cgroup *memcg = NULL;
 	struct page_cgroup *pc;
 	enum charge_type ctype;
 	*memcgp = NULL;
 	VM_BUG_ON(PageTransHuge(page));
 	if (mem_cgroup_disabled())
 		return;
 	pc = lookup_page_cgroup(page);
 	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc)) {
 		memcg = pc->mem_cgroup;
 		css_get(&memcg->css);
 		/*
 		 * At migrating an anonymous page, its mapcount goes down
 		 * to 0 and uncharge() will be called. But, even if it's fully
 		 * unmapped, migration may fail and this page has to be
 		 * charged again. We set MIGRATION flag here and delay uncharge
 		 * until end_migration() is called
 		 *
 		 * Corner Case Thinking
 		 * A)
 		 * When the old page was mapped as Anon and it's unmap-and-freed
 		 * while migration was ongoing.
 		 * If unmap finds the old page, uncharge() of it will be delayed
 		 * until end_migration(). If unmap finds a new page, it's
 		 * uncharged when it make mapcount to be 1->0. If unmap code
 		 * finds swap_migration_entry, the new page will not be mapped
 		 * and end_migration() will find it(mapcount==0).
 		 *
 		 * B)
 		 * When the old page was mapped but migration fails, the kernel
 		 * remaps it. A charge for it is kept by MIGRATION flag even
 		 * if mapcount goes down to 0. We can do remap successfully
 		 * without charging it again.
 		 *
 		 * C)
 		 * The "old" page is under lock_page() until the end of
 		 * migration, so, the old page itself will not be swapped-out.
 		 * If the new page is swapped out before end_migration, our
 		 * hook to usual swap-out path will catch the event.
 		 */
 		if (PageAnon(page))
 			SetPageCgroupMigration(pc);
 	}
 	unlock_page_cgroup(pc);
 	/*
 	 * If the page is not charged at this point,
 	 * we return here.
 	 */
 	if (!memcg)
 		return;
 	*memcgp = memcg;
 	/*
 	 * We charge new page before it's used/mapped. So, even if unlock_page()
 	 * is called before end_migration, we can catch all events on this new
 	 * page. In the case new page is migrated but not remapped, new page's
 	 * mapcount will be finally 0 and we call uncharge in end_migration().
 	 */
 	if (PageAnon(page))
 		ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
 	else if (page_is_file_cache(page))
 		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
 	else
 		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
 	/*
 	 * The page is committed to the memcg, but it's not actually
 	 * charged to the res_counter since we plan on replacing the
 	 * old one and only one page is going to be left afterwards.
 	 */
 	__mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
 }
 /*
  * Finish the memcg side of a page migration: remove the redundant charge.
  *
  * @memcg:        memcg handed out by mem_cgroup_prepare_migration(); NULL
  *                means there is nothing to do.
  * @oldpage:      the page that was being migrated.
  * @newpage:      the replacement page.
  * @migration_ok: true if the migration succeeded, i.e. @newpage is the
  *                page that stays in use; false if @oldpage stays.
  *
  * The page that is not used any more is uncharged unconditionally; no
  * PageSwapCache() test is made here. The css reference on @memcg is
  * dropped.
  */
 void mem_cgroup_end_migration(struct mem_cgroup *memcg,
 	struct page *oldpage, struct page *newpage, bool migration_ok)
 {
 	struct page *used, *unused;
 	struct page_cgroup *pc;
 	bool anon;
 	if (!memcg)
 		return;
 	/* blocks rmdir() */
 	cgroup_exclude_rmdir(&memcg->css);
 	if (!migration_ok) {
 		used = oldpage;
 		unused = newpage;
 	} else {
 		used = newpage;
 		unused = oldpage;
 	}
 	anon = PageAnon(used);
 	/* end_migration == true: the res_counter is not touched for "unused" */
 	__mem_cgroup_uncharge_common(unused,
 				     anon ? MEM_CGROUP_CHARGE_TYPE_ANON
 				     : MEM_CGROUP_CHARGE_TYPE_CACHE,
 				     true);
 	css_put(&memcg->css);
 	/*
 	 * We disallowed uncharge of pages under migration because mapcount
 	 * of the page goes down to zero, temporarily.
 	 * Clear the flag and check the page should be charged.
 	 */
 	pc = lookup_page_cgroup(oldpage);
 	lock_page_cgroup(pc);
 	ClearPageCgroupMigration(pc);
 	unlock_page_cgroup(pc);
 	/*
 	 * If a page is a file cache, radix-tree replacement is very atomic
 	 * and we can skip this check. When it was an Anon page, its mapcount
 	 * goes down to 0. But because we added MIGRATION flag, it's not
 	 * uncharged yet. There are several cases but page->mapcount check
 	 * and USED bit check in mem_cgroup_uncharge_page() will do enough
 	 * check. (see prepare_charge() also)
 	 */
 	if (anon)
 		mem_cgroup_uncharge_page(used);
 	/*
 	 * At migration, we may charge account against cgroup which has no
 	 * tasks.
 	 * So, rmdir()->pre_destroy() can be called while we do this charge.
 	 * In that case, we need to call pre_destroy() again. check it here.
 	 */
 	cgroup_release_and_wakeup_rmdir(&memcg->css);
 }
 /*
  * At replace page cache, newpage is not under any memcg but it's on
  * LRU. So, this function doesn't touch res_counter but handles LRU
  * in correct way. Both pages are locked so we cannot race with uncharge.
  *
  * The accounting of @oldpage (statistics and Used bit) is dropped and
  * @newpage is committed to the same memcg as one page. Nothing happens if
  * the memory cgroup is disabled or @oldpage was not charged.
  */
 void mem_cgroup_replace_page_cache(struct page *oldpage,
 				  struct page *newpage)
 {
 	struct mem_cgroup *memcg = NULL;
 	struct page_cgroup *pc;
 	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
 	if (mem_cgroup_disabled())
 		return;
 	pc = lookup_page_cgroup(oldpage);
 	/* fix accounting on old pages */
 	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc)) {
 		memcg = pc->mem_cgroup;
 		mem_cgroup_charge_statistics(memcg, false, -1);
 		ClearPageCgroupUsed(pc);
 	}
 	unlock_page_cgroup(pc);
 	/*
 	 * When called from shmem_replace_page(), in some cases the
 	 * oldpage has already been charged, and in some cases not.
 	 */
 	if (!memcg)
 		return;
 	if (PageSwapBacked(oldpage))
 		type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
 	/*
 	 * Even if newpage->mapping was NULL before starting replacement,
 	 * the newpage may be on LRU(or pagevec for LRU) already. We lock
 	 * LRU while we overwrite pc->mem_cgroup.
 	 */
 	__mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
 }
 #ifdef CONFIG_DEBUG_VM
 /*
  * Return the page_cgroup of @page if it exists and is marked Used,
  * NULL otherwise.
  */
 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
 {
 	struct page_cgroup *pc = lookup_page_cgroup(page);
 	/*
 	 * Can be NULL while feeding pages into the page allocator for
 	 * the first time, i.e. during boot or memory hotplug;
 	 * or when mem_cgroup_disabled().
 	 */
 	if (unlikely(!pc))
 		return NULL;
 	return PageCgroupUsed(pc) ? pc : NULL;
 }
 bool mem_cgroup_bad_page_check(struct page *page)
 {
 	if (mem_cgroup_disabled())
 		return false;
 	return lookup_page_cgroup_used(page) != NULL;
 }
 /*
  * Log the page_cgroup pointer, flags and owning mem_cgroup of @page at
  * KERN_ALERT level, if the page has a page_cgroup that is marked as used.
  */
 void mem_cgroup_print_bad_page(struct page *page)
 {
 	struct page_cgroup *pc = lookup_page_cgroup_used(page);

 	if (!pc)
 		return;

 	printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
 	       pc, pc->flags, pc->mem_cgroup);
 }
 #endif
 /*
  * Taken by mem_cgroup_resize_limit() and mem_cgroup_resize_memsw_limit()
  * around reading the other counter's limit and setting the new limit, so
  * that memcg->res.limit <= memcg->memsw.limit is checked and updated as
  * one step.
  */
 static DEFINE_MUTEX(set_limit_mutex);
 /*
  * Set the memory (res) limit of @memcg to @val.
  *
  * The new limit may not exceed the current memsw limit. If the counter
  * refuses the new limit, reclaim is run against the group and the attempt
  * is repeated until it succeeds, a signal is pending, or the retry budget
  * is used up.
  *
  * Returns 0 on success, -EINTR if a signal is pending, -EINVAL if @val is
  * above the memsw limit, otherwise the error from the last
  * res_counter_set_limit() attempt.
  */
 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 				unsigned long long val)
 {
 	int nr_children = mem_cgroup_count_children(memcg);
 	int retries_left;
 	int grew = 0;
 	int ret = 0;
 	u64 prev_usage, cur_usage;
 	u64 res_limit, memsw_limit;

 	/*
 	 * To keep hierarchical reclaim simple the caller decides how long
 	 * to retry; the budget here scales with the number of children
 	 * visited by the reclaim loop.
 	 */
 	retries_left = MEM_CGROUP_RECLAIM_RETRIES * nr_children;
 	prev_usage = res_counter_read_u64(&memcg->res, RES_USAGE);

 	while (retries_left) {
 		if (signal_pending(current)) {
 			ret = -EINTR;
 			break;
 		}

 		/*
 		 * Open coded on purpose so the invariant is visible:
 		 * memcg->res.limit <= memcg->memsw.limit must always hold.
 		 */
 		mutex_lock(&set_limit_mutex);
 		memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
 		if (val > memsw_limit) {
 			mutex_unlock(&set_limit_mutex);
 			ret = -EINVAL;
 			break;
 		}

 		res_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
 		if (val > res_limit)
 			grew = 1;

 		ret = res_counter_set_limit(&memcg->res, val);
 		if (ret == 0)
 			memcg->memsw_is_minimum = (memsw_limit == val);
 		mutex_unlock(&set_limit_mutex);

 		if (ret == 0)
 			break;

 		mem_cgroup_reclaim(memcg, GFP_KERNEL,
 				   MEM_CGROUP_RECLAIM_SHRINK);

 		/* Only burn a retry when reclaim did not lower the usage. */
 		cur_usage = res_counter_read_u64(&memcg->res, RES_USAGE);
 		if (cur_usage < prev_usage)
 			prev_usage = cur_usage;
 		else
 			retries_left--;
 	}

 	if (ret == 0 && grew)
 		memcg_oom_recover(memcg);

 	return ret;
 }
 /*
  * Set the memory+swap (memsw) limit of @memcg to @val.
  *
  * The new limit may not be below the current memory limit. If the counter
  * refuses the new limit, reclaim (without swapping) is run against the
  * group and the attempt is repeated until it succeeds, a signal is
  * pending, or the retry budget is used up.
  *
  * Returns 0 on success, -EINTR if a signal is pending, -EINVAL if @val is
  * below the memory limit, otherwise the error from the last
  * res_counter_set_limit() attempt (-EBUSY if no attempt was made).
  */
 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 					unsigned long long val)
 {
 	int nr_children = mem_cgroup_count_children(memcg);
 	int retries_left;
 	int grew = 0;
 	int ret = -EBUSY;
 	u64 prev_usage, cur_usage;
 	u64 res_limit, memsw_limit;

 	/* Same retry budget policy as mem_cgroup_resize_limit(). */
 	retries_left = nr_children * MEM_CGROUP_RECLAIM_RETRIES;
 	prev_usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);

 	while (retries_left) {
 		if (signal_pending(current)) {
 			ret = -EINTR;
 			break;
 		}

 		/*
 		 * Open coded on purpose so the invariant is visible:
 		 * memcg->res.limit <= memcg->memsw.limit must always hold.
 		 */
 		mutex_lock(&set_limit_mutex);
 		res_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
 		if (val < res_limit) {
 			mutex_unlock(&set_limit_mutex);
 			ret = -EINVAL;
 			break;
 		}

 		memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
 		if (val > memsw_limit)
 			grew = 1;

 		ret = res_counter_set_limit(&memcg->memsw, val);
 		if (ret == 0)
 			memcg->memsw_is_minimum = (res_limit == val);
 		mutex_unlock(&set_limit_mutex);

 		if (ret == 0)
 			break;

 		mem_cgroup_reclaim(memcg, GFP_KERNEL,
 				   MEM_CGROUP_RECLAIM_NOSWAP |
 				   MEM_CGROUP_RECLAIM_SHRINK);

 		/* Only burn a retry when reclaim did not lower the usage. */
 		cur_usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
 		if (cur_usage < prev_usage)
 			prev_usage = cur_usage;
 		else
 			retries_left--;
 	}

 	if (ret == 0 && grew)
 		memcg_oom_recover(memcg);

 	return ret;
 }
 /*
  * Run soft limit reclaim for @zone.
  *
  * Repeatedly picks a node from the zone's soft limit tree, calls
  * mem_cgroup_soft_reclaim() on its memcg, and then re-inserts the node into
  * the tree according to its current soft limit excess. The loop stops as
  * soon as something was reclaimed, when no further candidate exists, or
  * after more than MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS fruitless rounds.
  *
  * @zone:          zone whose soft limit tree is used
  * @order:         any value above 0 makes this function return 0 at once
  * @gfp_mask:      passed through to mem_cgroup_soft_reclaim()
  * @total_scanned: incremented by the scan count reported by each round
  *
  * Returns the sum of the values returned by mem_cgroup_soft_reclaim().
  */
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 					    gfp_t gfp_mask,
 					    unsigned long *total_scanned)
 {
 	unsigned long nr_reclaimed = 0;
 	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
 	unsigned long reclaimed;
 	int loop = 0;
 	struct mem_cgroup_tree_per_zone *mctz;
 	unsigned long long excess;
 	unsigned long nr_scanned;
 	if (order > 0)
 		return 0;
 	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
 	/*
 	 * This loop can run a while, especially if mem_cgroups continuously
 	 * keep exceeding their soft limit and putting the system under
 	 * pressure
 	 */
 	do {
 		/* Reuse the candidate found in the previous round, if any. */
 		if (next_mz)
 			mz = next_mz;
 		else
 			mz = mem_cgroup_largest_soft_limit_node(mctz);
 		if (!mz)
 			break;
 		nr_scanned = 0;
 		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
 						    gfp_mask, &nr_scanned);
 		nr_reclaimed += reclaimed;
 		*total_scanned += nr_scanned;
 		spin_lock(&mctz->lock);
 		/*
 		 * If we failed to reclaim anything from this memory cgroup
 		 * it is time to move on to the next cgroup
 		 */
 		next_mz = NULL;
 		if (!reclaimed) {
 			do {
 				/*
 				 * Loop until we find yet another one.
 				 *
 				 * By the time we get the soft_limit lock
 				 * again, someone might have added the
 				 * group back on the RB tree. Iterate to
 				 * make sure we get a different mem.
 				 * mem_cgroup_largest_soft_limit_node returns
 				 * NULL if no other cgroup is present on
 				 * the tree
 				 */
 				next_mz =
 				__mem_cgroup_largest_soft_limit_node(mctz);
 				/*
 				 * NOTE(review): the lookup presumably takes a
 				 * css reference on the returned memcg; it is
 				 * dropped here when we got the same node back.
 				 */
 				if (next_mz == mz)
 					css_put(&next_mz->memcg->css);
 				else /* next_mz == NULL or other memcg */
 					break;
 			} while (1);
 		}
 		/* Re-sort mz in the tree according to its current excess. */
 		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
 		excess = res_counter_soft_limit_excess(&mz->memcg->res);
 		/*
 		 * One school of thought says that we should not add
 		 * back the node to the tree if reclaim returns 0.
 		 * But our reclaim could return 0, simply because due
 		 * to priority we are exposing a smaller subset of
 		 * memory to reclaim from. Consider this as a longer
 		 * term TODO.
 		 */
 		/* If excess == 0, no tree ops */
 		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
 		spin_unlock(&mctz->lock);
 		/* Done with mz for this round; drop our reference on it. */
 		css_put(&mz->memcg->css);
 		loop++;
 		/*
 		 * Could not reclaim anything and there are no more
 		 * mem cgroups to try or we seem to be looping without
 		 * reclaiming anything.
 		 */
 		if (!nr_reclaimed &&
 			(next_mz == NULL ||
 			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
 			break;
 	} while (!nr_reclaimed);
 	/* A candidate picked for a round that never ran still holds a ref. */
 	if (next_mz)
 		css_put(&next_mz->memcg->css);
 	return nr_reclaimed;
 }
 /*
  * Traverse a specified page_cgroup list and try to drop them all.  This doesn't
  * reclaim the pages page themselves - it just removes the page_cgroups.
  * Returns true if some page_cgroups were not freed, indicating that the caller
  * must retry this operation.
  */
 static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 				int node, int zid, enum lru_list lru)
 {
 	struct mem_cgroup_per_zone *mz;
 	unsigned long flags, loop;
 	struct list_head *list;
 	struct page *busy;
 	struct zone *zone;
 	zone = &NODE_DATA(node)->node_zones[zid];
 	mz = mem_cgroup_zoneinfo(memcg, node, zid);
 	list = &mz->lruvec.lists[lru];
 	loop = mz->lru_size[lru];
 	/* give some margin against EBUSY etc...*/
 	loop += 256;
 	busy = NULL;
 	while (loop--) {
 		struct page_cgroup *pc;
 		struct page *page;
 		spin_lock_irqsave(&zone->lru_lock, flags);
 		if (list_empty(list)) {
 			spin_unlock_irqrestore(&zone->lru_lock, flags);
 			break;
 		}
 		page = list_entry(list->prev, struct page, lru);
 		if (busy == page) {
 			list_move(&page->lru, list);
 			busy = NULL;
 			spin_unlock_irqrestore(&zone->lru_lock, flags);
 			continue;
 		}
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 		pc = lookup_page_cgroup(page);
 		if (mem_cgroup_move_parent(page, pc, memcg)) {
 			/* found lock contention or "pc" is obsolete. */
 			busy = page;
 			cond_resched();
 		} else
 			busy = NULL;
 	}
 	return !list_empty(list);
 }
 /*
  * Make mem_cgroup's charge 0 if there is no task.
  * This enables deleting this mem_cgroup.
  *
  * @memcg:    group to empty
  * @free_all: if true, first try to reclaim the group's pages
  *            (try_to_free label) before moving the remaining charges to
  *            the parent (move_account label); if false, only move charges
  *
  * Returns 0 once the res usage reached 0 and all LRU lists were emptied,
  * -EBUSY if the cgroup has tasks or children, and -EINTR if a signal is
  * pending during the reclaim phase.
  */
 static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
 {
 	int ret;
 	int node, zid, shrink;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct cgroup *cgrp = memcg->css.cgroup;
 	/* Hold a css reference for the whole operation; dropped at "out". */
 	css_get(&memcg->css);
 	shrink = 0;
 	/* should free all ? */
 	if (free_all)
 		goto try_to_free;
 move_account:
 	do {
 		ret = -EBUSY;
 		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
 			goto out;
 		/* This is for making all *used* pages to be on LRU. */
 		lru_add_drain_all();
 		drain_all_stock_sync(memcg);
 		ret = 0;
 		mem_cgroup_start_move(memcg);
 		/* Empty every LRU list of every zone; stop at the first failure. */
 		for_each_node_state(node, N_HIGH_MEMORY) {
 			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
 				enum lru_list lru;
 				for_each_lru(lru) {
 					ret = mem_cgroup_force_empty_list(memcg,
 							node, zid, lru);
 					if (ret)
 						break;
 				}
 			}
 			if (ret)
 				break;
 		}
 		mem_cgroup_end_move(memcg);
 		memcg_oom_recover(memcg);
 		cond_resched();
 	/* "ret" should also be checked to ensure all lists are empty. */
 	} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
 out:
 	css_put(&memcg->css);
 	return ret;
 try_to_free:
 	/* returns EBUSY if there is a task or if we come here twice. */
 	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
 		ret = -EBUSY;
 		goto out;
 	}
 	/* we call try-to-free pages to make this cgroup empty */
 	lru_add_drain_all();
 	/* try to free all pages in this cgroup */
 	shrink = 1;
 	while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
 		int progress;
 		if (signal_pending(current)) {
 			ret = -EINTR;
 			goto out;
 		}
 		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
 						false);
 		/* A retry is only consumed when reclaim made no progress. */
 		if (!progress) {
 			nr_retries--;
 			/* maybe some writeback is necessary */
 			congestion_wait(BLK_RW_ASYNC, HZ/10);
 		}
 	}
 	lru_add_drain();
 	/* try move_account...there may be some *locked* pages. */
 	goto move_account;
 }
 /*
  * Trigger handler of the "force_empty" file: empty the memcg behind @cont
  * with free_all set. @event is unused.
  */
 static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

 	return mem_cgroup_force_empty(memcg, true);
 }
 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
 {
 	return mem_cgroup_from_cont(cont)->use_hierarchy;
 }
 /*
  * Write handler of "use_hierarchy".
  *
  * Writing the value that is already set is a no-op. Otherwise the flag can
  * only be changed to 0 or 1, only if the parent (if any) does not use
  * hierarchy, and only while this cgroup has no children.
  *
  * Returns 0 on success, -EINVAL for a bad value or a hierarchical parent,
  * -EBUSY if the cgroup has children.
  */
 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
 					u64 val)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	struct mem_cgroup *parent_memcg = NULL;
 	int retval = 0;

 	/* The root cgroup has no parent; parent_memcg stays NULL for it. */
 	if (cont->parent)
 		parent_memcg = mem_cgroup_from_cont(cont->parent);

 	cgroup_lock();
 	if (memcg->use_hierarchy != val) {
 		if (val > 1 || (parent_memcg && parent_memcg->use_hierarchy)) {
 			/*
 			 * Bad value, or the parent's use_hierarchy is set and
 			 * the child subtrees must not be modified.
 			 */
 			retval = -EINVAL;
 		} else if (!list_empty(&cont->children)) {
 			retval = -EBUSY;
 		} else {
 			memcg->use_hierarchy = val;
 		}
 	}
 	cgroup_unlock();

 	return retval;
 }
 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
 					       enum mem_cgroup_stat_index idx)
 {
 	struct mem_cgroup *iter;
 	long val = 0;
 	/* Per-cpu values can be negative, use a signed accumulator */
 	for_each_mem_cgroup_tree(iter, memcg)
 		val += mem_cgroup_read_stat(iter, idx);
 	if (val < 0) /* race ? */
 		val = 0;
 	return val;
 }
 /*
  * Return the usage of @memcg in bytes: memory+swap if @swap is true,
  * memory only otherwise.
  *
  * For non-root groups this is the RES_USAGE value of the matching
  * res_counter. For the root group it is computed from the recursive
  * CACHE and RSS (and, with @swap, SWAP) statistics.
  */
 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 {
 	u64 pages;

 	if (mem_cgroup_is_root(memcg)) {
 		pages = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
 		pages += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
 		if (swap)
 			pages += mem_cgroup_recursive_stat(memcg,
 							   MEM_CGROUP_STAT_SWAP);
 		return pages << PAGE_SHIFT;
 	}

 	if (swap)
 		return res_counter_read_u64(&memcg->memsw, RES_USAGE);
 	return res_counter_read_u64(&memcg->res, RES_USAGE);
 }
 /*
  * Read handler shared by the res_counter backed files.
  *
  * cft->private encodes which counter (_MEM or _MEMSWAP) and which
  * attribute is requested. The value is formatted as a decimal number
  * followed by a newline and copied to @buf.
  *
  * Returns the result of simple_read_from_buffer(), or -EOPNOTSUPP for a
  * memsw file while swap accounting is off.
  */
 static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
 			       struct file *file, char __user *buf,
 			       size_t nbytes, loff_t *ppos)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	int type = MEMFILE_TYPE(cft->private);
 	int name = MEMFILE_ATTR(cft->private);
 	char str[64];
 	u64 val;
 	int len;

 	if (type == _MEMSWAP && !do_swap_account)
 		return -EOPNOTSUPP;

 	/* RES_USAGE goes through mem_cgroup_usage(), the rest is read raw. */
 	switch (type) {
 	case _MEM:
 		val = (name == RES_USAGE) ?
 			mem_cgroup_usage(memcg, false) :
 			res_counter_read_u64(&memcg->res, name);
 		break;
 	case _MEMSWAP:
 		val = (name == RES_USAGE) ?
 			mem_cgroup_usage(memcg, true) :
 			res_counter_read_u64(&memcg->memsw, name);
 		break;
 	default:
 		BUG();
 	}

 	len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
 	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
 }
 /*
  * String write handler for the limit files.
  *
  * Handles RES_LIMIT (memory or memsw, not allowed on the root group) and
  * RES_SOFT_LIMIT (memory only). @buffer is parsed with
  * res_counter_memparse_write_strategy().
  *
  * Returns 0 on success, -EOPNOTSUPP for a memsw file while swap accounting
  * is off, -EINVAL for unsupported combinations, or the error of the parser
  * or of the limit setter.
  */
 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 			    const char *buffer)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	int type = MEMFILE_TYPE(cft->private);
 	int name = MEMFILE_ATTR(cft->private);
 	unsigned long long val;
 	int ret;

 	if (type == _MEMSWAP && !do_swap_account)
 		return -EOPNOTSUPP;

 	if (name == RES_LIMIT) {
 		/* Can't set limit on root */
 		if (mem_cgroup_is_root(memcg))
 			return -EINVAL;

 		/* This function does all necessary parsing...reuse it */
 		ret = res_counter_memparse_write_strategy(buffer, &val);
 		if (ret)
 			return ret;

 		if (type == _MEM)
 			return mem_cgroup_resize_limit(memcg, val);
 		return mem_cgroup_resize_memsw_limit(memcg, val);
 	}

 	if (name == RES_SOFT_LIMIT) {
 		ret = res_counter_memparse_write_strategy(buffer, &val);
 		if (ret)
 			return ret;

 		/*
 		 * Soft limits are hard to define for memsw, so for now
 		 * they are only supported for the counter without swap.
 		 */
 		if (type != _MEM)
 			return -EINVAL;
 		return res_counter_set_soft_limit(&memcg->res, val);
 	}

 	return -EINVAL; /* should be BUG() ? */
 }
 /*
  * Compute the effective memory and memsw limits of @memcg.
  *
  * Starts from the group's own limits and, if it uses hierarchy, takes the
  * minimum with the limits of each ancestor, walking up until the root or
  * the first ancestor that does not use hierarchy. The results are stored
  * in @mem_limit and @memsw_limit.
  */
 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
 		unsigned long long *mem_limit, unsigned long long *memsw_limit)
 {
 	struct cgroup *pos;
 	unsigned long long res_min, memsw_min, limit;

 	res_min = res_counter_read_u64(&memcg->res, RES_LIMIT);
 	memsw_min = res_counter_read_u64(&memcg->memsw, RES_LIMIT);

 	if (memcg->use_hierarchy) {
 		for (pos = memcg->css.cgroup->parent; pos; pos = pos->parent) {
 			memcg = mem_cgroup_from_cont(pos);
 			if (!memcg->use_hierarchy)
 				break;

 			limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
 			res_min = min(res_min, limit);
 			limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
 			memsw_min = min(memsw_min, limit);
 		}
 	}

 	*mem_limit = res_min;
 	*memsw_limit = memsw_min;
 }
 /*
  * Trigger handler that resets the max usage or the fail count of the
  * counter encoded in @event. Other attributes are ignored.
  *
  * Returns 0, or -EOPNOTSUPP for a memsw file while swap accounting is off.
  */
 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	int type = MEMFILE_TYPE(event);
 	int name = MEMFILE_ATTR(event);

 	if (type == _MEMSWAP && !do_swap_account)
 		return -EOPNOTSUPP;

 	if (name == RES_MAX_USAGE) {
 		if (type == _MEM)
 			res_counter_reset_max(&memcg->res);
 		else
 			res_counter_reset_max(&memcg->memsw);
 	} else if (name == RES_FAILCNT) {
 		if (type == _MEM)
 			res_counter_reset_failcnt(&memcg->res);
 		else
 			res_counter_reset_failcnt(&memcg->memsw);
 	}

 	return 0;
 }
 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
 					struct cftype *cft)
 {
 	return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
 }
 #ifdef CONFIG_MMU
 /*
  * Write handler of "move_charge_at_immigrate".
  *
  * Accepts any value below (1 << NR_MOVE_TYPE) and stores it under the
  * cgroup lock. Returns 0 on success, -EINVAL for larger values.
  */
 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
 					struct cftype *cft, u64 val)
 {
 	struct mem_cgroup *target;

 	if (val >= (1 << NR_MOVE_TYPE))
 		return -EINVAL;

 	target = mem_cgroup_from_cont(cgrp);

 	/*
 	 * can_attach() and attach() both look at this value several times,
 	 * so the update is done under the cgroup lock to keep what they
 	 * see consistent.
 	 */
 	cgroup_lock();
 	target->move_charge_at_immigrate = val;
 	cgroup_unlock();

 	return 0;
 }
 #else
 /* Without CONFIG_MMU the setting cannot be written. */
 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
 					struct cftype *cft, u64 val)
 {
 	return -ENOSYS;
 }
 #endif
 #ifdef CONFIG_NUMA
 /*
  * seq_file handler of "numa_stat".
  *
  * Emits four lines - total, file, anon and unevictable - each holding the
  * overall LRU page count followed by one " N<nid>=<count>" field per node
  * in the N_HIGH_MEMORY state. Always returns 0.
  */
 static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
 				      struct seq_file *m)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	unsigned long nr;
 	int nid;

 	/* all LRU lists */
 	nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
 	seq_printf(m, "total=%lu", nr);
 	for_each_node_state(nid, N_HIGH_MEMORY) {
 		nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
 		seq_printf(m, " N%d=%lu", nid, nr);
 	}
 	seq_putc(m, '\n');

 	/* file LRU lists */
 	nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
 	seq_printf(m, "file=%lu", nr);
 	for_each_node_state(nid, N_HIGH_MEMORY) {
 		nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE);
 		seq_printf(m, " N%d=%lu", nid, nr);
 	}
 	seq_putc(m, '\n');

 	/* anon LRU lists */
 	nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
 	seq_printf(m, "anon=%lu", nr);
 	for_each_node_state(nid, N_HIGH_MEMORY) {
 		nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON);
 		seq_printf(m, " N%d=%lu", nid, nr);
 	}
 	seq_putc(m, '\n');

 	/* unevictable LRU list */
 	nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
 	seq_printf(m, "unevictable=%lu", nr);
 	for_each_node_state(nid, N_HIGH_MEMORY) {
 		nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
 						  BIT(LRU_UNEVICTABLE));
 		seq_printf(m, " N%d=%lu", nid, nr);
 	}
 	seq_putc(m, '\n');

 	return 0;
 }
 #endif /* CONFIG_NUMA */
 /*
  * Names printed by memcg_stat_show() for each LRU list, indexed by the LRU
  * list number. mem_cgroup_lru_names_not_uptodate() enforces at build time
  * that the table has exactly NR_LRU_LISTS entries.
  */
 static const char * const mem_cgroup_lru_names[] = {
 	"inactive_anon",
 	"active_anon",
 	"inactive_file",
 	"active_file",
 	"unevictable",
 };
 /*
  * Never meant to do anything at run time: it only hosts a build-time check
  * that mem_cgroup_lru_names[] has one entry per LRU list.
  */
 static inline void mem_cgroup_lru_names_not_uptodate(void)
 {
 	BUILD_BUG_ON(NR_LRU_LISTS != ARRAY_SIZE(mem_cgroup_lru_names));
 }
 /*
  * seq_file handler of the "stat" file.
  *
  * Output, in order:
  *  - this group's statistics (scaled by PAGE_SIZE), event counters and
  *    per-LRU sizes (scaled by PAGE_SIZE);
  *  - the hierarchical memory limit, plus the memsw limit when swap
  *    accounting is on;
  *  - the same three sets again with a "total_" prefix, summed over every
  *    group visited by for_each_mem_cgroup_tree();
  *  - with CONFIG_DEBUG_VM, the recent_rotated/recent_scanned reclaim
  *    statistics summed over all zones of all online nodes.
  *
  * The swap statistic is skipped while swap accounting is off.
  * Always returns 0.
  */
 static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
 				 struct seq_file *m)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	struct mem_cgroup *mi;
 	unsigned int i;
 	/* Local (this group only) values */
 	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
 		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
 			continue;
 		seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
 			   mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
 	}
 	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
 		seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
 			   mem_cgroup_read_events(memcg, i));
 	for (i = 0; i < NR_LRU_LISTS; i++)
 		seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
 			   mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
 	/* Hierarchical information */
 	{
 		unsigned long long limit, memsw_limit;
 		memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
 		seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
 		if (do_swap_account)
 			seq_printf(m, "hierarchical_memsw_limit %llu\n",
 				   memsw_limit);
 	}
 	/* Totals over the groups visited by for_each_mem_cgroup_tree() */
 	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
 		long long val = 0;
 		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
 			continue;
 		for_each_mem_cgroup_tree(mi, memcg)
 			val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
 		seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
 	}
 	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
 		unsigned long long val = 0;
 		for_each_mem_cgroup_tree(mi, memcg)
 			val += mem_cgroup_read_events(mi, i);
 		seq_printf(m, "total_%s %llu\n",
 			   mem_cgroup_events_names[i], val);
 	}
 	for (i = 0; i < NR_LRU_LISTS; i++) {
 		unsigned long long val = 0;
 		for_each_mem_cgroup_tree(mi, memcg)
 			val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
 		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
 	}
 #ifdef CONFIG_DEBUG_VM
 	/* Debug only: reclaim statistics; index 0 is anon, index 1 is file */
 	{
 		int nid, zid;
 		struct mem_cgroup_per_zone *mz;
 		struct zone_reclaim_stat *rstat;
 		unsigned long recent_rotated[2] = {0, 0};
 		unsigned long recent_scanned[2] = {0, 0};
 		for_each_online_node(nid)
 			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 				mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 				rstat = &mz->lruvec.reclaim_stat;
 				recent_rotated[0] += rstat->recent_rotated[0];
 				recent_rotated[1] += rstat->recent_rotated[1];
 				recent_scanned[0] += rstat->recent_scanned[0];
 				recent_scanned[1] += rstat->recent_scanned[1];
 			}
 		seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
 		seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
 		seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
 		seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
 	}
 #endif
 	return 0;
 }
 /* Read handler of "swappiness": report mem_cgroup_swappiness() of the group. */
 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
 {
 	return mem_cgroup_swappiness(mem_cgroup_from_cont(cgrp));
 }
 /*
  * Write handler of "swappiness".
  *
  * Accepts values 0..100 on non-root cgroups. Under hierarchy only an
  * empty-root may change the value: the parent must not use hierarchy, and
  * a group that uses hierarchy itself must have no children.
  *
  * Returns 0 on success, -EINVAL otherwise.
  */
 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
 				       u64 val)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 	struct mem_cgroup *parent;
 	int ret = -EINVAL;

 	if (val > 100 || cgrp->parent == NULL)
 		return -EINVAL;

 	parent = mem_cgroup_from_cont(cgrp->parent);

 	cgroup_lock();
 	if (!parent->use_hierarchy &&
 	    (!memcg->use_hierarchy || list_empty(&cgrp->children))) {
 		memcg->swappiness = val;
 		ret = 0;
 	}
 	cgroup_unlock();

 	return ret;
 }
 /*
  * Signal the eventfd of every threshold of @memcg that the usage has
  * crossed since the last call, and update current_threshold.
  *
  * @swap selects the memsw thresholds and usage instead of the memory ones.
  * The threshold array is read under rcu_read_lock().
  */
 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
 {
 	struct mem_cgroup_threshold_ary *t;
 	u64 usage;
 	int i;

 	rcu_read_lock();
 	if (swap)
 		t = rcu_dereference(memcg->memsw_thresholds.primary);
 	else
 		t = rcu_dereference(memcg->thresholds.primary);

 	if (!t)
 		goto unlock;

 	usage = mem_cgroup_usage(memcg, swap);

 	/*
 	 * current_threshold points to the threshold just below or equal to
 	 * usage. If that no longer holds, a threshold was crossed after the
 	 * last call of this function.
 	 */
 	i = t->current_threshold;

 	/*
 	 * Walk down from current_threshold and signal every threshold that
 	 * is now above usage. If nothing was crossed downwards, only one
 	 * element is read.
 	 */
 	while (i >= 0 && unlikely(t->entries[i].threshold > usage)) {
 		eventfd_signal(t->entries[i].eventfd, 1);
 		i--;
 	}

 	/* i = current_threshold + 1 */
 	i++;

 	/*
 	 * Walk up from current_threshold + 1 and signal every threshold that
 	 * is now at or below usage. If nothing was crossed upwards, only one
 	 * element is read.
 	 */
 	while (i < t->size && unlikely(t->entries[i].threshold <= usage)) {
 		eventfd_signal(t->entries[i].eventfd, 1);
 		i++;
 	}

 	/* Update current_threshold */
 	t->current_threshold = i - 1;
 unlock:
 	rcu_read_unlock();
 }
 /*
  * Check the memory thresholds (and, with swap accounting, the memsw
  * thresholds) of @memcg and of each ancestor returned by
  * parent_mem_cgroup().
  */
 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
 {
 	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
 		__mem_cgroup_threshold(memcg, false);
 		if (do_swap_account)
 			__mem_cgroup_threshold(memcg, true);
 	}
 }
 /*
  * sort() comparator ordering struct mem_cgroup_threshold entries by
  * ascending ->threshold.
  *
  * Returns a negative value, zero or a positive value if @a is below, equal
  * to or above @b. The values are compared explicitly rather than
  * subtracted: the thresholds are wider than int, so a truncated difference
  * could come out as 0 or with the wrong sign and leave the array unsorted.
  */
 static int compare_thresholds(const void *a, const void *b)
 {
 	const struct mem_cgroup_threshold *_a = a;
 	const struct mem_cgroup_threshold *_b = b;

 	if (_a->threshold > _b->threshold)
 		return 1;
 	if (_a->threshold < _b->threshold)
 		return -1;
 	return 0;
 }
 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup_eventfd_list *ev;
 	list_for_each_entry(ev, &memcg->oom_notify, list)
 		eventfd_signal(ev->eventfd, 1);
 	return 0;
 }
 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;
 	for_each_mem_cgroup_tree(iter, memcg)
 		mem_cgroup_oom_notify_cb(iter);
 }
 /*
  * Register a usage threshold notification.
  *
  * @args is parsed as the threshold value. A new threshold array one entry
  * larger than the current one is allocated, filled with the old entries
  * plus the new one, sorted, and published as the primary array; the old
  * primary array is kept as the spare.
  *
  * cft->private selects the memory (_MEM) or memsw (_MEMSWAP) threshold set.
  *
  * Returns 0 on success, the parser's error for a bad @args, or -ENOMEM if
  * the new array cannot be allocated.
  */
 static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
 	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 	struct mem_cgroup_thresholds *thresholds;
 	struct mem_cgroup_threshold_ary *new;
 	int type = MEMFILE_TYPE(cft->private);
 	u64 threshold, usage;
 	int i, size, ret;
 	ret = res_counter_memparse_write_strategy(args, &threshold);
 	if (ret)
 		return ret;
 	mutex_lock(&memcg->thresholds_lock);
 	if (type == _MEM)
 		thresholds = &memcg->thresholds;
 	else if (type == _MEMSWAP)
 		thresholds = &memcg->memsw_thresholds;
 	else
 		BUG();
 	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
 	/* Check if a threshold crossed before adding a new one */
 	if (thresholds->primary)
 		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
 	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
 	/* Allocate memory for new array of thresholds */
 	new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
 			GFP_KERNEL);
 	if (!new) {
 		ret = -ENOMEM;
 		goto unlock;
 	}
 	new->size = size;
 	/* Copy thresholds (if any) to new array */
 	if (thresholds->primary) {
 		memcpy(new->entries, thresholds->primary->entries, (size - 1) *
 				sizeof(struct mem_cgroup_threshold));
 	}
 	/* Add new threshold */
 	new->entries[size - 1].eventfd = eventfd;
 	new->entries[size - 1].threshold = threshold;
 	/* Sort thresholds. Registering of new threshold isn't time-critical */
 	sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
 			compare_thresholds, NULL);
 	/* Find current threshold */
 	new->current_threshold = -1;
 	for (i = 0; i < size; i++) {
 		if (new->entries[i].threshold <= usage) {
 			/*
 			 * new->current_threshold will not be used until
 			 * rcu_assign_pointer(), so it's safe to increment
 			 * it here.
 			 */
 			++new->current_threshold;
 		} else
 			break;
 	}
 	/* Free old spare buffer and save old primary buffer as spare */
 	kfree(thresholds->spare);
 	thresholds->spare = thresholds->primary;
 	/* Publish the new array to readers in __mem_cgroup_threshold() */
 	rcu_assign_pointer(thresholds->primary, new);
 	/* To be sure that nobody uses thresholds */
 	synchronize_rcu();
 unlock:
 	mutex_unlock(&memcg->thresholds_lock);
 	return ret;
 }
 /*
  * Unregister every usage threshold that is bound to @eventfd.
  *
  * cft->private selects the memory (_MEM) or memsw (_MEMSWAP) threshold set.
  * The surviving entries are copied into the spare array, which then becomes
  * the primary array while the old primary becomes the spare. If no entry
  * survives, both arrays are freed and the primary pointer is set to NULL.
  *
  * One call removes all entries of @eventfd, so this function may be called
  * again for an eventfd that has no entries left; that is handled as a
  * no-op.
  */
 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
 	struct cftype *cft, struct eventfd_ctx *eventfd)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 	struct mem_cgroup_thresholds *thresholds;
 	struct mem_cgroup_threshold_ary *new;
 	int type = MEMFILE_TYPE(cft->private);
 	u64 usage;
 	int i, j, size, entries;

 	mutex_lock(&memcg->thresholds_lock);
 	if (type == _MEM)
 		thresholds = &memcg->thresholds;
 	else if (type == _MEMSWAP)
 		thresholds = &memcg->memsw_thresholds;
 	else
 		BUG();

 	if (!thresholds->primary)
 		goto unlock;

 	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);

 	/* Check if a threshold crossed before removing */
 	__mem_cgroup_threshold(memcg, type == _MEMSWAP);

 	/* Count the entries that stay (size) and the ones to drop (entries) */
 	size = 0;
 	entries = 0;
 	for (i = 0; i < thresholds->primary->size; i++) {
 		if (thresholds->primary->entries[i].eventfd != eventfd)
 			size++;
 		else
 			entries++;
 	}

 	/*
 	 * Nothing is registered for this eventfd, so there is nothing to
 	 * remove. Bail out before touching the spare array: it may be NULL
 	 * here and must not be dereferenced.
 	 */
 	if (!entries)
 		goto unlock;

 	new = thresholds->spare;

 	/* Set thresholds array to NULL if we don't have thresholds */
 	if (!size) {
 		kfree(new);
 		new = NULL;
 		goto swap_buffers;
 	}

 	new->size = size;

 	/* Copy thresholds and find current threshold */
 	new->current_threshold = -1;
 	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
 		if (thresholds->primary->entries[i].eventfd == eventfd)
 			continue;

 		new->entries[j] = thresholds->primary->entries[i];
 		if (new->entries[j].threshold <= usage) {
 			/*
 			 * new->current_threshold will not be used
 			 * until rcu_assign_pointer(), so it's safe to increment
 			 * it here.
 			 */
 			++new->current_threshold;
 		}
 		j++;
 	}

 swap_buffers:
 	/* Swap primary and spare array */
 	thresholds->spare = thresholds->primary;

 	rcu_assign_pointer(thresholds->primary, new);

 	/* To be sure that nobody uses thresholds */
 	synchronize_rcu();

 	/*
 	 * If all events are unregistered, free the spare array. This must
 	 * happen only after synchronize_rcu(): the spare is the old primary
 	 * array, which RCU readers may still have been walking.
 	 */
 	if (!new) {
 		kfree(thresholds->spare);
 		thresholds->spare = NULL;
 	}
 unlock:
 	mutex_unlock(&memcg->thresholds_lock);
 }
 static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
 	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 	struct mem_cgroup_eventfd_list *event;
 	int type = MEMFILE_TYPE(cft->private);
 	BUG_ON(type != _OOM_TYPE);
 	event = kmalloc(sizeof(*event),	GFP_KERNEL);
 	if (!event)
 		return -ENOMEM;
 	spin_lock(&memcg_oom_lock);
 	event->eventfd = eventfd;
 	list_add(&event->list, &memcg->oom_notify);
 	/* already in OOM ? */
 	if (atomic_read(&memcg->under_oom))
 		eventfd_signal(eventfd, 1);
 	spin_unlock(&memcg_oom_lock);
 	return 0;
 }
 /*
  * Remove and free every entry of the group's oom_notify list that refers
  * to @eventfd.
  */
 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
 	struct cftype *cft, struct eventfd_ctx *eventfd)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 	struct mem_cgroup_eventfd_list *pos, *next;
 	int type = MEMFILE_TYPE(cft->private);

 	BUG_ON(type != _OOM_TYPE);

 	spin_lock(&memcg_oom_lock);
 	list_for_each_entry_safe(pos, next, &memcg->oom_notify, list) {
 		if (pos->eventfd != eventfd)
 			continue;

 		list_del(&pos->list);
 		kfree(pos);
 	}
 	spin_unlock(&memcg_oom_lock);
 }
 /*
  * Map read handler of "oom_control": reports "oom_kill_disable" and
  * "under_oom" (1 if the under_oom counter is non-zero, else 0).
  * Always returns 0.
  */
 static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
 	struct cftype *cft,  struct cgroup_map_cb *cb)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

 	cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
 	cb->fill(cb, "under_oom", atomic_read(&memcg->under_oom) ? 1 : 0);

 	return 0;
 }
 /*
  * Write handler of "oom_control": set or clear oom_kill_disable.
  *
  * Only 0 and 1 are accepted, and never on the root cgroup. Since
  * oom-kill-disable is a flag for a subhierarchy, the write is refused if
  * the parent uses hierarchy, or if this group uses hierarchy and already
  * has children. Clearing the flag also calls memcg_oom_recover().
  *
  * Returns 0 on success, -EINVAL otherwise.
  */
 static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 	struct cftype *cft, u64 val)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 	struct mem_cgroup *parent;
 	int ret = -EINVAL;

 	if (!cgrp->parent || val > 1)
 		return -EINVAL;

 	parent = mem_cgroup_from_cont(cgrp->parent);

 	cgroup_lock();
 	if (!parent->use_hierarchy &&
 	    (!memcg->use_hierarchy || list_empty(&cgrp->children))) {
 		memcg->oom_kill_disable = val;
 		if (!val)
 			memcg_oom_recover(memcg);
 		ret = 0;
 	}
 	cgroup_unlock();

 	return ret;
 }
 #ifdef CONFIG_MEMCG_KMEM
 /*
  * Set up kernel memory accounting for @memcg. With CONFIG_MEMCG_KMEM this
  * forwards to mem_cgroup_sockets_init() and returns its result.
  */
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 	return mem_cgroup_sockets_init(memcg, ss);
 }
 /* Counterpart of memcg_init_kmem(): forwards to mem_cgroup_sockets_destroy(). */
 static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
 {
 	mem_cgroup_sockets_destroy(memcg);
 }
 #else
 /* Without CONFIG_MEMCG_KMEM there is nothing to set up; always returns 0. */
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 	return 0;
 }
 /* Without CONFIG_MEMCG_KMEM there is nothing to tear down. */
 static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
 {
 }
 #endif
 /*
  * Control files of the memory controller.
  *
  * Each entry binds a file name to its handlers. Where present, .private is
  * built with MEMFILE_PRIVATE(type, attribute); the handlers decode it with
  * MEMFILE_TYPE() and MEMFILE_ATTR(). "numa_stat" exists only with
  * CONFIG_NUMA and the "memsw.*" files only with CONFIG_MEMCG_SWAP.
  * The array ends with an empty entry.
  */
 static struct cftype mem_cgroup_files[] = {
 	/* memory counter files */
 	{
 		.name = "usage_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
 		.read = mem_cgroup_read,
 		.register_event = mem_cgroup_usage_register_event,
 		.unregister_event = mem_cgroup_usage_unregister_event,
 	},
 	{
 		.name = "max_usage_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
 		.trigger = mem_cgroup_reset,
 		.read = mem_cgroup_read,
 	},
 	{
 		.name = "limit_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
 		.write_string = mem_cgroup_write,
 		.read = mem_cgroup_read,
 	},
 	{
 		.name = "soft_limit_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
 		.write_string = mem_cgroup_write,
 		.read = mem_cgroup_read,
 	},
 	{
 		.name = "failcnt",
 		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
 		.trigger = mem_cgroup_reset,
 		.read = mem_cgroup_read,
 	},
 	/* statistics and per-group settings */
 	{
 		.name = "stat",
 		.read_seq_string = memcg_stat_show,
 	},
 	{
 		.name = "force_empty",
 		.trigger = mem_cgroup_force_empty_write,
 	},
 	{
 		.name = "use_hierarchy",
 		.write_u64 = mem_cgroup_hierarchy_write,
 		.read_u64 = mem_cgroup_hierarchy_read,
 	},
 	{
 		.name = "swappiness",
 		.read_u64 = mem_cgroup_swappiness_read,
 		.write_u64 = mem_cgroup_swappiness_write,
 	},
 	{
 		.name = "move_charge_at_immigrate",
 		.read_u64 = mem_cgroup_move_charge_read,
 		.write_u64 = mem_cgroup_move_charge_write,
 	},
 	/* OOM control and notification */
 	{
 		.name = "oom_control",
 		.read_map = mem_cgroup_oom_control_read,
 		.write_u64 = mem_cgroup_oom_control_write,
 		.register_event = mem_cgroup_oom_register_event,
 		.unregister_event = mem_cgroup_oom_unregister_event,
 		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
 	},
 #ifdef CONFIG_NUMA
 	{
 		.name = "numa_stat",
 		.read_seq_string = memcg_numa_stat_show,
 	},
 #endif
 #ifdef CONFIG_MEMCG_SWAP
 	/* memory+swap counter files; there is no memsw soft limit file */
 	{
 		.name = "memsw.usage_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
 		.read = mem_cgroup_read,
 		.register_event = mem_cgroup_usage_register_event,
 		.unregister_event = mem_cgroup_usage_unregister_event,
 	},
 	{
 		.name = "memsw.max_usage_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
 		.trigger = mem_cgroup_reset,
 		.read = mem_cgroup_read,
 	},
 	{
 		.name = "memsw.limit_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
 		.write_string = mem_cgroup_write,
 		.read = mem_cgroup_read,
 	},
 	{
 		.name = "memsw.failcnt",
 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
 		.trigger = mem_cgroup_reset,
 		.read = mem_cgroup_read,
 	},
 #endif
 	{ },	/* terminate */
 };
 /*
  * Allocate and initialise the per-node (and contained per-zone) bookkeeping
  * of @memcg for @node, and publish it in memcg->info.nodeinfo[@node].
  *
  * Returns 0 on success, 1 if the allocation failed.
  */
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 {
 	struct mem_cgroup_per_node *per_node;
 	int alloc_nid;
 	int zid;

 	/*
 	 * This routine is called against possible nodes.
 	 * But it's BUG to call kmalloc() against offline node, so nodes
 	 * without normal memory are allocated with node id -1 instead.
 	 *
 	 * TODO: this routine can waste much memory for nodes which will
 	 *       never be onlined. It's better to use memory hotplug callback
 	 *       function.
 	 */
 	alloc_nid = node_state(node, N_NORMAL_MEMORY) ? node : -1;

 	per_node = kzalloc_node(sizeof(*per_node), GFP_KERNEL, alloc_nid);
 	if (!per_node)
 		return 1;

 	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 		struct mem_cgroup_per_zone *per_zone = &per_node->zoneinfo[zid];

 		lruvec_init(&per_zone->lruvec,
 			    &NODE_DATA(node)->node_zones[zid]);
 		per_zone->usage_in_excess = 0;
 		per_zone->on_tree = false;
 		per_zone->memcg = memcg;
 	}

 	memcg->info.nodeinfo[node] = per_node;
 	return 0;
 }
 /*
  * Release the per-node info of @memcg for @node.  The slot itself is not
  * reset afterwards.
  */
 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 {
 	kfree(memcg->info.nodeinfo[node]);
 }
 /*
  * Allocate a zeroed struct mem_cgroup together with its per-cpu statistics.
  *
  * The structure comes from kzalloc() when it is smaller than a page and
  * from vzalloc() otherwise; the release paths must apply the same size test
  * to pick kfree() or vfree().
  *
  * Returns the new memcg, or NULL if either allocation failed.
  */
 static struct mem_cgroup *mem_cgroup_alloc(void)
 {
 	struct mem_cgroup *memcg;
 	/* size_t: keeps the sizeof result and the PAGE_SIZE compare unsigned */
 	size_t size = sizeof(struct mem_cgroup);

 	/* Can be very big if MAX_NUMNODES is very big */
 	if (size < PAGE_SIZE)
 		memcg = kzalloc(size, GFP_KERNEL);
 	else
 		memcg = vzalloc(size);
 	if (!memcg)
 		return NULL;

 	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
 	if (!memcg->stat)
 		goto out_free;
 	spin_lock_init(&memcg->pcp_counter_lock);
 	return memcg;

 out_free:
 	/* Undo the allocation with the allocator that produced it. */
 	if (size < PAGE_SIZE)
 		kfree(memcg);
 	else
 		vfree(memcg);
 	return NULL;
 }
 /*
  * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
  * but in process context.  The work_freeing structure is overlaid
  * on the rcu_freeing structure, which itself is overlaid on memsw.
  */

 /*
  * Work item run from free_rcu(): disarm the socket static keys and release
  * the memcg with the allocator matching its size (see mem_cgroup_alloc()).
  */
 static void free_work(struct work_struct *work)
 {
 	struct mem_cgroup *memcg;
 	/* size_t: keeps the sizeof result and the PAGE_SIZE compare unsigned */
 	size_t size = sizeof(struct mem_cgroup);

 	memcg = container_of(work, struct mem_cgroup, work_freeing);
 	/*
 	 * We need to make sure that (at least for now), the jump label
 	 * destruction code runs outside of the cgroup lock. This is because
 	 * get_online_cpus(), which is called from the static_branch update,
 	 * can't be called inside the cgroup_lock. cpusets are the ones
 	 * enforcing this dependency, so if they ever change, we might as well.
 	 *
 	 * schedule_work() will guarantee this happens. Be careful if you need
 	 * to move this code around, and make sure it is outside
 	 * the cgroup_lock.
 	 */
 	disarm_sock_keys(memcg);
 	if (size < PAGE_SIZE)
 		kfree(memcg);
 	else
 		vfree(memcg);
 }
 /*
  * RCU callback queued by __mem_cgroup_free().  It does not free the memcg
  * itself; it hands the final release to free_work() through the work queue
  * so that it runs in process context.
  */
 static void free_rcu(struct rcu_head *rcu_head)
 {
 	struct mem_cgroup *memcg;
 	memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
 	INIT_WORK(&memcg->work_freeing, free_work);
 	schedule_work(&memcg->work_freeing);
 }
 /*
  * At destroying mem_cgroup, references from swap_cgroup can remain.
  * (scanning all at force_empty is too costly...)
  *
  * Instead of clearing all references at force_empty, we remember
  * the number of reference from swap_cgroup and free mem_cgroup when
  * it goes down to 0.
  *
  * Removal of cgroup itself succeeds regardless of refs from swap.
  */

 /*
  * Release everything hanging off @memcg and queue the structure itself for
  * freeing.  Called when the last reference is dropped (__mem_cgroup_put())
  * and from the error path of mem_cgroup_create().
  */
 static void __mem_cgroup_free(struct mem_cgroup *memcg)
 {
 	int node;
 	mem_cgroup_remove_from_trees(memcg);
 	free_css_id(&mem_cgroup_subsys, &memcg->css);
 	for_each_node(node)
 		free_mem_cgroup_per_zone_info(memcg, node);
 	free_percpu(memcg->stat);
 	/* The struct itself is released later: free_rcu() -> free_work(). */
 	call_rcu(&memcg->rcu_freeing, free_rcu);
 }
 /* Take one reference on @memcg (memcg->refcnt). */
 static void mem_cgroup_get(struct mem_cgroup *memcg)
 {
 	atomic_inc(&memcg->refcnt);
 }
 /*
  * Drop @count references on @memcg.  When the count reaches zero the memcg
  * is freed and the reference it held on its hierarchical parent (if any)
  * is dropped as well.
  */
 static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
 {
 	struct mem_cgroup *parent;

 	if (!atomic_sub_and_test(count, &memcg->refcnt))
 		return;

 	/* Look the parent up first: @memcg is gone after the free below. */
 	parent = parent_mem_cgroup(memcg);
 	__mem_cgroup_free(memcg);
 	if (parent)
 		mem_cgroup_put(parent);
 }
 /* Drop a single reference on @memcg; see __mem_cgroup_put(). */
 static void mem_cgroup_put(struct mem_cgroup *memcg)
 {
 	__mem_cgroup_put(memcg, 1);
 }
 /*
  * Return the hierarchical parent of @memcg, derived from the parent link of
  * its "res" counter, or NULL when that counter has no parent (which is how
  * mem_cgroup_create() sets it up when hierarchy is not in use).
  */
 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 {
 	if (memcg->res.parent)
 		return mem_cgroup_from_res_counter(memcg->res.parent, res);

 	return NULL;
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 #ifdef CONFIG_MEMCG_SWAP
 /*
  * Turn on swap accounting (do_swap_account) if the memory controller is not
  * disabled and really_do_swap_account is set.  Called while the root memcg
  * is being created.
  */
 static void __init enable_swap_cgroup(void)
 {
 	if (!mem_cgroup_disabled() && really_do_swap_account)
 		do_swap_account = 1;
 }
 #else
 /* !CONFIG_MEMCG_SWAP: swap accounting is not built in, nothing to enable. */
 static void __init enable_swap_cgroup(void)
 {
 }
 #endif
 /*
  * Build the global soft-limit tree: one per-node structure for every node,
  * each holding an empty rb-tree and a lock per zone.
  *
  * Returns 0 on success.  On allocation failure the per-node structures
  * installed so far are freed again and 1 is returned.
  */
 static int mem_cgroup_soft_limit_tree_init(void)
 {
 	struct mem_cgroup_tree_per_node *per_node;
 	struct mem_cgroup_tree_per_zone *rtpz;
 	int nid, zid;

 	for_each_node(nid) {
 		/* Nodes without normal memory are allocated with node id -1. */
 		int alloc_nid = node_state(nid, N_NORMAL_MEMORY) ? nid : -1;

 		per_node = kzalloc_node(sizeof(*per_node), GFP_KERNEL,
 					alloc_nid);
 		if (!per_node)
 			goto err_cleanup;

 		soft_limit_tree.rb_tree_per_node[nid] = per_node;

 		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 			rtpz = &per_node->rb_tree_per_zone[zid];
 			rtpz->rb_root = RB_ROOT;
 			spin_lock_init(&rtpz->lock);
 		}
 	}
 	return 0;

 err_cleanup:
 	/* Slots are filled in node order, so stop at the first empty one. */
 	for_each_node(nid) {
 		if (!soft_limit_tree.rb_tree_per_node[nid])
 			break;
 		kfree(soft_limit_tree.rb_tree_per_node[nid]);
 		soft_limit_tree.rb_tree_per_node[nid] = NULL;
 	}
 	return 1;
 }
 /*
  * cgroup ->create callback: allocate and initialise the mem_cgroup backing
  * the new cgroup @cont.
  *
  * For the root cgroup (no parent) this also performs the one-time global
  * setup: enabling swap accounting, building the soft-limit tree, recording
  * root_mem_cgroup, initialising the per-cpu stock work items and
  * registering the CPU hotplug callback.  A child inherits use_hierarchy,
  * oom_kill_disable and swappiness from its parent.
  *
  * Returns the embedded css on success, or an ERR_PTR() on failure.
  *
  * NOTE(review): the free_out path runs __mem_cgroup_free() on a memcg whose
  * per-node info (and, for root, the soft-limit tree) may be only partially
  * set up -- confirm that mem_cgroup_remove_from_trees() tolerates that.
  */
 static struct cgroup_subsys_state * __ref
 mem_cgroup_create(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg, *parent;
 	long error = -ENOMEM;
 	int node;
 	memcg = mem_cgroup_alloc();
 	if (!memcg)
 		return ERR_PTR(error);
 	for_each_node(node)
 		if (alloc_mem_cgroup_per_zone_info(memcg, node))
 			goto free_out;
 	/* root ? */
 	if (cont->parent == NULL) {
 		int cpu;
 		enable_swap_cgroup();
 		parent = NULL;
 		if (mem_cgroup_soft_limit_tree_init())
 			goto free_out;
 		root_mem_cgroup = memcg;
 		for_each_possible_cpu(cpu) {
 			struct memcg_stock_pcp *stock =
 						&per_cpu(memcg_stock, cpu);
 			INIT_WORK(&stock->work, drain_local_stock);
 		}
 		hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
 		memcg->use_hierarchy = parent->use_hierarchy;
 		memcg->oom_kill_disable = parent->oom_kill_disable;
 	}
 	/* Chain the resource counters to the parent only under hierarchy. */
 	if (parent && parent->use_hierarchy) {
 		res_counter_init(&memcg->res, &parent->res);
 		res_counter_init(&memcg->memsw, &parent->memsw);
 		/*
 		 * We increment refcnt of the parent to ensure that we can
 		 * safely access it on res_counter_charge/uncharge.
 		 * This refcnt will be decremented when freeing this
 		 * mem_cgroup(see mem_cgroup_put).
 		 */
 		mem_cgroup_get(parent);
 	} else {
 		res_counter_init(&memcg->res, NULL);
 		res_counter_init(&memcg->memsw, NULL);
 	}
 	memcg->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&memcg->oom_notify);
 	if (parent)
 		memcg->swappiness = mem_cgroup_swappiness(parent);
 	/* The initial reference is dropped in mem_cgroup_destroy(). */
 	atomic_set(&memcg->refcnt, 1);
 	memcg->move_charge_at_immigrate = 0;
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
 	error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
 	if (error) {
 		/*
 		 * We call put now because our (and parent's) refcnts
 		 * are already in place. mem_cgroup_put() will internally
 		 * call __mem_cgroup_free, so return directly
 		 */
 		mem_cgroup_put(memcg);
 		return ERR_PTR(error);
 	}
 	return &memcg->css;
 free_out:
 	/* Reached only before refcnt is set up, hence the direct free. */
 	__mem_cgroup_free(memcg);
 	return ERR_PTR(error);
 }
 /*
  * cgroup ->pre_destroy callback: try to empty the memcg of @cont and return
  * the result of mem_cgroup_force_empty().
  */
 static int mem_cgroup_pre_destroy(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	return mem_cgroup_force_empty(memcg, false);
 }
 /*
  * cgroup ->destroy callback: tear down the kmem side and drop the reference
  * taken in mem_cgroup_create().  The memcg is freed only once all other
  * references are gone as well.
  */
 static void mem_cgroup_destroy(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	kmem_cgroup_destroy(memcg);
 	mem_cgroup_put(memcg);
 }
 #ifdef CONFIG_MMU
 /* Handlers for move charge at task migration. */
 /* Number of single-page charges between cond_resched() calls. */
 #define PRECHARGE_COUNT_AT_ONCE	256
 /*
  * Charge @count pages to mc.to ahead of the actual move and account them
  * in mc.precharge.
  *
  * For the root memcg only the counter is bumped.  Otherwise, for more than
  * one page, a single bulk charge of the res (and, with swap accounting, the
  * memsw) counter is tried first; if that fails, or for a single page, the
  * pages are charged one at a time.
  *
  * Returns 0 on success, -EINTR if a signal is pending, or the error from
  * __mem_cgroup_try_charge().  Charges already made stay in mc.precharge on
  * failure.
  */
 static int mem_cgroup_do_precharge(unsigned long count)
 {
 	int ret = 0;
 	int batch_count = PRECHARGE_COUNT_AT_ONCE;
 	struct mem_cgroup *memcg = mc.to;
 	if (mem_cgroup_is_root(memcg)) {
 		mc.precharge += count;
 		/* we don't need css_get for root */
 		return ret;
 	}
 	/* try to charge at once */
 	if (count > 1) {
 		struct res_counter *dummy;
 		/*
 		 * "memcg" cannot be under rmdir() because we've already checked
 		 * by cgroup_lock_live_cgroup() that it is not removed and we
 		 * are still under the same cgroup_mutex. So we can postpone
 		 * css_get().
 		 */
 		if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
 			goto one_by_one;
 		if (do_swap_account && res_counter_charge(&memcg->memsw,
 						PAGE_SIZE * count, &dummy)) {
 			/* roll back the res charge before falling back */
 			res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
 			goto one_by_one;
 		}
 		mc.precharge += count;
 		return ret;
 	}
 one_by_one:
 	/* fall back to one by one charge */
 	while (count--) {
 		if (signal_pending(current)) {
 			ret = -EINTR;
 			break;
 		}
 		if (!batch_count--) {
 			batch_count = PRECHARGE_COUNT_AT_ONCE;
 			cond_resched();
 		}
 		ret = __mem_cgroup_try_charge(NULL,
 					GFP_KERNEL, 1, &memcg, false);
 		if (ret)
 			/* mem_cgroup_clear_mc() will do uncharge later */
 			return ret;
 		mc.precharge++;
 	}
 	return ret;
 }
 /**
  * get_mctgt_type - get target type of moving charge
  * @vma: the vma to which the pte to be checked belongs
  * @addr: the address corresponding to the pte to be checked
  * @ptent: the pte to be checked
  * @target: where the target page or swap entry will be stored (can be NULL)
  *
  * Returns
  *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
  *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
  *     move charge. if @target is not NULL, the page is stored in target->page
  *     with extra refcnt got(Callers should handle it).
  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  *     target for charge migration. if @target is not NULL, the entry is stored
  *     in target->ent.
  *
  * Called with pte lock held.
  */
 /*
  * Result slot filled in by get_mctgt_type(): the page for MC_TARGET_PAGE,
  * the swap entry for MC_TARGET_SWAP.
  */
 union mc_target {
 	struct page	*page;
 	swp_entry_t	ent;
 };
 /* What kind of move-charge target, if any, a pte/pmd refers to. */
 enum mc_target_type {
 	MC_TARGET_NONE = 0,	/* not a target; callers rely on this being 0 */
 	MC_TARGET_PAGE,		/* a page charged to mc.from */
 	MC_TARGET_SWAP,		/* a swap entry recorded against mc.from */
 };
 /*
  * Resolve a present pte to the page it maps, if that page is a candidate
  * for charge moving: it must be a normal, mapped page whose kind (anon or
  * file) is enabled for moving.
  *
  * Returns the page with an extra reference taken, or NULL.
  */
 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
 						unsigned long addr, pte_t ptent)
 {
 	struct page *pg = vm_normal_page(vma, addr, ptent);
 	bool wanted;

 	if (!pg || !page_mapped(pg))
 		return NULL;

 	/*
 	 * Anon pages: we don't move shared anon.
 	 * File pages: we ignore mapcount.
 	 */
 	wanted = PageAnon(pg) ? move_anon() : move_file();
 	if (!wanted)
 		return NULL;

 	return get_page_unless_zero(pg) ? pg : NULL;
 }
 #ifdef CONFIG_SWAP
 /*
  * Resolve a swap pte for charge moving (CONFIG_SWAP build).
  *
  * Only real swap entries are considered, and only when moving anon charges
  * is enabled.  Returns the swap-cache page for the entry with a reference
  * held (find_get_page() result, may be NULL); with swap accounting enabled
  * the entry value is also stored in @entry.
  */
 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
 			unsigned long addr, pte_t ptent, swp_entry_t *entry)
 {
 	swp_entry_t swp = pte_to_swp_entry(ptent);
 	struct page *cached;

 	if (!(move_anon() && !non_swap_entry(swp)))
 		return NULL;

 	/*
 	 * Because lookup_swap_cache() updates some statistics counter,
 	 * we call find_get_page() with swapper_space directly.
 	 */
 	cached = find_get_page(&swapper_space, swp.val);
 	if (do_swap_account)
 		entry->val = swp.val;

 	return cached;
 }
 #else
 /* !CONFIG_SWAP: swap ptes are never a move-charge target. */
 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
 			unsigned long addr, pte_t ptent, swp_entry_t *entry)
 {
 	return NULL;
 }
 #endif
 /*
  * Resolve an empty or file pte of a file-backed vma to the page-cache page
  * at the corresponding file offset, when moving file charges is enabled.
  *
  * Returns the page found by find_get_page() (may be NULL).  With
  * CONFIG_SWAP, an exceptional radix-tree entry is converted to its swap
  * entry, which is stored in @entry when swap accounting is on, and the
  * swap-cache page is returned instead.
  */
 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 			unsigned long addr, pte_t ptent, swp_entry_t *entry)
 {
 	struct page *page = NULL;
 	struct address_space *mapping;
 	pgoff_t pgoff;
 	if (!vma->vm_file) /* anonymous vma */
 		return NULL;
 	if (!move_file())
 		return NULL;
 	mapping = vma->vm_file->f_mapping;
 	if (pte_none(ptent))
 		pgoff = linear_page_index(vma, addr);
 	else /* pte_file(ptent) is true */
 		pgoff = pte_to_pgoff(ptent);
 	/* page is moved even if it's not RSS of this task(page-faulted). */
 	page = find_get_page(mapping, pgoff);
 #ifdef CONFIG_SWAP
 	/* shmem/tmpfs may report page out on swap: account for that too. */
 	if (radix_tree_exceptional_entry(page)) {
 		swp_entry_t swap = radix_to_swp_entry(page);
 		if (do_swap_account)
 			*entry = swap;
 		page = find_get_page(&swapper_space, swap.val);
 	}
 #endif
 	return page;
 }
 /*
  * Classify @ptent as a move-charge target.  The pte is dispatched to the
  * present/swap/file helper, then the resulting page and/or swap entry is
  * checked against mc.from.
  *
  * The helpers return the page with a reference held.  That reference is
  * passed on to the caller only when MC_TARGET_PAGE is returned and @target
  * is non-NULL; in every other case it is dropped here.
  */
 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		unsigned long addr, pte_t ptent, union mc_target *target)
 {
 	struct page *page = NULL;
 	struct page_cgroup *pc;
 	enum mc_target_type ret = MC_TARGET_NONE;
 	swp_entry_t ent = { .val = 0 };
 	if (pte_present(ptent))
 		page = mc_handle_present_pte(vma, addr, ptent);
 	else if (is_swap_pte(ptent))
 		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
 	else if (pte_none(ptent) || pte_file(ptent))
 		page = mc_handle_file_pte(vma, addr, ptent, &ent);
 	/* neither a page nor a swap entry: nothing to move */
 	if (!page && !ent.val)
 		return ret;
 	if (page) {
 		pc = lookup_page_cgroup(page);
 		/*
 		 * Do only loose check w/o page_cgroup lock.
 		 * mem_cgroup_move_account() checks the pc is valid or not under
 		 * the lock.
 		 */
 		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
 			ret = MC_TARGET_PAGE;
 			if (target)
 				target->page = page;
 		}
 		/* keep the reference only when the page is handed back */
 		if (!ret || !target)
 			put_page(page);
 	}
 	/* There is a swap entry and a page doesn't exist or isn't charged */
 	if (ent.val && !ret &&
 			css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
 		ret = MC_TARGET_SWAP;
 		if (target)
 			target->ent = ent;
 	}
 	return ret;
 }
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /*
  * Transparent-hugepage counterpart of get_mctgt_type().
  *
  * We don't consider swapping or file mapped pages because THP does not
  * support them for now.
  * Caller should make sure that pmd_trans_huge(pmd) is true.
  *
  * Returns MC_TARGET_PAGE when the huge page is charged to mc.from and
  * moving anon charges is enabled, MC_TARGET_NONE otherwise.  A page
  * reference is taken only when @target is non-NULL and the page is stored
  * there.
  */
 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
 		unsigned long addr, pmd_t pmd, union mc_target *target)
 {
 	struct page *page = pmd_page(pmd);
 	struct page_cgroup *pc;

 	VM_BUG_ON(!page || !PageHead(page));

 	if (!move_anon())
 		return MC_TARGET_NONE;

 	pc = lookup_page_cgroup(page);
 	if (!PageCgroupUsed(pc) || pc->mem_cgroup != mc.from)
 		return MC_TARGET_NONE;

 	if (target) {
 		get_page(page);
 		target->page = page;
 	}
 	return MC_TARGET_PAGE;
 }
 #else
 /* !CONFIG_TRANSPARENT_HUGEPAGE: a pmd is never a move-charge target. */
 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
 		unsigned long addr, pmd_t pmd, union mc_target *target)
 {
 	return MC_TARGET_NONE;
 }
 #endif
 /*
  * mm_walk ->pmd_entry callback used by mem_cgroup_count_precharge().
  *
  * Adds to mc.precharge the number of pages in [@addr, @end) that are
  * move-charge targets: HPAGE_PMD_NR for a qualifying huge pmd, otherwise
  * one per qualifying pte.  Always returns 0 so the walk continues.
  */
 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 					unsigned long addr, unsigned long end,
 					struct mm_walk *walk)
 {
 	struct vm_area_struct *vma = walk->private;
 	pte_t *pte;
 	spinlock_t *ptl;
 	/* a return of 1 means page_table_lock is held and must be dropped */
 	if (pmd_trans_huge_lock(pmd, vma) == 1) {
 		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
 			mc.precharge += HPAGE_PMD_NR;
 		spin_unlock(&vma->vm_mm->page_table_lock);
 		return 0;
 	}
 	if (pmd_trans_unstable(pmd))
 		return 0;
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE)
 		if (get_mctgt_type(vma, addr, *pte, NULL))
 			mc.precharge++;	/* increment precharge temporarily */
 	/* pte now points one past the last entry examined */
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();
 	return 0;
 }
 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 {
 	unsigned long precharge;
 	struct vm_area_struct *vma;
 	down_read(&mm->mmap_sem);
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		struct mm_walk mem_cgroup_count_precharge_walk = {
 			.pmd_entry = mem_cgroup_count_precharge_pte_range,
 			.mm = mm,
 			.private = vma,
 		};
 		if (is_vm_hugetlb_page(vma))
 			continue;
 		walk_page_range(vma->vm_start, vma->vm_end,
 					&mem_cgroup_count_precharge_walk);
 	}
 	up_read(&mm->mmap_sem);
 	precharge = mc.precharge;
 	mc.precharge = 0;
 	return precharge;
 }
 /*
  * Count the move-charge targets of @mm, mark the current task as the one
  * moving charges (mc.moving_task), and precharge that many pages to mc.to.
  * Returns the result of mem_cgroup_do_precharge().
  */
 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
 {
 	unsigned long precharge = mem_cgroup_count_precharge(mm);
 	VM_BUG_ON(mc.moving_task);
 	mc.moving_task = current;
 	return mem_cgroup_do_precharge(precharge);
 }
 /*
  * cancels all extra charges on mc.from and mc.to, and wakes up all waiters.
  *
  * Settles three counters and resets each to 0: unused precharges on mc.to,
  * charges of moved pages still held on mc.from, and the memsw/res/refcount
  * fix-ups for moved swap entries.  mc.from and mc.to themselves are left
  * set; mem_cgroup_clear_mc() clears them.
  */
 static void __mem_cgroup_clear_mc(void)
 {
 	struct mem_cgroup *from = mc.from;
 	struct mem_cgroup *to = mc.to;
 	/* we must uncharge all the leftover precharges from mc.to */
 	if (mc.precharge) {
 		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
 		mc.precharge = 0;
 	}
 	/*
 	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
 	 * we must uncharge here.
 	 */
 	if (mc.moved_charge) {
 		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
 		mc.moved_charge = 0;
 	}
 	/* we must fixup refcnts and charges */
 	if (mc.moved_swap) {
 		/* uncharge swap account from the old cgroup */
 		if (!mem_cgroup_is_root(mc.from))
 			res_counter_uncharge(&mc.from->memsw,
 						PAGE_SIZE * mc.moved_swap);
 		/* drop one mc.from reference per moved swap entry */
 		__mem_cgroup_put(mc.from, mc.moved_swap);
 		if (!mem_cgroup_is_root(mc.to)) {
 			/*
 			 * we charged both to->res and to->memsw, so we should
 			 * uncharge to->res.
 			 */
 			res_counter_uncharge(&mc.to->res,
 						PAGE_SIZE * mc.moved_swap);
 		}
 		/* we've already done mem_cgroup_get(mc.to) */
 		mc.moved_swap = 0;
 	}
 	memcg_oom_recover(from);
 	memcg_oom_recover(to);
 	wake_up_all(&mc.waitq);
 }
 /*
  * Finish (or abort) a charge move: clear mc.moving_task, settle leftover
  * charges, reset mc.from/mc.to under mc.lock and end the move on the
  * source memcg.
  */
 static void mem_cgroup_clear_mc(void)
 {
 	/* saved here because mc.from is cleared before mem_cgroup_end_move() */
 	struct mem_cgroup *from = mc.from;
 	/*
 	 * we must clear moving_task before waking up waiters at the end of
 	 * task migration.
 	 */
 	mc.moving_task = NULL;
 	__mem_cgroup_clear_mc();
 	spin_lock(&mc.lock);
 	mc.from = NULL;
 	mc.to = NULL;
 	spin_unlock(&mc.lock);
 	mem_cgroup_end_move(from);
 }
 /*
  * cgroup ->can_attach callback (CONFIG_MMU build).
  *
  * When the destination memcg has move_charge_at_immigrate set and the first
  * task of @tset owns its mm, set up the move context (mc.from, mc.to) and
  * precharge the destination.  On precharge failure the context is cleared
  * again and the error is returned; in all other cases 0 is returned.
  */
 static int mem_cgroup_can_attach(struct cgroup *cgroup,
 				 struct cgroup_taskset *tset)
 {
 	struct task_struct *p = cgroup_taskset_first(tset);
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
 	struct mem_cgroup *from;
 	struct mm_struct *mm;
 	int ret = 0;

 	if (!memcg->move_charge_at_immigrate)
 		return 0;

 	from = mem_cgroup_from_task(p);
 	VM_BUG_ON(from == memcg);

 	mm = get_task_mm(p);
 	if (!mm)
 		return 0;

 	/* We move charges only when we move a owner of the mm */
 	if (mm->owner == p) {
 		VM_BUG_ON(mc.from);
 		VM_BUG_ON(mc.to);
 		VM_BUG_ON(mc.precharge);
 		VM_BUG_ON(mc.moved_charge);
 		VM_BUG_ON(mc.moved_swap);

 		mem_cgroup_start_move(from);

 		spin_lock(&mc.lock);
 		mc.from = from;
 		mc.to = memcg;
 		spin_unlock(&mc.lock);

 		/* We set mc.moving_task later */
 		ret = mem_cgroup_precharge_mc(mm);
 		if (ret)
 			mem_cgroup_clear_mc();
 	}

 	mmput(mm);
 	return ret;
 }
 /*
  * cgroup ->cancel_attach callback: undo whatever mem_cgroup_can_attach()
  * set up by clearing the move context.
  */
 static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
 				     struct cgroup_taskset *tset)
 {
 	mem_cgroup_clear_mc();
 }
 /*
  * mm_walk ->pmd_entry callback used by mem_cgroup_move_charge().
  *
  * Moves the charge of every target page / swap entry in [@addr, @end) from
  * mc.from to mc.to, consuming one precharge per moved page (HPAGE_PMD_NR
  * for a huge page).  When the precharges run out mid-range, one more page
  * is charged and the scan resumes from where it stopped.
  *
  * Returns 0 to continue the walk, or the error from
  * mem_cgroup_do_precharge() when an additional charge failed.
  */
 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 				unsigned long addr, unsigned long end,
 				struct mm_walk *walk)
 {
 	int ret = 0;
 	struct vm_area_struct *vma = walk->private;
 	pte_t *pte;
 	spinlock_t *ptl;
 	enum mc_target_type target_type;
 	union mc_target target;
 	struct page *page;
 	struct page_cgroup *pc;
 	/*
 	 * We don't take compound_lock() here but no race with splitting thp
 	 * happens because:
 	 *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
 	 *    under splitting, which means there's no concurrent thp split,
 	 *  - if another thread runs into split_huge_page() just after we
 	 *    entered this if-block, the thread must wait for page table lock
 	 *    to be unlocked in __split_huge_page_splitting(), where the main
 	 *    part of thp split is not executed yet.
 	 */
 	if (pmd_trans_huge_lock(pmd, vma) == 1) {
 		/* not enough precharge for a whole huge page: skip it */
 		if (mc.precharge < HPAGE_PMD_NR) {
 			spin_unlock(&vma->vm_mm->page_table_lock);
 			return 0;
 		}
 		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
 		if (target_type == MC_TARGET_PAGE) {
 			page = target.page;
 			if (!isolate_lru_page(page)) {
 				pc = lookup_page_cgroup(page);
 				if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
 							pc, mc.from, mc.to)) {
 					mc.precharge -= HPAGE_PMD_NR;
 					mc.moved_charge += HPAGE_PMD_NR;
 				}
 				putback_lru_page(page);
 			}
 			/* drop the reference taken by get_mctgt_type_thp() */
 			put_page(page);
 		}
 		spin_unlock(&vma->vm_mm->page_table_lock);
 		return 0;
 	}
 	if (pmd_trans_unstable(pmd))
 		return 0;
 retry:
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; addr += PAGE_SIZE) {
 		pte_t ptent = *(pte++);
 		swp_entry_t ent;
 		/* out of precharges: leave the loop with addr != end */
 		if (!mc.precharge)
 			break;
 		switch (get_mctgt_type(vma, addr, ptent, &target)) {
 		case MC_TARGET_PAGE:
 			page = target.page;
 			if (isolate_lru_page(page))
 				goto put;
 			pc = lookup_page_cgroup(page);
 			if (!mem_cgroup_move_account(page, 1, pc,
 						     mc.from, mc.to)) {
 				mc.precharge--;
 				/* we uncharge from mc.from later. */
 				mc.moved_charge++;
 			}
 			putback_lru_page(page);
 put:			/* get_mctgt_type() gets the page */
 			put_page(page);
 			break;
 		case MC_TARGET_SWAP:
 			ent = target.ent;
 			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
 				mc.precharge--;
 				/* we fixup refcnts and charges later. */
 				mc.moved_swap++;
 			}
 			break;
 		default:
 			break;
 		}
 	}
 	/* pte was post-incremented, so pte - 1 is the last entry read */
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();
 	if (addr != end) {
 		/*
 		 * We have consumed all precharges we got in can_attach().
 		 * We try charge one by one, but don't do any additional
 		 * charges to mc.to if we have failed in charge once in attach()
 		 * phase.
 		 */
 		ret = mem_cgroup_do_precharge(1);
 		if (!ret)
 			goto retry;
 	}
 	return ret;
 }
 static void mem_cgroup_move_charge(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
 	lru_add_drain_all();
 retry:
 	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
 		/*
 		 * Someone who are holding the mmap_sem might be waiting in
 		 * waitq. So we cancel all extra charges, wake up all waiters,
 		 * and retry. Because we cancel precharges, we might not be able
 		 * to move enough charges, but moving charge is a best-effort
 		 * feature anyway, so it wouldn't be a big problem.
 		 */
 		__mem_cgroup_clear_mc();
 		cond_resched();
 		goto retry;
 	}
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		int ret;
 		struct mm_walk mem_cgroup_move_charge_walk = {
 			.pmd_entry = mem_cgroup_move_charge_pte_range,
 			.mm = mm,
 			.private = vma,
 		};
 		if (is_vm_hugetlb_page(vma))
 			continue;
 		ret = walk_page_range(vma->vm_start, vma->vm_end,
 						&mem_cgroup_move_charge_walk);
 		if (ret)
 			/*
 			 * means we have consumed all precharges and failed in
 			 * doing additional charge. Just abandon here.
 			 */
 			break;
 	}
 	up_read(&mm->mmap_sem);
 }
 /*
  * cgroup ->attach callback (CONFIG_MMU build).
  *
  * If a move context was set up in can_attach (mc.to is set), move the
  * charges of the first task's mm and then clear the context.  The context
  * is cleared even when the task has no mm.
  */
 static void mem_cgroup_move_task(struct cgroup *cont,
 				 struct cgroup_taskset *tset)
 {
 	struct task_struct *p = cgroup_taskset_first(tset);
 	struct mm_struct *mm = get_task_mm(p);
 	if (mm) {
 		if (mc.to)
 			mem_cgroup_move_charge(mm);
 		mmput(mm);
 	}
 	if (mc.to)
 		mem_cgroup_clear_mc();
 }
 #else	/* !CONFIG_MMU */
 /* !CONFIG_MMU: charge moving is not supported; attaching always succeeds. */
 static int mem_cgroup_can_attach(struct cgroup *cgroup,
 				 struct cgroup_taskset *tset)
 {
 	return 0;
 }
 /* !CONFIG_MMU: nothing was set up in can_attach, so nothing to cancel. */
 static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
 				     struct cgroup_taskset *tset)
 {
 }
 /* !CONFIG_MMU: no charges are moved on attach. */
 static void mem_cgroup_move_task(struct cgroup *cont,
 				 struct cgroup_taskset *tset)
 {
 }
 #endif
 /*
  * The "memory" cgroup subsystem descriptor: ties the lifecycle and
  * task-attach callbacks defined in this file, and the control-file table
  * mem_cgroup_files, to the cgroup core.
  */
 struct cgroup_subsys mem_cgroup_subsys = {
 	.name = "memory",
 	.subsys_id = mem_cgroup_subsys_id,
 	.create = mem_cgroup_create,
 	.pre_destroy = mem_cgroup_pre_destroy,
 	.destroy = mem_cgroup_destroy,
 	.can_attach = mem_cgroup_can_attach,
 	.cancel_attach = mem_cgroup_cancel_attach,
 	.attach = mem_cgroup_move_task,
 	.base_cftypes = mem_cgroup_files,
 	.early_init = 0,
 	.use_id = 1,
 	.__DEPRECATED_clear_css_refs = true,
 };
 #ifdef CONFIG_MEMCG_SWAP
 /*
  * Handler for the "swapaccount=" boot parameter.
  *
  * "1" sets really_do_swap_account and "0" clears it; any other value,
  * including an empty one, leaves the current setting untouched.  Always
  * returns 1.
  */
 static int __init enable_swap_account(char *s)
 {
 	/* only the exact strings "1" and "0" change the setting */
 	if (!strcmp(s, "1"))
 		really_do_swap_account = 1;
 	else if (!strcmp(s, "0"))
 		really_do_swap_account = 0;
 	return 1;
 }
 __setup("swapaccount=", enable_swap_account);
 #endif